#!/usr/bin/perl
#-*-perl-*-
#

=head1 NAME

opus2moses

=head1 SYNOPSIS

 # convert sentence aligned bitexts to factored moses input
 # (requires XML::Parser)

 opus2moses [OPTIONS] < sentence-align-file.xml

=head1 OPTIONS:

 -s srcfactors ......... specify source language factors besides surface words
 -t trgfactors ......... the same for the target language (separated by ':')
                         factors should be attributes of <w> tags!!
                         (except 'word' which is the word itself)
 -d dir ................ home directory of the OPUS subcorpus
 -n file-pattern ....... skip bitext files that match pattern (e.g. ep-00-1*)
 -i .................... inverse selection (only files matching file pattern)
 -e src-data-file ...... output file for source language data (default = src)
 -f trg-data-file ...... output file for target language data (default = trg)

 -b xces-align-file .... print extracted pairs to an XCES ALIGN file
 -p sentence-pair-file . stores sentence ID pairs of the extracted pairs
 -c threshold .......... link score threshold (default attribute = certainty)
 -C link-score-attr .... name of the attribute that stores the link score
 -u src-langid ......... check source language ID (require langid markup)
 -U src-langid-conf .... confidence threshold for source langid
 -w trg-langid ......... check target language ID (require langid markup)
 -W trg-langid-conf .... confidence threshold for target langid
 -l .................... convert to lower case
 -1 .................... 1:1 links only
 -x max ................ max size of sentences (in nr of words)

 -r .................... process untokenized (raw) XML (no length filtering)

 -S .................... read sentences sequentially (requires monotonic links)
                         (some sentence pairs may be missing if this fails!)

=head1 OBSOLETE:

The following option is now obsolete because the option for reading all sentences
into memory is now the default. Switch this off by using -S.

 -M .................... read all sentences into memory for an aligned document
                         before extracting linked sentences (for non-monotonic links)


=head1 LICENSE

 ---------------------------------------------------------------------------
 Copyright (c) 2004-2019 Joerg Tiedemann

 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
 in the Software without restriction, including without limitation the rights
 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 copies of the Software, and to permit persons to whom the Software is
 furnished to do so, subject to the following conditions:

 The above copyright notice and this permission notice shall be included in all
 copies or substantial portions of the Software.

 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 SOFTWARE.
 ---------------------------------------------------------------------------

=head1 See also

This script is part of opus-tools L<https://github.com/Helsinki-NLP/opus-tools>
See also OPUS L<http://opus.nlpl.eu> for more information about the corpus.

=cut


use strict;
use XML::Parser;
use FileHandle;
use File::Basename;


use vars qw($opt_1 $opt_b $opt_c $opt_C $opt_d $opt_e $opt_f $opt_h 
            $opt_i $opt_l $opt_m $opt_M $opt_n $opt_p $opt_r 
            $opt_s $opt_S $opt_t $opt_u $opt_U $opt_v $opt_w $opt_W $opt_x);
use Getopt::Std;

## parse the command line: letters followed by ':' in the spec take a value,
## all others are boolean switches; the results end up in the $opt_* variables
getopts('b:c:C:s:t:d:n:ie:f:hp:l1x:u:U:vw:W:rmMS');

## -M is now on by default! -S can switch it off
## Why? sequential reading is not very reliable
## --> better read all sentences into memory before
##     printing aligned sentences!
$opt_M = 1 unless ($opt_S);

## -h: print the usage message and stop
## (fixed: -f describes the *target* file, "an XCES"; added -l, -p and -m)
if ($opt_h){
    print <<"EOH";

 opus2moses [OPTIONS] < sentence-align-file.xml

 convert sentence aligned bitexts to factored moses input
 (requires XML::Parser)

 OPTIONS:

 -s srcfactors ......... specify source language factors besides surface words
 -t trgfactors ......... the same for the target language (separated by ':')
                         factors should be attributes of <w> tags!!
                         (except 'word' which is the word itself)
 -d dir ................ home directory of the OPUS subcorpus
 -n file-pattern ....... skip bitext files that match pattern (e.g. ep-00-1*)
 -i .................... inverse selection (only files matching file pattern)
 -e src-data-file ...... output file for source language data (default = src)
 -f trg-data-file ...... output file for target language data (default = trg)
 -m .................... one pair of output files per aligned document
                         (-e/-f are used as file extensions)
 -c threshold .......... link score threshold (default attribute = certainty)
 -C link-score-attr .... name of the attribute that stores the link score
 -u src-langid ......... check source language ID (require langid markup)
 -U src-langid-conf .... confidence threshold for source langid
 -w trg-langid ......... check target language ID (require langid markup)
 -W trg-langid-conf .... confidence threshold for target langid
 -l .................... convert to lower case
 -1 .................... extract 1:1 sentence alignments only
 -x max ................ maximum size of selected sentences (default=100)
 -r .................... process untokenized (raw) XML (no length filtering)
 -S .................... read sentences sequentially (requires monotonic links)
 -b xces-align-file .... print extracted pairs to an XCES ALIGN file
 -p sentence-pair-file . stores sentence ID pairs of the extracted pairs

EOH
    exit;
}

## everything written to the terminal is UTF-8
binmode(STDOUT,":utf8");
binmode(STDERR,":utf8");


## settings taken from the command line (with their defaults)

my $CORPUSHOME    = $opt_d || '.';
my $SRCFACTORSTR  = $opt_s || "word";
my $TRGFACTORSTR  = $opt_t || "word";

# my $SRCFACTORSTR = $opt_s || "lem:tree";
# my $TRGFACTORSTR = $opt_t || "lem:tree";

my $SRCOUTFILE   = $opt_e || 'src';
my $TRGOUTFILE   = $opt_f || 'trg';

## factor names are separated by ':' on the command line
my @SrcFactors = split(/:/,$SRCFACTORSTR);
my @TrgFactors = split(/:/,$TRGFACTORSTR);

my $MAX = $opt_x || 100;
my $LINKSCOREATTR = $opt_C || 'certainty';

## make XML parser object for parsing the sentence alignment file
## (the alignment file is fed to it piece by piece via parse_more)

my $BitextParser = XML::Parser->new(Handlers => {Start => \&AlignTagStart,
                                                 End => \&AlignTagEnd});
my $BitextHandler = $BitextParser->parse_start;

## global variables for the source and target language XML parsers

my ($SrcParser,$TrgParser);
my ($SrcHandler,$TrgHandler);

my ($SRC,$TRG);         # filehandles for reading bitexts

## open output files
## (with -m nothing is opened here: OpenCorpora opens one pair of output
##  files per aligned document instead)

my $SRCOUT = FileHandle->new;
my $TRGOUT = FileHandle->new;

unless ($opt_m){
    # three-argument open + explicit check: without the output files the
    # whole run would silently produce nothing
    open($SRCOUT,'>',$SRCOUTFILE) or die "cannot open $SRCOUTFILE: $!\n";
    open($TRGOUT,'>',$TRGOUTFILE) or die "cannot open $TRGOUTFILE: $!\n";

    binmode($SRCOUT, ":utf8");
    binmode($TRGOUT, ":utf8");
}


## optional file with the sentence ID pairs of all extracted pairs (-p)
## NOTE: the former 'open P,">$opt_p" || warn ...' could never warn because
##       '||' binds to the file name string and not to the result of open
if ($opt_p){
    if (open(P,'>',$opt_p)){
        binmode(P,":utf8");
    }
    else{
        warn "cannot open $opt_p ...\n";
    }
}

## optional XCES alignment file with all extracted links (-b)
## $openLinkGrp remembers whether a <linkGrp> element is currently open
my $openLinkGrp=0;
if ($opt_b){
    if (open(B,'>',$opt_b)){
        # the header declares utf-8, so make sure that is what gets written
        binmode(B,":utf8");
        print B '<?xml version="1.0" encoding="utf-8"?>'."\n";
        print B '<!DOCTYPE cesAlign PUBLIC "-//CES//DTD XML cesAlign//EN" "">'."\n";
        print B '<cesAlign version="1.0">'."\n";
    }
    else{
        warn "cannot open $opt_b ...\n";
    }
}



## directory of the alignment file: used as fallback location for the
## corpus files if they cannot be found in the corpus home directory
my $ALIGNHOME = '.';
$ALIGNHOME = dirname($ARGV[0]) if (@ARGV);

## use '>' as input delimiter when reading (usually end of XML tag)
## (set globally on purpose: the corpus files opened later are read
##  with the same delimiter)

$/='>';

## read through sentence alignment file and parse XML
## - sub routines for reading source and target language corpora are
##   called from XML handlers connected to this XML parser object
## - with -S (sequential reading) aligned sentences have to be in the same
##   order in the corpus files as they appear in the sentence alignment file!
## - a lexical variable is used for the current piece of input because the
##   handlers read other files while it is being parsed and must not be
##   able to overwrite it through the global $_

while (defined(my $chunk = <>)){
    ## fix un-escaped XML entities
    ## (named, decimal and hexadecimal references are left alone)
    $chunk =~ s/&(?!(?:#\d+|#x[0-9a-fA-F]+|\w+);)/&amp;/g;
    eval { $BitextHandler->parse_more($chunk); };
    if ($@){
	print STDERR $chunk;
	die $@;
    }
}

## tell the parser that the input is complete; this reports documents that
## end prematurely (only a warning: everything extracted so far is kept)
eval { $BitextHandler->parse_done; };
warn $@ if ($@);

$SRCOUT->close();
$TRGOUT->close();

if ($opt_p){
    close P;
}
if ($opt_b){
    print B "  </linkGrp>\n" if ($openLinkGrp);
    ## the <cesAlign> root element is always opened in the header,
    ## so it has to be closed even if no linkGrp was written at all
    print B "</cesAlign>\n";
    close B;
}



## finished!
##--------------------------------------------------------------------------




## open source and target corpus files (could be gzipped)
## create new XML parser objects and start parsing
##
## - $srcfile and $trgfile are looked up relative to $CORPUSHOME; if they
##   are not found there, a '.gz' variant, the directory of the alignment
##   file ($ALIGNHOME) and the path without the 'xml/' sub directory are
##   tried as well
## - documents can be excluded/selected with -n (and -i)
## - sets the global read handles ($SRC, $TRG) and parser handlers
##   ($SrcHandler, $TrgHandler); with -m the output files are re-opened
##   for every document
##
## returns 1 if both corpus files (and with -m both output files) could be
## opened and 0 otherwise (document skipped, not found or not readable)

sub OpenCorpora{
    my ($srcfile,$trgfile)=@_;

    # the file may be zipped and the path is wrong in the alignfile
    if ((! -e "$CORPUSHOME/$srcfile") && (-e "$CORPUSHOME/$srcfile.gz")){
        $srcfile.='.gz';
    }
    if ((! -e "$CORPUSHOME/$trgfile") && (-e "$CORPUSHOME/$trgfile.gz")){
        $trgfile.='.gz';
    }

    # try ALIGNHOME
    # (note that this changes $CORPUSHOME for all following documents, too)
    if (! -e "$CORPUSHOME/$srcfile"){
        if (-e "$ALIGNHOME/$srcfile"){
            $CORPUSHOME = $ALIGNHOME;
        }
        elsif (-e "$ALIGNHOME/$srcfile.gz"){
            $CORPUSHOME = $ALIGNHOME;
            $srcfile.='.gz';
        }
    }
    if (! -e "$CORPUSHOME/$trgfile"){
        if (-e "$ALIGNHOME/$trgfile"){
            $CORPUSHOME = $ALIGNHOME;
        }
        elsif (-e "$ALIGNHOME/$trgfile.gz"){
            $CORPUSHOME = $ALIGNHOME;
            $trgfile.='.gz';
        }
    }

    # try to remove the xml subdir (and start all over again)
    if ((! -e "$CORPUSHOME/$srcfile") && ($srcfile=~/xml\//)){
        $srcfile=~s/xml\///;
        return OpenCorpora($srcfile,$trgfile);
    }
    if ((! -e "$CORPUSHOME/$trgfile") && ($trgfile=~/xml\//)){
        $trgfile=~s/xml\///;
        return OpenCorpora($srcfile,$trgfile);
    }

    ## check if file names match pattern of files to be skipped
    ## (-i inverts the selection: skip if none of the files matches)
    if (defined $opt_n){
        if ($opt_i){
            if ($srcfile!~/$opt_n/ && $trgfile!~/$opt_n/){
                print "skip $srcfile-$trgfile\n";
                return 0;
            }
        }
        elsif ($srcfile=~/$opt_n/ || $trgfile=~/$opt_n/){
            print "skip $srcfile-$trgfile\n";
            return 0;
        }
    }

    return 0 unless (-e "$CORPUSHOME/$srcfile");
    return 0 unless (-e "$CORPUSHOME/$trgfile");

    print STDERR "open bitext $srcfile <-> $trgfile\n";

    ## open filehandles to read from
    ## (failures are reported but the parsers are still set up below, just
    ##  as before; reading from an unopened handle simply yields nothing)
    my $success = 1;
    $SRC = FileHandle->new;
    $TRG = FileHandle->new;
    unless (_OpenCorpusFile($SRC,"$CORPUSHOME/$srcfile")){
        warn "cannot read $CORPUSHOME/$srcfile: $!\n";
        $success = 0;
    }
    unless (_OpenCorpusFile($TRG,"$CORPUSHOME/$trgfile")){
        warn "cannot read $CORPUSHOME/$trgfile: $!\n";
        $success = 0;
    }

    $SrcParser = XML::Parser->new(Handlers => {Start => \&XmlTagStart,
                                               End => \&XmlTagEnd,
                                               Default => \&XmlChar});
    $SrcHandler = $SrcParser->parse_start;
    $SrcHandler->{OUT} = $SRCOUT;
    $SrcHandler->{LANGID} = $opt_u;
    $SrcHandler->{LANGID_THR} = $opt_U;
    @{$SrcHandler->{FACTORS}} = @SrcFactors;
    $TrgParser = XML::Parser->new(Handlers => {Start => \&XmlTagStart,
                                               End => \&XmlTagEnd,
                                               Default => \&XmlChar});
    $TrgHandler = $TrgParser->parse_start;
    $TrgHandler->{OUT} = $TRGOUT;
    $TrgHandler->{LANGID} = $opt_w;
    $TrgHandler->{LANGID_THR} = $opt_W;
    @{$TrgHandler->{FACTORS}} = @TrgFactors;

    # multiple file output (option -m): create new output files for each
    # aligned document in the current directory
    # - the name is the source document path without its first component,
    #   with '/' replaced by '_' (used for BOTH output files)
    # - '.xml' / '.xml.gz' is replaced by opt_e/opt_f as extension
    #   ('src', 'trg' by default)
    if ($opt_m){
        my @path = split(/\/+/,$srcfile);
        shift(@path);
        my $srcout = join('_',@path);
        my $trgout = join('_',@path);

        $srcout =~s/\.xml(\.gz)?/.$SRCOUTFILE/;
        $trgout =~s/\.xml(\.gz)?/.$TRGOUTFILE/;

        # re-opening an open handle implicitly closes the previous file
        if (open($SRCOUT,'>',$srcout)){
            binmode($SRCOUT, ":utf8");
        }
        else{
            warn "cannot open $srcout: $!\n";
            $success = 0;
        }
        if (open($TRGOUT,'>',$trgout)){
            binmode($TRGOUT, ":utf8");
        }
        else{
            warn "cannot open $trgout: $!\n";
            $success = 0;
        }
    }

    return $success;
}

## open a (possibly gzipped) corpus file for reading on the given handle
## - '.gz' files are piped through 'gzip -cd'; the list form of open starts
##   gzip without a shell, so blanks or shell meta characters in document
##   names cannot break (or abuse) the command
## returns the result of open (false on failure, $! is set)

sub _OpenCorpusFile{
    my ($fh,$path)=@_;
    if ($path=~/\.gz$/){
        return open($fh,'-|','gzip','-cd','--',$path);
    }
    return open($fh,'<',$path);
}

## close the read handles of the current bitext (source first, then target)
## - handles that have never been created are skipped
## - the handles stay defined after closing

sub CloseCorpora{
    if (defined $SRC){
        $SRC->close();
    }
    return (defined $TRG && $TRG->close());
}



## read from file handle and parse XML with corpus parser object
## idstr should be a whitespace-delimited string of sentence IDs
## read until all sentences (all IDs) are found!
## --> all sentences have to exist and have to appear in the same order
##     they are specified in idstr (no crossing links whatsoever!)
##
## - $handle is the parser handler of the corpus file (anything that
##   provides parse_more and is used as a hash); the requested IDs are
##   stored in $handle->{IDS}
## - $fh is read in pieces delimited by the current value of $/
## - parse errors are reported on STDERR, reading continues afterwards
##
## returns 1 as soon as the sentence with the last requested ID is closed;
## returns nothing (false) if there are no IDs or if the end of the file
## is reached first (in that case the rest of the file has been consumed)

sub ParseSentences{
    my ($idstr,$handle,$fh)=@_;

    # split(' ') ignores leading whitespace, which would otherwise create
    # an empty first ID that no sentence can ever match
    my @ids=split(' ',$idstr);
    @{$handle->{IDS}}=@ids;

    return if not @ids;

    # lexical loop variable: 'while (<$fh>)' would overwrite the global $_
    # of the caller (the piece of the alignment file that is being parsed)
    while (defined(my $chunk=<$fh>)){
        eval { $handle->parse_more($chunk); };
        if ($@){
            warn $@;
            print STDERR $chunk;
        }
        my $closed=$handle->{CLOSEDSID};
        return 1 if (defined $closed && $closed eq $ids[-1]);
    }
    return;
}


## parse all sentences and store them in a container
## - this is useful if the sentences are scrambled and IDs don't appear in monotonic order
##   in the same way they appear in the sentence alignment file
##
## - $handle is the parser handler of the corpus file, $fh the file handle
##   to read from (in pieces delimited by the current value of $/)
## - whenever the handler reports a closed sentence (CLOSEDSID), its output
##   string and word count are stored in %$container and %$wordcounter
##   (keyed by sentence ID) and the handler is reset for the next sentence
## - sentences without a defined ID are dropped: no link can refer to them
## - parse errors are reported on STDERR, reading continues afterwards
##
## returns 1 when the end of the file is reached,
## nothing if there is no file handle to read from

sub ParseAllSentences{
    my ($handle,$fh,$container,$wordcounter)=@_;

    # no corpus file has been opened (yet)
    return unless (defined $fh);

    # lexical loop variable: 'while (<$fh>)' would overwrite the global $_
    # of the caller (the piece of the alignment file that is being parsed)
    while (defined(my $chunk=<$fh>)){
        eval { $handle->parse_more($chunk); };
        if ($@){
            warn $@;
            print STDERR $chunk;
        }
        next unless (exists $handle->{CLOSEDSID});

        my $sid = delete $handle->{CLOSEDSID};
        if (defined $sid){
            $$container{$sid} = $handle->{OUTSTR};
            $$wordcounter{$sid} = $handle->{NRWORDS};
        }
        $handle->{OUTSTR} = '';
        $handle->{NRWORDS} = 0;
    }
    return 1;
}



##-------------------------------------------------------------------------
## XML parser handlers for sentence alignment parser


## XML opening tags
## - linkGrp --> open a new bitext (source & target corpus file)
##               (unless -S is used: read both documents into memory)
## - link    --> a new sentence alignment: get the sentences from source &
##               target, apply all filters and print the sentence pair
##
## $p is the parser object of the alignment file; it is also used to keep
## the sentences of the current bitext (SrcSents, TrgSents) and their word
## counts (SrcWordCount, TrgWordCount); %a holds the attributes of the tag
##
## for linkGrp the result of OpenCorpora is returned

sub AlignTagStart{
    my ($p,$e,%a)=@_;

    if ($e eq 'linkGrp'){
        if ($opt_p){
            print P "## $a{fromDoc}\t$a{toDoc}\n";
        }
        if ($opt_b){
            print B "  </linkGrp>\n" if ($openLinkGrp);
            print B "  <linkGrp targType=\"s\" fromDoc=\"$a{fromDoc}\" toDoc=\"$a{toDoc}\">\n";
            $openLinkGrp=1;
        }
        my $success = OpenCorpora($a{fromDoc},$a{toDoc});

        ## optional: parse the whole source and target documents
        ##           and store all data
        if ($opt_M){
            ## data containers (always emptied, so that sentences of the
            ## previous bitext can never be used for this one)
            %{$$p{SrcSents}}=();
            %{$$p{TrgSents}}=();
            %{$$p{SrcWordCount}}=();
            %{$$p{TrgWordCount}}=();
            ## nothing to read if no corpus file has been opened so far
            if (defined $SRC && defined $TRG){
                ParseAllSentences($SrcHandler,$SRC,$$p{SrcSents},$$p{SrcWordCount});
                ParseAllSentences($TrgHandler,$TRG,$$p{TrgSents},$$p{TrgWordCount});
            }
        }
        return $success;
    }

    return unless ($e eq 'link');

    ## only if bitext filehandles are defined ...
    return unless (defined $SRC && defined $TRG);

    ## if there is a link score threshold
    ## --> filter out links that are below the threshold
    if (defined $opt_c){
        return if ( $a{$LINKSCOREATTR}<$opt_c );
    }

    ## xtargets = "source IDs ; target IDs": both sides must not be empty
    my ($src,$trg) = split(/\s*\;\s*/,$a{xtargets});
    return unless (defined $src && $src=~/\S/);
    return unless (defined $trg && $trg=~/\S/);

    ## -1: skip links with more than one sentence ID on either side
    if ($opt_1){
        return if ($src=~/\S\s\S/);
        return if ($trg=~/\S\s\S/);
    }

    $SrcHandler->{OUTSTR}='';   # reset output string
    $TrgHandler->{OUTSTR}='';

    ## (a) retrieve sentences from memory
    if ($opt_M){
        $SrcHandler->{NRWORDS} = 0;
        $TrgHandler->{NRWORDS} = 0;
        # split(' ') ignores leading whitespace (no empty first ID)
        foreach my $id (split(' ',$src)){
            $SrcHandler->{OUTSTR} .= $$p{SrcSents}{$id}.' ';
            $SrcHandler->{NRWORDS} += $$p{SrcWordCount}{$id};
        }
        foreach my $id (split(' ',$trg)){
            $TrgHandler->{OUTSTR} .= $$p{TrgSents}{$id}.' ';
            $TrgHandler->{NRWORDS} += $$p{TrgWordCount}{$id};
        }
        # remove the final space
        # (this used to be '-~s/\s*$//', a typo that applied the
        #  substitution to $_ instead of the output strings)
        $SrcHandler->{OUTSTR}=~s/\s*$//;
        $TrgHandler->{OUTSTR}=~s/\s*$//;
    }

    ## (b) read sentences from file
    else{
        ParseSentences($src,$SrcHandler,$SRC);
        ParseSentences($trg,$TrgHandler,$TRG);
    }

    ## length filters (not possible for raw text: no word counts)
    unless ($opt_r){
        # skip if no words found
        return if (not $SrcHandler->{NRWORDS});
        return if (not $TrgHandler->{NRWORDS});
        # skip if sentences are too long
        return if ($SrcHandler->{NRWORDS} > $MAX);
        return if ($TrgHandler->{NRWORDS} > $MAX);
        # skip if the length ratio is above 8
        # (a bit more restrictive than clean_corpus for MOSES, which uses 9)
        return if ($SrcHandler->{NRWORDS}/$TrgHandler->{NRWORDS}>8);
        return if ($TrgHandler->{NRWORDS}/$SrcHandler->{NRWORDS}>8);
    }

    ## clean up the strings BEFORE testing for empty ones: normalizing raw
    ## text removes reserved tokens and control characters and may leave
    ## nothing behind
    if ($opt_r){
        $SrcHandler->{OUTSTR} = normalize($SrcHandler->{OUTSTR});
        $TrgHandler->{OUTSTR} = normalize($TrgHandler->{OUTSTR});
    }
    else{
        $SrcHandler->{OUTSTR}=~s/^\s+//;  # remove leading spaces
        $SrcHandler->{OUTSTR}=~s/\s+$//;  # remove final spaces
        $TrgHandler->{OUTSTR}=~s/^\s+//;
        $TrgHandler->{OUTSTR}=~s/\s+$//;
    }

    ## skip if one of the strings is empty
    ## (shouldn't happen for tokenized text --> report it)
    if ($SrcHandler->{OUTSTR}=~/^\s*$/){
        unless ($opt_r){
            print STDERR "empty src string but $SrcHandler->{NRWORDS} words! --> strange ... (trg = $TrgHandler->{NRWORDS} words)\n";
        }
        return;
    }
    if ($TrgHandler->{OUTSTR}=~/^\s*$/){
        unless ($opt_r){
            print STDERR "empty trg string but $TrgHandler->{NRWORDS} words! --> strange ... (src = $SrcHandler->{NRWORDS} words)\n";
        }
        return;
    }

    ## the pair has passed all filters: print it
    print $SRCOUT $SrcHandler->{OUTSTR},"\n" if (defined $SRCOUT);
    print $TRGOUT $TrgHandler->{OUTSTR},"\n" if (defined $TRGOUT);

    if ($opt_p){
        print P "$src\t$trg\n";
    }
    if ($opt_b){
        print B "    <link xtargets=\"$src;$trg\" />\n";
    }
}


## closing tags of the sentence alignment file
## - linkGrp --> the bitext is finished: close source and target corpus files
## - all other closing tags are ignored

sub AlignTagEnd{
    my ($parser,$tag)=@_;
    my $closes_bitext = ($tag eq 'linkGrp');
    return $closes_bitext unless ($closes_bitext);
    return CloseCorpora();
}



##-------------------------------------------------------------------------
## XML parser handlers for corpus parser (separate for source and target)


## XML opening tags (corpus files)
## - s: a new sentence begins --> remember its ID as the open sentence,
##      forget the previously closed one and restart the word counter;
##      if a language ID is requested (LANGID, optionally with the
##      confidence threshold LANGID_THR) sentences that do not match are
##      not opened at all
## - w: a new word begins --> keep its attributes (the factors should be
##      attributes of this tag!) and count it; without -M only for the
##      sentence that is requested next (first element of IDS)

sub XmlTagStart{
    my ($parser,$tag,%attr)=@_;

    if ($tag eq 's'){
        my $wanted_lang = $parser->{LANGID};
        if ($wanted_lang){
            ## wrong language or not confident enough --> ignore sentence
            return unless ($attr{langid} eq $wanted_lang);
            my $min_conf = $parser->{LANGID_THR};
            return if ($min_conf && $attr{langidconf} < $min_conf);
        }
        delete $parser->{CLOSEDSID};
        $parser->{NRWORDS} = 0;
        $parser->{OPENSID} = $attr{id};
        return 1;
    }
    elsif ($tag eq 'w'){
        if ($opt_M || ($parser->{OPENSID} eq $parser->{IDS}->[0])){
            $parser->{WORD} = '';
            %{$parser->{WATTR}} = %attr;
            $parser->{OPENW} = 1;
            $parser->{NRWORDS}++;
        }
    }
}

## everything that is not handled as a start or end tag (default handler)
## - the string is appended to the 'word' attribute of the current word
##   if a sentence is open AND a w-tag is open (or option -r = raw,
##   untokenized text, is used) AND the sentence is wanted (always with -M,
##   otherwise only if it is the next requested ID)
## - with -r a space is added after each string

sub XmlChar{
    my ($parser,$text)=@_;

    return unless (exists $parser->{OPENSID});
    return unless ($parser->{OPENW} || $opt_r);
    return unless ($opt_M || ($parser->{OPENSID} eq $parser->{IDS}->[0]));

    $parser->{WATTR}->{word} .= ($opt_r ? $text.' ' : $text);
}

## XML closing tags (corpus files)
## - s: end of sentence --> remember its ID as CLOSEDSID and shift the
##      ID-set if it was the next requested sentence; with -r the collected
##      raw text is appended to OUTSTR if the sentence had no <w> elements
## - w: end of word --> append the factors of the word (joined by '|') and
##      a space to OUTSTR (adds extra space after last word); without -M
##      only for the sentence that is requested next

sub XmlTagEnd{
    my ($p,$e)=@_;

    if ($e eq 's'){
        $p->{CLOSEDSID} = $p->{OPENSID};
        if ($p->{CLOSEDSID} eq $p->{IDS}->[0]){
            shift(@{$p->{IDS}});
        }
        delete $p->{OPENSID};
        if ($opt_r){
            unless ($p->{NRWORDS}){
                $p->{OUTSTR} .= $p->{WATTR}->{word}.' ';
            }
            $p->{WATTR}->{word} = '';
        }
    }
    elsif ($e eq 'w'){
        if ($opt_M || ($p->{OPENSID} eq $p->{IDS}->[0])){
            $p->{OPENW} = 0;
            my @factors=();
            foreach my $f (@{$p->{FACTORS}}){
                my $value = _CleanFactor($p->{WATTR}->{$f});
                ## unknown lemmas and the number placeholder '@card@' are
                ## replaced by the word itself; the word is cleaned as well
                ## because it may not have been processed as a factor (yet)
                ## and could still contain '|' or blanks or be empty
                if ($f eq 'lem' && ($value=~/UNKNOWN/ || $value=~/\@card\@/)){
                    $value = _CleanFactor($p->{WATTR}->{word});
                }
                $value = lc($value) if ($opt_l);
                $p->{WATTR}->{$f} = $value;   # keep the cleaned value
                push(@factors,$value);
            }
            if (defined $p->{OUT}){
                ## save the string instead of printing it directly: the
                ## sentence pair may still be filtered out later
                $p->{OUTSTR}.=join('|',@factors).' ';
            }
        }
    }
}

## make a factor value safe for factored output
## - surrounding whitespace is removed
## - '|' (the factor separator) and whitespace inside the value are
##   replaced by '_' (they would break the token / factor structure)
## - undefined or empty values become 'UNKNOWN'

sub _CleanFactor{
    my ($value)=@_;
    $value = '' unless (defined $value);
    $value=~s/^\s+//s;
    $value=~s/\s+$//s;
    $value=~tr/| \t\r\n\f/_/;
    return ($value=~/\S/) ? $value : 'UNKNOWN';
}



## clean up a string of raw (untokenized) text
## - all sequences of whitespace become a single space
## - control characters, reserved words (<s>, </s>, <unk>, <and>, <>, [and])
##   are removed and vertical bars are replaced by '_'
## - leading and trailing spaces are removed
##
## whitespace is cleaned up once more AFTER removing things: otherwise the
## result may start or end with a space or contain double spaces
## (e.g. '<s> hello </s>' used to become ' hello ')
##
## returns the normalized string; as before, the argument itself is
## updated as well

sub normalize {
    my $text = $_[0];

    $text =~ s/\s+/ /gs;                         # normalize white space characters
    $text =~ s/[\x00-\x1f\x7f\n]//gs;            # control characters
    $text =~ s/\<(s|unk|\/s|\s*and\s*|)\>//gs;   # reserved words
    $text =~ s/\[\s*and\s*\]//gs;
    $text =~ s/\|/_/gs;                          # vertical bars

    $text =~ s/\s+/ /gs;                         # gaps left by removed material
    $text =~ s/^\s*//;
    $text =~ s/\s*$//;

    $_[0] = $text;
    return $text;
}
