#!/usr/bin/perl
## Licensed to the Apache Software Foundation (ASF) under one
## or more contributor license agreements.  See the NOTICE file
## distributed with this work for additional information
## regarding copyright ownership.  The ASF licenses this file
## to you under the Apache License, Version 2.0 (the
## "License"); you may not use this file except in compliance
## with the License.  You may obtain a copy of the License at
##
##     http://www.apache.org/licenses/LICENSE-2.0
##
## Unless required by applicable law or agreed to in writing, software
## distributed under the License is distributed on an "AS IS" BASIS,
## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
## See the License for the specific language governing permissions and
## limitations under the License.

# Grammar into HTML
# Read in grammar.txt and the tokens.txt file
#   jj2html grammar.txt tokens.txt

if ( $#ARGV != 1 )
{
    print STDERR "Usage: grammar.txt tokens.txt\n" ;
    exit 1 ;
}

$/ = undef ;
# Just table or full page.
$TABLE = 1 ;

$grammarFile = $ARGV[0] ;
$tokensFile = $ARGV[1] ;

$grammar = &readFile($grammarFile) ;
$tokens = &readFile($tokensFile) ;

$grammar =~ s!DOCUMENT START!! ;
# $grammar =~ s!NON-TERMINALS!! ;
$grammar =~ s!DOCUMENT END!! ;
$grammar =~ s!TOKENS.*NON-TERMINALS!!s ;

$grammar =~ s!//.*!!g ;
$grammar =~ s!\r!!g ;

# remove leading whitespace
$grammar =~ s!^[\n\s]*!\n! ;

# Merge alts
$grammar =~ s!\n\s*\|!\ |!g ;

$tokens =~ s!//.*!!g ;
$tokens =~ s!\r!!g ;


## Grammar
#print "GRAMMAR\n" ;

@g = split(/\n\s*/, $grammar) ;

@rules = () ;
%ruleMap = () ;
%tokenMap = () ;
%inline = () ;

# Grammar rules
# Direct from "jjdoc -TEXT=true"

for $g (@g) {
    ($rulename, $rulebody) = split(/:=/,$g) ;

    $rulename =~ s!^\s*!! ;
    $rulename =~ s!\s*$!! ;

    $rulebody =~ s!^\s*!! ;
    $rulebody =~ s!\s*$!! ;

    
    # Remove outer brackets
#    $rulebody =~ s!^\((.*)\)$!$1! ;

    next if $rulename eq '' ;

    push @rules, $rulename ;
    warn "Duplicate rule (grammar): $rulename\n" if defined($ruleMap{$rulename}) ;
    $ruleMap{$rulename} = $rulebody ;

##     print "----------\n" ;
##     print $rulename,"\n" ;
##     print $rulebody,"\n" ;
}


# Tokens
# Produced by "jj2tokens"
# Hand edited to indicate the inlines

$tokens =~ s/\n+/\n/g ;
$tokens =~ s/^\n// ;

@t = split(/\n(?=\<|\[)/, $tokens) ;

for $t (@t) {
    ($tokenname,$tokenbody) = split(/::=/, $t) ;
    $tokenname =~ s!^\s*!! ;
    $tokenname =~ s!\s*$!! ;
    $tokenname =~ s/#// ;

    $tokenbody =~ s!^\s*!! ;
    $tokenbody =~ s!\s*$!! ;

    # Inline?
    if ( $tokenname =~ /^\[\<\w*\>\]/ ) {
	warn "Duplicate inline (token): $tokenname\n" if defined($inline{$tokenname}) ;
	$tokenname =~ s/^\[//g ;
	$tokenname =~ s/\]$//g ;
	$tokenbody =~ s/"/'/g ; # '" -- But not literal " -- how?
        $tokenbody =~ s/\<\>\'\{\}/\<\>\"\{\}/ ; # '" IRI fixup
	$inline{$tokenname} = $tokenbody ;

	#print "INLINE: ",$tokenname," => ",$tokenbody,"\n" ;
    } else {
	push @rules, $tokenname ;
	warn "Duplicate rule (token): $tokenname\n" if defined($tokenMap{$tokenname}) ;
	$ruleMap{$tokenname} = $tokenbody ; 
    }
    
}

# Table

if ( ! $TABLE ) {
    print "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n" ;
    print "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\"\n" ;
    print "    \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">\n" ;
    print "\n" ;

    print "<html>\n";
    print "<head>\n";
    print "<title>SPARQL Grammar</title>\n" ;
    print "<style type=\"text/css\">\n" ;

 # .token inline
 # .ruleHead
 # .ruleBody

    print <<'EOF' ;
div.grammarTable table * { border-width: 0 ; }
div.grammarTable table * tr { border: 1px solid black ; }

.grammar     { text-align: left ; vertical-align: top ; }
.token       { color: #3f3f5f; }
.gRuleHead   { font-style: italic ; font-family: monospace ; }
.gRuleBody   { font-family: monospace ; }
.gRuleLabel  { font-family: monospace ; }
EOF

     print "</style>\n" ;
     print "</head>\n";
     print "<body>\n";

    print "\n" ;
} ## End TABLE

$indent = "        ";
## $indent = "";

print $indent,"<div class=\"grammarTable\">\n" ;
print $indent,"  <table><tbody>\n" ;

$ruleNum = 0 ;

for $r (@rules) {
    $DEBUG = 0 ;
    $ruleNum++ ;
    $rulename = $r ;
    
    $rulebody = $ruleMap{$rulename} ;

##    $DEBUG = 1 if ( $rulename =~ /Prolog/ ) ;


    if ( $DEBUG ) {
	print STDERR "\n" ;
	print STDERR "Rule: $rulename\n" ; 
	print STDERR "Body: $rulebody\n" ; 
    }

    $ruleBodyStr = $rulebody ;
    # Escape HTML chars before adding markup.
    $ruleBodyStr = esc($ruleBodyStr) ;
    
    # Inlines
    for $k (keys %inline) {
	$s = '<span class="token">' . esc($inline{$k}) . '</span>' ;
	$k = esc($k) ;
	# Assumes escaped <> round tokens.
	$k = quotemeta $k ;
	$ruleBodyStr =~ s/$k/$s/g ;
    }

    if ( $DEBUG ) {
	print STDERR "After inlining\n" ;
	print STDERR $ruleBodyStr,"\n" ; ; 
    }

    # Add hrefs - issue if one is a substring of another \W helps.

    for $k (keys %ruleMap) {
	$s = '<a href="#r' . sane($k) . '">' . esc($k) . '</a>' ;

	$k = esc($k) ;
	$k = quotemeta $k ;

## 	if ( $DEBUG )
## 	{
## 	  print STDERR "K:$k\n" ;  
## 	}


	$ruleBodyStr =~ s/(?=\W)(\s*)$k(\s*)(?=\W)/$1$s$2/g ;
	$ruleBodyStr =~ s/^$k(\s*)(?=\W)/$s$1/g ;
	$ruleBodyStr =~ s/(?=\W)(\s*)$k$/$1$s/g ;
	$ruleBodyStr =~ s/^$k$/$s/g ;
    }
    
    if ( $DEBUG ) {
	print STDERR "After hrefs\n" ;
	print STDERR $ruleBodyStr,"\n" ; ; 
    }

    #exit if $ruleNum > 2 ;

    $ruleId = sane("r".$rulename) ;

    if($rulename eq 'IRIREF') {
	print $indent,"  </tbody></table>\n" ;
	print $indent,"</div>\n" ;
	print $indent,"<p>Productions for terminals:</p>\n" ;
	print $indent,"<div class=\"grammarTable\">\n" ;
	print $indent,"  <table><tbody>\n" ;
    }

    print "\n" ;
    $indentRule = "              ";
    ##$indentRule = "";
    print $indentRule,"<tr style=\"vertical-align: baseline\">\n";
    
    $rlabel = '[' . $ruleNum .  ']&nbsp;&nbsp;' ;

    print $indentRule,"  <td><code>",$rlabel,"</code></td>\n" ;

    $rn = $rulename;
    $rn =~ s!^<!!; 
    $rn =~ s!>$!!; 
    
    print $indentRule,
	'  <td><code><span class="doc-ref" id="',$ruleId,'">',
	esc($rn),
	'</span></code></td>',"\n";
    
    print $indentRule,"  <td>&nbsp;&nbsp;::=&nbsp;&nbsp;</td>\n" ;
    
    $ruleBodyStr = fixupRule($rulename, $ruleBodyStr) ;
    print $indentRule,"  <td>",code('gRuleBody',$ruleBodyStr),"</td>\n" ;

    print $indentRule,"</tr>\n" ;

#    $rule{$rulename, $rulebody) ;
#    print $rulename , "\n" ;
}

print $indent,"  </tbody></table>\n" ;
print $indent,"</div>\n" ;

if ( !$TABLE ) {
    print "\n" ;
    print "</body>\n" ;
    print "</html>\n" ;
}

sub readFile {
    my $f = $_[0] ;
    open(F, "$f") || die "$f: $!"; 
    my $s = <F> ;
    return $s ;
}

sub esc {
    my $s = $_[0] ;
    $s =~ s/&/&amp;/g ; 
    $s =~ s/</&lt;/g ; 
    $s =~ s/>/&gt;/g ; 
    return $s ;
}

sub sane {
   my $a = $_[0] ;
   $a =~ s/\W//g ;
   return $a ;
}

sub code {
    my $c = $_[0] ;
    my $t = $_[1] ;
    return '<code class="' . $c . '">' . $t . '</code>' ;
}

sub fixupHead {
    my $head = $_[0] ;
    # Remove <> around tokens.
    $head =~ s/&lt;(\w+)&gt;/$1/g ;
    return $head ;
}

sub fixupRule {
    my $head = $_[0] ;
    my $body = $_[1] ;

    # Remove unnecessary ()
    $body =~ s/\(\s*([^()| ]*) \)/$1/g ;

    # Remove outer matching () where there are no inner ()
    $body =~ s/^\(\s+([^\(]*)\s+\)$/$1/ ;

    # ( A )* => A* and for + and ? where A is a linked or spanned object
    $body =~ s!\(\s+(\<a[^>]*\>[^<>]*\</a\>)\s+\)!$1!g ;
    $body =~ s!\(\s+(\<span[^>]*\>[^<>]*\</span\>)\s+\)!$1!g ;

    # There aren't any of these
##    $body =~ s!\(\s+(\S*)\s+\)!$1!g ;

    # Remove <> around tokens.
    $body =~ s/&lt;(\w+)&gt;/$1/g ;

    # Specials
    # Split long bodies
    if ( $head eq "CallExpression" ||
	 $head eq "UnaryExpression" ||
	 $head eq "<NCCHAR1p>" ||
	 $head eq "PatternElement" ||
	 $head eq "BuiltInCall" )
    {
	$body =~ s%\|%\<br/\>\|%g ;
	$body =~ s/^\s+// ;
	$body = "&nbsp;&nbsp;".$body ;
    }

     if ( $head eq "Aggregate" )
     {
	 # Strip outer ()
	 $body =~ s/^\(\s*(.*)\s*\)$/$1/ ;	
	 ## Be careful of nested | in COUNT
 	 $body =~ s%(\| \<span class="token")%\<br/\>$1%g ;
 	 $body =~ s/^\s+// ;
 	 $body = "&nbsp;&nbsp;".$body ;
     }


    if (  $head eq "BuiltInCall" )
    {
	# Undo <br/> for BNODE, RAND
	# <br/>| <a href="#rNIL">NIL</a> )
	$body =~ s%\<br/\>\| *\<a href="#rNIL"\>NIL\</a\>%\| \<a href="#rNIL"\>NIL\</a\>%g ;
    }

##     if ( $head eq "RelationalExpression" ||
## 	 $head eq "AdditiveExpression" ||
## 	 $head eq "MultiplicativeExpression" ||
## 	 $head eq "ConditionalOrExpression")
##     {
## 	$body =~ s%\*\(%<br/>\(% ;
##     }

    # These failed the outer () test because they have nested () in them
    if (  $head eq "QueryPattern" ||
          $head eq "OrderCondition" )
    {
	# Remove outer ()
	$body =~ s/^\((.*)\)$/$1/ ;	
    }

    if (  $head eq "Query" )
    {
	$body =~ s! \(!<br/>\(! ;
	$body =~ s!\) !\)<br/>! ;
    }

    if (  $head =~ m/(Select|Construct|Describe|Ask)Query/ )
    {
	# Put a line break before the DatasetClause
	# <a href="#rDatasetClause">DatasetClause</a>
	$c = '<a href="#rDatasetClause">DatasetClause</a>' ;
	$c = quotemeta $c ;
	# Expects the dataset clause to be unbracketted
	$body =~ s!(\(\s*$c)!<br/>$1! ;
    }

    if ( $head eq "OrderCondition" )
    {
	$body =~ s!\)\s*\|\s*\(!\)<br/>\| \(! ;
	$body = "  ".$body ;
    }

    #Rules where an outer () is unnecessary.
    if ( $head eq "GroupCondition" ||
	 $head eq "LimitOffsetClauses" ||
	 $head eq "GraphOrDefault" ||
	 $head eq "ArgList" ||
	 $head eq "ExpressionList" ||
	 $head eq "PathPrimary" ||
	 $head eq "PathMod" ||
	 $head eq "PathPrimary" ||
	 $head eq "PathNegatedPropertySet" || 
	 $head eq "PathOneInPropertySet")
    {
	$body =~ s/^\(\s*(.*)\s*\)$/$1/ ;	
    }
    return $body ;
}
