/* ====================================================================
 * The Apache Software License, Version 1.1
 *
 * Copyright (c) 2000-2001 The Apache Software Foundation.  All rights
 * reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * 3. The end-user documentation included with the redistribution,
 *    if any, must include the following acknowledgment:
 *       "This product includes software developed by the
 *        Apache Software Foundation (http://www.apache.org/)."
 *    Alternately, this acknowledgment may appear in the software itself,
 *    if and wherever such third-party acknowledgments normally appear.
 *
 * 4. The names "Apache" and "Apache Software Foundation" and
 *     "Apache Jetspeed" must not be used to endorse or promote products
 *    derived from this software without prior written permission. For
 *    written permission, please contact apache@apache.org.
 *
 * 5. Products derived from this software may not be called "Apache" or
 *    "Apache Jetspeed", nor may "Apache" appear in their name, without
 *    prior written permission of the Apache Software Foundation.
 *
 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 * ====================================================================
 *
 * This software consists of voluntary contributions made by many
 * individuals on behalf of the Apache Software Foundation.  For more
 * information on the Apache Software Foundation, please see
 * <http://www.apache.org/>.
 */

/*
 *
 *
 *  COMPATIBILITY
 *  
 *      [28.01.2001, RammerI] Tested on W2K, with J2SE, JDK 1.3
 *      [29.01.2001, RammerI] Tested on W2K, with JDK 1.2.2
 *
 *
 *
 *  FEATURES
 *      = Rewriting of <A HREF>, <IMG SRC>, <INPUT SRC>, <OPTION VALUE>,
 *          <FORM ACTION>, <TD BACKGROUND> and <APPLET CODEBASE> URLs
 *      = Optional removal of <SCRIPT>, <STYLE>, <HEAD>, <META>, <OBJECT>,
 *          <APPLET>, <NOSCRIPT> (<EMBED> is not handled by the code yet)
 *      
 *
 *  KNOWN PROBLEMS
 *
 *
 *  == Seems to have problems with international characters when the web pages
 *     are not downloaded from the original URL but taken from the cache.
 *     To reproduce, do the following:
 *      1. create a new portlet from the url http://www.sycom.at/default.htm
 *      2. stop tomcat & restart tomcat
 *      3. login and customize your page to include this portlet
 *      4. everything should appear fine, the webpage will show some German
 *         umlauts
 *      5. shutdown tomcat and restart it
 *      6. jetspeed is now taking the HTML not from www.sycom.at, but from the
 *         cache. Instead of the umlauts, you will see weird characters.
 *
 *
 *  == Does not yet work with XHTML pages, only with plain old HTML; i.e.
 *     self-closed single tags like <BR /> screw up the output.
 *      
 *
 *
 */


package org.apache.jetspeed.portlets.util;

import javax.swing.text.html.parser.*;
import javax.swing.text.html.*;
import javax.swing.text.html.HTMLEditorKit;
import java.io.*;
import java.util.*;
import javax.swing.text.*;
import java.net.*;

/**
 * Rewrites an HTML page so that it can be shown outside of its original
 * location: relative URLs in selected attributes are made absolute against
 * a base URL, and selected elements/attributes are stripped according to
 * the flags passed to the constructor.
 *
 * Every call to {@link #convertURLs(String, String)} works on fresh parser
 * state, so one instance can be reused for any number of documents.
 *
 * @author  Ingo Rammer (rammer@sycom.at)
 * @version 0.1
 */

public class HTMLRewriter {

    // Name of the marker attribute the Swing parser puts on tags it
    // synthesised itself (documented as HTMLEditorKit.ParserCallback.IMPLIED
    // in newer JDKs; compared by name so older JDKs still compile).
    private static final String IMPLIED_ATTRIBUTE = "_implied_";

    private final boolean removeScript;
    private final boolean removeStyle;
    private final boolean removeNoScript;
    private final boolean removeMeta;
    private final boolean removeApplet;
    private final boolean removeObject;
    private final boolean removeHead;

    // remove the onClick=, onBlur=, etc. - Attributes
    private final boolean removeOnSomething;

/** Sets the parameters for the HTMLRewriter
 * @param removeScript Shall SCRIPT-Tags and their content be removed
 * @param removeStyle Shall STYLE-Tags and their content be removed
 * @param removeNoScript Shall NOSCRIPT-Tags and their content be removed
 * @param removeMeta Shall META-Tags be removed
 * @param removeApplet Shall APPLET-Tags and their content be removed
 * @param removeObject Shall OBJECT-Tags and their content be removed
 * @param removeHead Shall HEAD-Tags and their content be removed
 * @param removeOnSomething Shall onClick, onBlur, etc. -Attributes be removed
 */
    public HTMLRewriter (boolean removeScript,
                         boolean removeStyle,
                         boolean removeNoScript,
                         boolean removeMeta,
                         boolean removeApplet,
                         boolean removeObject,
                         boolean removeHead,
                         boolean removeOnSomething)
    {
        this.removeScript = removeScript;
        this.removeStyle = removeStyle;
        this.removeNoScript = removeNoScript;
        this.removeMeta = removeMeta;
        this.removeApplet = removeApplet;
        this.removeObject = removeObject;
        this.removeHead = removeHead;
        this.removeOnSomething = removeOnSomething;
    }

/** Does the conversion of the HTML-String
 * @param HTML HTML-String to be converted
 * @param BaseUrl URL from which this HTML was taken. Will be the base URL
 * for all URL-rewritings.
 * @throws MalformedURLException If the BaseUrl is not a valid URL, or if
 * parsing the document failed with an exception (the exception is then
 * reported through the message of the MalformedURLException).
 * @return HTML-String with rewritten URLs and removed (according
 * to constructor-settings) tags. The result starts with a single blank.
 */
    public synchronized String convertURLs(String HTML, String BaseUrl) throws MalformedURLException
    {
        HTMLEditorKit.Parser parser = new HTMLRewriter.ParserGetter().getParser();
        StringReader reader = new StringReader(HTML);

        // A fresh callback per document: output buffer, ignore level and
        // form state must not leak from one conversion into the next.
        // An invalid base URL is reported with its original exception.
        HTMLRewriter.Callback callback = new HTMLRewriter.Callback(new URL(BaseUrl));

        try {
            // third argument: ignore charset declarations inside the document
            parser.parse(reader, callback, true);
        } catch (Exception e)
        {
            e.printStackTrace();
            throw new MalformedURLException(e.toString());
        }
        return callback.getResult();
    }


    /** That Class is needed, because getParser is protected and therefore
     *  only accessible by a subclass
     */
    class ParserGetter extends HTMLEditorKit {
    /** This is needed, because getParser is protected
     * @return Html Parser
     */
      public HTMLEditorKit.Parser getParser(){
        return super.getParser();
      }
    }


    /** Receives the parser events of one document and re-serialises them
     *  into {@link #getResult()}, applying URL rewriting and tag removal
     *  on the way. An instance is good for exactly one document.
     */
    class Callback extends HTMLEditorKit.ParserCallback {

        // the base-url of which the given html comes from.
        private final URL baseUrl;

        // either handling of <FORM> is buggy, or I made some weird mistake ...
        // ... JDK 1.3 sends double "</form>"-tags on closing <form>
        private boolean inForm = false;

        // when in multi-part ignored tags (like <script> foobar </script>,
        // <style> foobar </style>, a counter for the nesting-level will be
        // kept here
        private int ignoreLevel = 0;

        // nesting level of <style>/<script>; their content is style sheet or
        // script source rather than markup text and must not be
        // entity-escaped when it is written back
        private int rawTextLevel = 0;

        private final StringWriter result = new StringWriter();

        private Callback (URL baseUrl) {
            this.baseUrl = baseUrl;
        }


        /** Appends the text verbatim unless we are inside a removed element.
         * @return this, to allow chaining
         */
        private Callback addToResult(Object txt)
        {
            if (ignoreLevel <= 0) {
                result.write(String.valueOf(txt));
            }
            return this;
        }

        /** Appends text that the parser delivered entity-decoded, escaping
         * the characters that would otherwise be read as markup again.
         * @param txt decoded text
         * @param inAttribute true if txt is written inside a double-quoted
         * attribute value, in which case the double quote is escaped too
         * @return this, to allow chaining
         */
        private Callback addEscapedToResult(String txt, boolean inAttribute)
        {
            if (ignoreLevel > 0) {
                return this;
            }
            for (int i = 0; i < txt.length(); i++) {
                char c = txt.charAt(i);
                if (c == '&') {
                    result.write("&amp;");
                } else if (c == '<') {
                    result.write("&lt;");
                } else if (c == '>') {
                    result.write("&gt;");
                } else if (inAttribute && (c == '"')) {
                    result.write("&quot;");
                } else {
                    result.write(c);
                }
            }
            return this;
        }

        /** Accessor to the Callback's content-String
         * @return Cleaned and rewritten HTML-Content, prefixed with one blank
         */
        public String getResult() {
            // WARNING (from the original author): doesn't work, if you remove
            // " " + ... but don't know why. Kept so callers see the same output.
            return " " + result.toString();
        }


        public void flush() throws javax.swing.text.BadLocationException {
            // nothing to do here ...
        }

        public void handleComment(char[] values,int param) {
            // we ignore them
        }

        public void handleEndOfLineString(java.lang.String str) {
            addToResult("\n");
        }

        public void handleError(java.lang.String str,int param) {
            // ignored
        }

        public void handleSimpleTag(HTML.Tag tag,MutableAttributeSet attrs,int param) {
            if (removeMeta && (tag == HTML.Tag.META)) {
                return;
            }

            if (attrs.getAttribute(HTML.Attribute.ENDTAG) != null) {
                // the parser reports the end tag of an element it does not
                // know (e.g. </noscript>) as a simple tag carrying the
                // "endtag" attribute; treat it as the end tag it is
                handleEndTag(tag, param);
                return;
            }

            appendTagToResult(tag,attrs);
        }

        public void handleStartTag(HTML.Tag tag,  MutableAttributeSet attrs, int position) {
            appendTagToResult(tag,attrs);
            if (isRawTextContainer(tag)) {
                rawTextLevel ++;
            }
        }

        public void handleEndTag(HTML.Tag tag, int position) {
            if (tag == HTML.Tag.FORM) {
                // form handling seems to be buggy: only the first </form>
                // after a <form> is written, the duplicate is dropped
                if (inForm) {
                    addToResult("</").addToResult(tag).addToResult(">");
                    inForm = false;
                }
            } else {
                addToResult("</").addToResult(tag).addToResult(">");
            }

            // The end tag itself was still suppressed above; only now leave
            // the ignored section. Never go below zero, or a stray end tag
            // would switch off the removal of the next element.
            if (startsIgnoredSection(tag) && (ignoreLevel > 0)) {
                ignoreLevel --;
            }
            if (isRawTextContainer(tag) && (rawTextLevel > 0)) {
                rawTextLevel --;
            }
        }

        public void handleText(char[] values,int param) {
            String text = new String(values);
            if (rawTextLevel > 0) {
                addToResult(text);
            } else {
                addEscapedToResult(text, false);
            }
        }

        /** Writes a start (or simple) tag with its attributes, after URL
         * rewriting and attribute filtering.
         */
        private void appendTagToResult(HTML.Tag tag, MutableAttributeSet attrs) {

            String tagName = tag.toString();
            if (tagName.equalsIgnoreCase("__ENDOFLINETAG__")
                || tagName.equalsIgnoreCase("__IMPLIED__")) {
                // jdk 1.2.2 places a tag <__ENDOFLINETAG__> and jdk 1.3 a tag
                // <__IMPLIED__> in the result ... we don't want these
                return;
            }

            // must run before anything is written: it may enter an ignored
            // section, which then suppresses this very tag as well
            convertURLS(tag,attrs);

            addToResult("<").addToResult(tagName);
            Enumeration names = attrs.getAttributeNames();
            while (names.hasMoreElements()) {
                Object attr = names.nextElement();
                String name = attr.toString();

                if (IMPLIED_ATTRIBUTE.equals(name)) {
                    // parser bookkeeping, not a real attribute
                    continue;
                }
                if (removeOnSomething && name.regionMatches(true, 0, "on", 0, 2)) {
                    // filter the onClick, onThis, onThat-Attributes
                    continue;
                }

                String value = String.valueOf(attrs.getAttribute(attr));
                addToResult(" ").addToResult(name).addToResult("=\"");
                addEscapedToResult(value, true).addToResult("\"");
            }
            addToResult(">");
        }


        /** Here the magic happens.
         *
         * Rewrites the URL-carrying attribute of the tag (if it has one) and
         * enters an ignored section if the tag is one of the elements to be
         * removed.
         *
         * If someone wants new types of URLs to be rewritten, add them here
         * @param tag TAG from the Callback-Interface
         * @param attrs Attribute-Set from the Callback-Interface
         */
        private void convertURLS( HTML.Tag tag, MutableAttributeSet attrs ) {

            // first we do an URL-rewrite on different tags;
            // addConvertedAttribute leaves absent attributes alone

            if (tag == HTML.Tag.A) {
                // ---- CHECKING <A HREF
                addConvertedAttribute( HTML.Attribute.HREF, attrs );
            } else if ((tag == HTML.Tag.IMG) || (tag == HTML.Tag.INPUT)) {
                // ---- CHECKING <IMG SRC & <INPUT SRC
                addConvertedAttribute( HTML.Attribute.SRC, attrs );
            } else if (tag == HTML.Tag.OPTION) {
                // ---- CHECKING <OPTION
                // NOTE(review): every option value is treated as an URL here,
                // presumably for navigation drop-downs - confirm this is wanted
                addConvertedAttribute( HTML.Attribute.VALUE, attrs );
            } else if (tag == HTML.Tag.APPLET) {
                // ---- CHECKING <APPLET CODEBASE=
                addConvertedAttribute( HTML.Attribute.CODEBASE, attrs );
            } else if (tag == HTML.Tag.FORM) {
                // ---- CHECKING <FORM ACTION=
                inForm = true; // buggy <form> handling in jdk 1.3
                if (attrs.getAttribute(HTML.Attribute.ACTION) == null) {
                    //self referencing <FORM>
                    attrs.addAttribute(HTML.Attribute.ACTION,
                                       baseUrl.toString());
                } else {
                    addConvertedAttribute( HTML.Attribute.ACTION, attrs );
                }
            } else if (tag == HTML.Tag.TD) {
                // ---- CHECKING <TD BACKGROUND=
                addConvertedAttribute( HTML.Attribute.BACKGROUND, attrs );
            }

            // then we check for ignored tags ...
            if (startsIgnoredSection(tag)) {
                ignoreLevel ++;
            }
        }

        /** Tells whether the tag is one of the elements that shall be removed
         * together with their content. Used for start and end tags alike, so
         * that both always agree.
         */
        private boolean startsIgnoredSection(HTML.Tag tag) {
            return (removeScript && (tag == HTML.Tag.SCRIPT))
                || (removeStyle && (tag == HTML.Tag.STYLE))
                || (removeHead && (tag == HTML.Tag.HEAD))
                || (removeApplet && (tag == HTML.Tag.APPLET))
                || (removeObject && (tag == HTML.Tag.OBJECT))
                || (removeNoScript && tag.toString().equalsIgnoreCase("NOSCRIPT"));
        }

        /** Tells whether the content of the tag has to be written back
         * without entity-escaping.
         */
        private boolean isRawTextContainer(HTML.Tag tag) {
            return (tag == HTML.Tag.STYLE) || (tag == HTML.Tag.SCRIPT);
        }

        /**
         *
         * Converts the given attribute to base URL, if not null
         *
         */
        private void addConvertedAttribute( HTML.Attribute attr,
                                            MutableAttributeSet attrs ) {
            Object current = attrs.getAttribute( attr );
            if( current != null ) {
                attrs.addAttribute( attr,
                                    generateNewUrl( current.toString() ) );
            }
        }


        /** Resolves the URL against the base URL.
         * @return the absolute URL, or the unchanged input if it cannot be
         * resolved (e.g. because of a protocol java.net.URL does not know)
         */
        private String generateNewUrl(String oldURL)
        {
            try {
                return new URL(baseUrl,oldURL).toString();
            } catch (Exception e)
            {
                // best effort: an URL we cannot resolve is passed through
                // unchanged instead of aborting the whole conversion
                return oldURL; // default behaviour ...
            }
        }

    }

}


