Index: src/plugin/parse-rtf/src/test/org/apache/nutch/parse/rtf/TestRTFParser.java
===================================================================
--- src/plugin/parse-rtf/src/test/org/apache/nutch/parse/rtf/TestRTFParser.java	(revision 747281)
+++ src/plugin/parse-rtf/src/test/org/apache/nutch/parse/rtf/TestRTFParser.java	(working copy)
@@ -73,9 +73,9 @@
     Configuration conf = NutchConfiguration.create();
     urlString = "file:" + sampleDir + fileSeparator + rtfFile;
     protocol = new ProtocolFactory(conf).getProtocol(urlString);
-    content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum())
-                      .getContent();
-    parse = new ParseUtil(conf).parseByExtensionId("parse-rtf", content);
+    content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
+    parse = new ParseUtil(conf).parseByExtensionId("parse-rtf", content).get(content.getUrl());
+    
     String text = parse.getText();
     assertEquals("The quick brown fox jumps over the lazy dog", text.trim());
 
Index: src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/TokenMgrError.java
===================================================================
--- src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/TokenMgrError.java	(revision 0)
+++ src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/TokenMgrError.java	(revision 0)
@@ -0,0 +1,148 @@
+/* Generated By:JavaCC: Do not edit this line. TokenMgrError.java Version 3.0 */
+/*
+ *  Copyright 2005 Blandware (http://www.blandware.com)
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+package org.apache.nutch.parse.rtf;
+
+public class TokenMgrError extends Error
+{
+   /*
+    * Ordinals for various reasons why an Error of this type can be thrown.
+    */
+
+   /**
+    * Lexical error occurred.
+    */
+   static final int LEXICAL_ERROR = 0;
+
+   /**
+    * An attempt was made to create a second instance of a static token manager.
+    */
+   static final int STATIC_LEXER_ERROR = 1;
+
+   /**
+    * Tried to change to an invalid lexical state.
+    */
+   static final int INVALID_LEXICAL_STATE = 2;
+
+   /**
+    * Detected (and bailed out of) an infinite loop in the token manager.
+    */
+   static final int LOOP_DETECTED = 3;
+
+   /**
+    * Indicates the reason why the exception is thrown. It will have
+    * one of the above 4 values.
+    */
+   int errorCode;
+
+   /**
+    * Returns a printable copy of str: NUL is dropped; control characters, quotes and
+    * backslash get backslash escapes; other chars outside 0x20..0x7e become unicode escapes.
+    */
+   protected static final String addEscapes(String str) {
+      final StringBuffer escaped = new StringBuffer();
+      for (int index = 0; index < str.length(); index++) {
+         final char current = str.charAt(index);
+         final String replacement;
+         switch (current) {
+            case 0 :
+               continue; // NUL characters are dropped from the output
+            case '\b':
+               replacement = "\\b";
+               break;
+            case '\t':
+               replacement = "\\t";
+               break;
+            case '\n':
+               replacement = "\\n";
+               break;
+            case '\f':
+               replacement = "\\f";
+               break;
+            case '\r':
+               replacement = "\\r";
+               break;
+            case '\"':
+               replacement = "\\\"";
+               break;
+            case '\'':
+               replacement = "\\\'";
+               break;
+            case '\\':
+               replacement = "\\\\";
+               break;
+            default:
+               if (current < 0x20 || current > 0x7e) {
+                  final String hex = "0000" + Integer.toString(current, 16);
+                  replacement = "\\u" + hex.substring(hex.length() - 4, hex.length());
+               } else {
+                  replacement = String.valueOf(current);
+               }
+         }
+         escaped.append(replacement);
+      }
+      return escaped.toString();
+   }
+
+   /**
+    * Builds the detail message used when the token manager reports a
+    * lexical error.
+    * Parameters :
+    *    EOFSeen     : true if the error was caused by reaching end of input
+    *    lexState    : lexical state in which the error occurred (not used in the message)
+    *    errorLine   : line number at which the error occurred
+    *    errorColumn : column number at which the error occurred
+    *    errorAfter  : text that was read before the error occurred
+    *    curChar     : the offending character
+    * Note: the wording of lexical error messages can be changed by editing this method.
+    */
+   protected static String LexicalError(boolean EOFSeen, int lexState, int errorLine, int errorColumn, String errorAfter, char curChar) {
+      final String encountered = EOFSeen
+            ? "<EOF> "
+            : "\"" + addEscapes(String.valueOf(curChar)) + "\"" + " (" + (int)curChar + "), ";
+      return "Lexical error at line " + errorLine + ", column " + errorColumn
+            + ".  Encountered: " + encountered + "after : \"" + addEscapes(errorAfter) + "\"";
+   }
+
+   /**
+    * Returns the detail message of this error; this implementation simply
+    * delegates to the superclass.
+    * The body may be changed to customize error messages. For example, cases like
+    * LOOP_DETECTED and INVALID_LEXICAL_STATE are not of an end user's concern, so this
+    * method could return something like the following for them in a release build:
+    *
+    *     "Internal Error : Please file a bug report .... "
+    */
+   public String getMessage() {
+      return super.getMessage();
+   }
+
+   /*
+    * Constructors: no-arg, (message, reason), and one that builds its message via LexicalError.
+    */
+
+   public TokenMgrError() { // no message; errorCode keeps its default value (0)
+   }
+
+   public TokenMgrError(String message, int reason) {
+      super(message);
+      errorCode = reason; // expected to be one of the ordinals declared in this class (not checked here)
+   }
+
+   public TokenMgrError(boolean EOFSeen, int lexState, int errorLine, int errorColumn, String errorAfter, char curChar, int reason) {
+      this(LexicalError(EOFSeen, lexState, errorLine, errorColumn, errorAfter, curChar), reason); // message text comes from LexicalError
+   }
+}
Index: src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/Token.java
===================================================================
--- src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/Token.java	(revision 0)
+++ src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/Token.java	(revision 0)
@@ -0,0 +1,96 @@
+/* Generated By:JavaCC: Do not edit this line. Token.java Version 3.0 */
+/*
+ *  Copyright 2005 Blandware (http://www.blandware.com)
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+package org.apache.nutch.parse.rtf;
+
+/**
+ * Describes the input token stream.
+ */
+
+public class Token {
+
+  /**
+   * An integer that describes the kind of this token.  This numbering
+   * system is determined by JavaCCParser, and a table of these numbers is
+   * stored in the file ...Constants.java.
+   */
+  public int kind;
+
+  /**
+   * beginLine and beginColumn describe the position of the first character
+   * of this token; endLine and endColumn describe the position of the
+   * last character of this token.
+   */
+  public int beginLine, beginColumn, endLine, endColumn;
+
+  /**
+   * The string image of the token.
+   */
+  public String image;
+
+  /**
+   * A reference to the next regular (non-special) token from the input
+   * stream.  If this is the last token from the input stream, or if the
+   * token manager has not read tokens beyond this one, this field is
+   * set to null.  This is true only if this token is also a regular
+   * token.  Otherwise, see below for a description of the contents of
+   * this field.
+   */
+  public Token next;
+
+  /**
+   * This field is used to access special tokens that occur prior to this
+   * token, but after the immediately preceding regular (non-special) token.
+   * If there are no such special tokens, this field is set to null.
+   * When there are more than one such special token, this field refers
+   * to the last of these special tokens, which in turn refers to the next
+   * previous special token through its specialToken field, and so on
+   * until the first special token (whose specialToken field is null).
+   * The next fields of special tokens refer to other special tokens that
+   * immediately follow it (without an intervening regular token).  If there
+   * is no such token, this field is null.
+   */
+  public Token specialToken;
+
+  /**
+   * Returns the string image of this token, i.e. the image field (null if it was never set).
+   */
+  public String toString()
+  {
+     return image;
+  }
+
+  /**
+   * Returns a new Token object, by default. However, if you want, you
+   * can create and return subclass objects based on the value of ofKind.
+   * Simply add the cases to the switch for all those special cases.
+   * For example, if you have a subclass of Token called IDToken that
+   * you want to create if ofKind is ID, simply add something like :
+   *
+   *    case MyParserConstants.ID : return new IDToken();
+   *
+   * to the following switch statement. Then you can cast matchedToken
+   * variable to the appropriate type and use it in your lexical actions.
+   */
+  public static final Token newToken(int ofKind)
+  {
+     switch(ofKind)
+     {
+       default : return new Token(); // no kind-specific subclasses are registered, so every kind gets a plain Token
+     }
+  }
+
+}
Index: src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParserDelegate.java
===================================================================
--- src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParserDelegate.java	(revision 0)
+++ src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParserDelegate.java	(revision 0)
@@ -0,0 +1,119 @@
+/*
+ * Copyright (C) 2001 eTranslate, Inc. All Rights Reserved
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+ *
+ * Contact: <eric@etranslate.com>
+ */
+
+package org.apache.nutch.parse.rtf;
+
+import java.util.List;
+
+/**
+ * Implemented by classes that receive RTFParser messages.
+ *
+ * Created: Tue Jul  3 10:29:05 2001
+ *
+ * @author Eric Friedman
+ * @version $Id: RTFParserDelegate.java,v 1.1 2005/03/26 14:26:06 agrebnev Exp $
+ */
+
+public interface RTFParserDelegate {
+    /** CVS version info for this interface */
+    public static final String VERSION = "$Id: RTFParserDelegate.java,v 1.1 2005/03/26 14:26:06 agrebnev Exp $";
+
+    /** constants representing RTF contexts in which text events may occur */
+    public static final int IN_DOCUMENT = 0;
+    public static final int IN_FONTTBL = 1;
+    public static final int IN_FILETBL = 2;
+    public static final int IN_COLORTBL = 3;
+    public static final int IN_STYLESHEET = 4;
+    public static final int IN_LISTTABLE = 5;
+    public static final int IN_STYLE = 6;
+    public static final int IN_REVTBL = 7;
+    public static final int IN_INFO = 8;
+    public static final int IN_PNTEXT = 9;
+    public static final String NO_STYLE = new String(); // a distinct empty String instance; presumably a sentinel compared by identity -- confirm in the parser
+    
+    /**
+     * Receive a block of text from the RTF document.  The text is
+     * in the named style and occurs in <code>context</code>.
+     *
+     * <p>Style is guaranteed to have object identity with one of the
+     * styles in the list provided by the styleList message, if that
+     * has been called.</p>
+     *
+     * @param text a <code>String</code> value
+     * @param style a <code>String</code> value
+     * @param context an <code>int</code> value (presumably one of the IN_* constants above)
+     */
+    public void text(String text, String style, int context);
+
+    /**
+     * Receive a control symbol in a particular context.
+     *
+     * @param controlSymbol a <code>String</code> value
+     * @param context an <code>int</code> value (presumably one of the IN_* constants above)
+     */
+    public void controlSymbol(String controlSymbol, int context);
+
+    /**
+     * Receive a control word in a particular context.  The value, if
+     * not provided, will be <code>0</code> as per the RTF spec.
+     *
+     * @param controlWord a <code>String</code> value
+     * @param value an <code>int</code> value
+     * @param context an <code>int</code> value (presumably one of the IN_* constants above)
+     */
+    public void controlWord(String controlWord, int value, int context);
+
+    /**
+     * Receive notification about the opening of an RTF group with the
+     * specified depth. The depth value is that of the group just opened.
+     *
+     * @param depth an <code>int</code> value
+     */
+    public void openGroup(int depth);
+
+    /**
+     * Receive notification about the closing of an RTF group with the
+     * specified depth.  The depth value is that of the group just closed.
+     *
+     * @param depth an <code>int</code> value
+     */
+    public void closeGroup(int depth);
+
+    /**
+     * Receive notification about the list of style names defined for the
+     * document.
+     *
+     * @param styles a <code>List</code> of <code>String</code> objects.
+     */
+    public void styleList(List styles);
+    
+    /**
+     * The document parsing has begun.
+     *
+     */
+    public void startDocument();
+
+    /**
+     * Parsing is complete.
+     *
+     */
+    public void endDocument();
+
+}// RTFParserDelegate
Index: src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/SimpleCharStream.java
===================================================================
--- src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/SimpleCharStream.java	(revision 0)
+++ src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/SimpleCharStream.java	(revision 0)
@@ -0,0 +1,454 @@
+/* Generated By:JavaCC: Do not edit this line. SimpleCharStream.java Version 4.0 */
+/*
+ *  Copyright 2005 Blandware (http://www.blandware.com)
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+package org.apache.nutch.parse.rtf;
+
+/**
+ * An implementation of interface CharStream, where the stream is assumed to
+ * contain only ASCII characters (without unicode processing).
+ */
+
+public class SimpleCharStream
+{
+  public static final boolean staticFlag = false;
+  int bufsize;
+  int available;
+  int tokenBegin;
+  public int bufpos = -1;
+  protected int bufline[];
+  protected int bufcolumn[];
+
+  protected int column = 0;
+  protected int line = 1;
+
+  protected boolean prevCharIsCR = false;
+  protected boolean prevCharIsLF = false;
+
+  protected java.io.Reader inputStream;
+
+  protected char[] buffer;
+  protected int maxNextCharInd = 0;
+  protected int inBuf = 0;
+  protected int tabSize = 8;
+
+  protected void setTabSize(int i) { tabSize = i; } // tab width used by UpdateLineColumn when a tab advances the column
+  protected int getTabSize(int i) { return tabSize; } // NOTE: the argument is ignored; always returns the current tab size
+
+
+  // Grows buffer, bufline and bufcolumn by 2048 entries, moving the current token (from
+  // tokenBegin on) to index 0; with wrapAround the first bufpos entries are copied after it.
+  protected void ExpandBuff(boolean wrapAround)
+  {
+     char[] newbuffer = new char[bufsize + 2048];
+     int newbufline[] = new int[bufsize + 2048];
+     int newbufcolumn[] = new int[bufsize + 2048];
+
+     try
+     {
+        if (wrapAround)
+        {
+           System.arraycopy(buffer, tokenBegin, newbuffer, 0, bufsize - tokenBegin);
+           System.arraycopy(buffer, 0, newbuffer, bufsize - tokenBegin, bufpos);
+           buffer = newbuffer;
+
+           System.arraycopy(bufline, tokenBegin, newbufline, 0, bufsize - tokenBegin);
+           System.arraycopy(bufline, 0, newbufline, bufsize - tokenBegin, bufpos);
+           bufline = newbufline;
+
+           System.arraycopy(bufcolumn, tokenBegin, newbufcolumn, 0, bufsize - tokenBegin);
+           System.arraycopy(bufcolumn, 0, newbufcolumn, bufsize - tokenBegin, bufpos);
+           bufcolumn = newbufcolumn;
+
+           maxNextCharInd = (bufpos += (bufsize - tokenBegin));
+        }
+        else
+        {
+           System.arraycopy(buffer, tokenBegin, newbuffer, 0, bufsize - tokenBegin);
+           buffer = newbuffer;
+
+           System.arraycopy(bufline, tokenBegin, newbufline, 0, bufsize - tokenBegin);
+           bufline = newbufline;
+
+           System.arraycopy(bufcolumn, tokenBegin, newbufcolumn, 0, bufsize - tokenBegin);
+           bufcolumn = newbufcolumn;
+
+           maxNextCharInd = (bufpos -= tokenBegin);
+        }
+     }
+     catch (Throwable t)
+     {
+        throw new Error(t.getMessage(), t); // chain the original failure so its type and stack trace are not lost
+     }
+
+     bufsize += 2048;
+     available = bufsize;
+     tokenBegin = 0;
+  }
+
+  protected void FillBuff() throws java.io.IOException // reads more input into buffer, first making room if needed
+  {
+     if (maxNextCharInd == available) // the writable region [maxNextCharInd, available) is used up
+     {
+        if (available == bufsize)
+        {
+           if (tokenBegin > 2048) // more than 2048 slots precede the token: continue writing at index 0
+           {
+              bufpos = maxNextCharInd = 0;
+              available = tokenBegin;
+           }
+           else if (tokenBegin < 0) // no token start recorded (BeginToken sets -1): restart at index 0
+              bufpos = maxNextCharInd = 0;
+           else
+              ExpandBuff(false); // otherwise grow the arrays
+        }
+        else if (available > tokenBegin)
+           available = bufsize; // writing may continue up to the end of the buffer
+        else if ((tokenBegin - available) < 2048)
+           ExpandBuff(true); // fewer than 2048 slots remain before the token start: grow, copying the wrapped part too
+        else
+           available = tokenBegin; // writing may continue up to the token start
+     }
+     // Read into the writable region; end of input (-1) closes the stream and is reported as an IOException.
+     int i;
+     try {
+        if ((i = inputStream.read(buffer, maxNextCharInd,
+                                    available - maxNextCharInd)) == -1)
+        {
+           inputStream.close();
+           throw new java.io.IOException();
+        }
+        else
+           maxNextCharInd += i;
+        return;
+     }
+     catch(java.io.IOException e) {
+        --bufpos; // undo the increment readChar() performed before calling this method
+        backup(0); // amount 0: only wraps bufpos back into range if it became negative
+        if (tokenBegin == -1) // a BeginToken() call is in progress: record the current position as the token start
+           tokenBegin = bufpos;
+        throw e;
+     }
+  }
+
+  public char BeginToken() throws java.io.IOException // reads one character and marks it as the start of a new token
+  {
+     tokenBegin = -1; // "no token start yet" marker, tested by FillBuff while the character is read
+     char c = readChar();
+     tokenBegin = bufpos; // the token starts at the slot readChar() just returned
+
+     return c;
+  }
+
+  protected void UpdateLineColumn(char c) // advances line/column for c and records them at bufpos
+  {
+     column++;
+     // A line break seen on the previous character takes effect now, on the character after it.
+     if (prevCharIsLF)
+     {
+        prevCharIsLF = false;
+        line += (column = 1); // column restarts at 1 and line grows by that same 1
+     }
+     else if (prevCharIsCR)
+     {
+        prevCharIsCR = false;
+        if (c == '\n') // CR followed by LF: the line advance is deferred to the character after the LF
+        {
+           prevCharIsLF = true;
+        }
+        else
+           line += (column = 1);
+     }
+     // Remember line-break characters for the next call; a tab moves the column to the next tab stop.
+     switch (c)
+     {
+        case '\r' :
+           prevCharIsCR = true;
+           break;
+        case '\n' :
+           prevCharIsLF = true;
+           break;
+        case '\t' :
+           column--;
+           column += (tabSize - (column % tabSize)); // round up to the next multiple of tabSize
+           break;
+        default :
+           break;
+     }
+     // Store the position of this character alongside it in the buffer.
+     bufline[bufpos] = line;
+     bufcolumn[bufpos] = column;
+  }
+
+  public char readChar() throws java.io.IOException
+  {
+     if (inBuf <= 0)
+     {
+        // Nothing was backed up: advance, refilling when the filled part of the buffer is exhausted.
+        bufpos++;
+        if (bufpos >= maxNextCharInd)
+           FillBuff();
+        final char fresh = buffer[bufpos];
+        UpdateLineColumn(fresh);
+        return fresh;
+     }
+
+     // Serve a character previously pushed back by backup(); no line/column update on this path.
+     inBuf--;
+     bufpos++;
+     if (bufpos == bufsize)
+        bufpos = 0;
+     return buffer[bufpos];
+  }
+
+  /**
+   * @deprecated use {@link #getEndColumn} instead; both return the same value
+   * @see #getEndColumn
+   */
+
+  public int getColumn() {
+     return bufcolumn[bufpos];
+  }
+
+  /**
+   * @deprecated use {@link #getEndLine} instead; both return the same value
+   * @see #getEndLine
+   */
+
+  public int getLine() {
+     return bufline[bufpos];
+  }
+
+  public int getEndColumn() { // column recorded for the character at the current position (bufpos)
+     return bufcolumn[bufpos];
+  }
+
+  public int getEndLine() { // line recorded for the character at the current position (bufpos)
+     return bufline[bufpos];
+  }
+
+  public int getBeginColumn() { // column recorded for the first character of the current token
+     return bufcolumn[tokenBegin];
+  }
+
+  public int getBeginLine() { // line recorded for the first character of the current token
+     return bufline[tokenBegin];
+  }
+
+  public void backup(int amount) {
+    inBuf += amount;   // these characters will be served again by readChar()
+    bufpos -= amount;
+    if (bufpos < 0)    // stepped back past index 0: wrap around to the end of the buffer
+       bufpos += bufsize;
+  }
+
+  public SimpleCharStream(java.io.Reader dstream, int startline,
+  int startcolumn, int buffersize)
+  {
+    inputStream = dstream;
+    line = startline;
+    column = startcolumn - 1; // UpdateLineColumn increments column before recording it
+    // The character buffer and its per-character line/column arrays all get the same size.
+    available = bufsize = buffersize;
+    buffer = new char[buffersize];
+    bufline = new int[buffersize];
+    bufcolumn = new int[buffersize];
+  }
+
+  public SimpleCharStream(java.io.Reader dstream, int startline,
+                          int startcolumn)
+  {
+     this(dstream, startline, startcolumn, 4096); // default buffer size of 4096
+  }
+
+  public SimpleCharStream(java.io.Reader dstream)
+  {
+     this(dstream, 1, 1, 4096); // starts at line 1, column 1 with the default buffer size
+  }
+  public void ReInit(java.io.Reader dstream, int startline,
+  int startcolumn, int buffersize)
+  {
+    inputStream = dstream;
+    line = startline;
+    column = startcolumn - 1; // UpdateLineColumn increments column before recording it
+    // Arrays are reused unless missing (e.g. after Done()) or of another size; bufsize/available are then left as they are.
+    if (buffer == null || buffersize != buffer.length)
+    {
+      available = bufsize = buffersize;
+      buffer = new char[buffersize];
+      bufline = new int[buffersize];
+      bufcolumn = new int[buffersize];
+    }
+    prevCharIsLF = prevCharIsCR = false;
+    tokenBegin = inBuf = maxNextCharInd = 0;
+    bufpos = -1; // same initial position as a freshly constructed stream
+  }
+
+  public void ReInit(java.io.Reader dstream, int startline,
+                     int startcolumn)
+  {
+     ReInit(dstream, startline, startcolumn, 4096); // default buffer size of 4096
+  }
+
+  public void ReInit(java.io.Reader dstream)
+  {
+     ReInit(dstream, 1, 1, 4096); // restarts at line 1, column 1 with the default buffer size
+  }
+  public SimpleCharStream(java.io.InputStream dstream, String encoding, int startline,
+  int startcolumn, int buffersize) throws java.io.UnsupportedEncodingException
+  {
+     this(encoding == null ? new java.io.InputStreamReader(dstream) : new java.io.InputStreamReader(dstream, encoding), startline, startcolumn, buffersize); // null encoding: platform default charset
+  }
+
+  public SimpleCharStream(java.io.InputStream dstream, int startline,
+  int startcolumn, int buffersize)
+  {
+     this(new java.io.InputStreamReader(dstream), startline, startcolumn, buffersize); // decodes with the platform default charset
+  }
+
+  public SimpleCharStream(java.io.InputStream dstream, String encoding, int startline,
+                          int startcolumn) throws java.io.UnsupportedEncodingException
+  {
+     this(dstream, encoding, startline, startcolumn, 4096); // default buffer size of 4096
+  }
+
+  public SimpleCharStream(java.io.InputStream dstream, int startline,
+                          int startcolumn)
+  {
+     this(dstream, startline, startcolumn, 4096); // default buffer size of 4096
+  }
+
+  public SimpleCharStream(java.io.InputStream dstream, String encoding) throws java.io.UnsupportedEncodingException
+  {
+     this(dstream, encoding, 1, 1, 4096); // starts at line 1, column 1 with the default buffer size
+  }
+
+  public SimpleCharStream(java.io.InputStream dstream)
+  {
+     this(dstream, 1, 1, 4096); // starts at line 1, column 1 with the default buffer size
+  }
+
+  public void ReInit(java.io.InputStream dstream, String encoding, int startline,
+                          int startcolumn, int buffersize) throws java.io.UnsupportedEncodingException
+  {
+     ReInit(encoding == null ? new java.io.InputStreamReader(dstream) : new java.io.InputStreamReader(dstream, encoding), startline, startcolumn, buffersize); // null encoding: platform default charset
+  }
+
+  public void ReInit(java.io.InputStream dstream, int startline,
+                          int startcolumn, int buffersize)
+  {
+     ReInit(new java.io.InputStreamReader(dstream), startline, startcolumn, buffersize); // decodes with the platform default charset
+  }
+
+  public void ReInit(java.io.InputStream dstream, String encoding) throws java.io.UnsupportedEncodingException
+  {
+     ReInit(dstream, encoding, 1, 1, 4096); // restarts at line 1, column 1 with the default buffer size
+  }
+
+  public void ReInit(java.io.InputStream dstream)
+  {
+     ReInit(dstream, 1, 1, 4096); // restarts at line 1, column 1 with the default buffer size
+  }
+  public void ReInit(java.io.InputStream dstream, String encoding, int startline,
+                     int startcolumn) throws java.io.UnsupportedEncodingException
+  {
+     ReInit(dstream, encoding, startline, startcolumn, 4096); // default buffer size of 4096
+  }
+  public void ReInit(java.io.InputStream dstream, int startline,
+                     int startcolumn)
+  {
+     ReInit(dstream, startline, startcolumn, 4096); // default buffer size of 4096
+  }
+  public String GetImage()
+  {
+     if (bufpos < tokenBegin) { // the token wraps: head at the end of the buffer, tail at the front
+        final String head = new String(buffer, tokenBegin, bufsize - tokenBegin);
+        return head + new String(buffer, 0, bufpos + 1);
+     }
+     return new String(buffer, tokenBegin, bufpos - tokenBegin + 1);
+  }
+
+  public char[] GetSuffix(int len)
+  {
+     final char[] suffix = new char[len];
+     final int filled = bufpos + 1;        // characters available from index 0 through bufpos
+     final int wrapped = len - filled;     // characters that must come from the end of the buffer
+
+     if (wrapped <= 0)
+     {
+        System.arraycopy(buffer, filled - len, suffix, 0, len);
+        return suffix;
+     }
+     System.arraycopy(buffer, bufsize - wrapped, suffix, 0, wrapped);
+     System.arraycopy(buffer, 0, suffix, wrapped, filled);
+     return suffix;
+  }
+
+  public void Done() // drops the three arrays; ReInit allocates new ones when it finds buffer == null
+  {
+     buffer = null;
+     bufline = null;
+     bufcolumn = null;
+  }
+
+  /**
+   * Rewrites the recorded line/column entries from tokenBegin on so that the token starts at (newLine, newCol).
+   */
+  public void adjustBeginLineColumn(int newLine, int newCol)
+  {
+     int start = tokenBegin;
+     int len;
+     // len = number of slots from tokenBegin through bufpos, plus the inBuf backed-up characters.
+     if (bufpos >= tokenBegin)
+     {
+        len = bufpos - tokenBegin + inBuf + 1;
+     }
+     else
+     {
+        len = bufsize - tokenBegin + bufpos + 1 + inBuf; // the token wraps around the end of the buffer
+     }
+
+     int i = 0, j = 0, k = 0;
+     int nextColDiff = 0, columnDiff = 0;
+     // First pass: while consecutive entries share a line, assign newLine and keep their relative column offsets.
+     while (i < len &&
+            bufline[j = start % bufsize] == bufline[k = ++start % bufsize])
+     {
+        bufline[j] = newLine;
+        nextColDiff = columnDiff + bufcolumn[k] - bufcolumn[j];
+        bufcolumn[j] = newCol + columnDiff;
+        columnDiff = nextColDiff;
+        i++;
+     } 
+     // Remaining entries: only line numbers are rewritten, stepping newLine wherever the old line number changed.
+     if (i < len)
+     {
+        bufline[j] = newLine++;
+        bufcolumn[j] = newCol + columnDiff;
+
+        while (i++ < len)
+        {
+           if (bufline[j = start % bufsize] != bufline[++start % bufsize])
+              bufline[j] = newLine++;
+           else
+              bufline[j] = newLine;
+        }
+     }
+     // Subsequent tracking continues from the values stored in the last slot visited.
+     line = bufline[j];
+     column = bufcolumn[j];
+  }
+
+}
Index: src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParserTokenManager.java
===================================================================
--- src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParserTokenManager.java	(revision 0)
+++ src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParserTokenManager.java	(revision 0)
@@ -0,0 +1,1799 @@
+/* Generated By:JavaCC: Do not edit this line. RTFParserTokenManager.java */
+/*
+ *  Copyright 2005 Blandware (http://www.blandware.com)
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+package org.apache.nutch.parse.rtf;
+import java.io.*;
+import java.util.*;
+
+public class RTFParserTokenManager implements RTFParserConstants
+{
+  public  java.io.PrintStream debugStream = System.out; // defaults to System.out; its readers are not visible in this part of the file
+  public  void setDebugStream(java.io.PrintStream ds) { debugStream = ds; } // replaces debugStream; the argument is stored as given, without a null check
+private final int jjStopStringLiteralDfa_2(int pos, long active0)
+{
+   // Picks the NFA state to continue from after literal matching stopped.
+   // Only position 0 can produce a state: state 1, and only while at
+   // least one literal flagged in the mask 0x7ff06L is still active.
+   // Every other combination yields -1.
+   if (pos != 0)
+      return -1;
+
+   final boolean literalActive = (active0 & 0x7ff06L) != 0L;
+   return literalActive ? 1 : -1;
+}
+private final int jjStartNfa_2(int pos, long active0)
+{
+   return jjMoveNfa_2(jjStopStringLiteralDfa_2(pos, active0), pos + 1); // start state comes from jjStopStringLiteralDfa_2; the NFA continues at pos + 1
+}
+private final int jjStopAtPos(int pos, int kind)
+{
+   jjmatchedPos = pos;     // the match ends at this position ...
+   jjmatchedKind = kind;   // ... and has this token kind
+   return 1 + pos;         // one past the matched position
+}
+private final int jjStartNfaWithStates_2(int pos, int kind, int state)
+{
+   final int nextPos = pos + 1;                 // also the result when no further character can be read
+   jjmatchedPos = pos; jjmatchedKind = kind;    // record the literal matched so far
+   try { curChar = input_stream.readChar(); }
+   catch(java.io.IOException noMoreInput) { return nextPos; }
+   return jjMoveNfa_2(state, nextPos);          // try to extend the match from the given NFA state
+}
+private final int jjMoveStringLiteralDfa0_2()
+{
+   switch(curChar) // dispatch on the first character of the token
+   {
+      case 92: // backslash
+         jjmatchedKind = 1; // provisional match in case nothing longer is found
+         return jjMoveStringLiteralDfa1_2(0x7ff04L); // mask bits 2 and 8..18: the kinds tested on the second character
+      case 123: // '{'
+         return jjStopAtPos(0, 6);
+      case 125: // '}'
+         return jjStopAtPos(0, 7);
+      default :
+         return jjMoveNfa_2(0, 0); // no literal starts with this character; run the NFA from state 0
+   }
+}
+private final int jjMoveStringLiteralDfa1_2(long active0)
+{
+   try { curChar = input_stream.readChar(); }
+   catch(java.io.IOException e) { // no second character could be read
+      jjStopStringLiteralDfa_2(0, active0); // result is discarded
+      return 1;
+   }
+   switch(curChar) // second character of a two-character literal; each case checks that its kind is still active
+   {
+      case 10: // line feed
+         if ((active0 & 0x800L) != 0L)
+            return jjStopAtPos(1, 11);
+         break;
+      case 13: // carriage return
+         if ((active0 & 0x1000L) != 0L)
+            return jjStopAtPos(1, 12);
+         break;
+      case 39: // apostrophe
+         if ((active0 & 0x4L) != 0L)
+            return jjStopAtPos(1, 2);
+         break;
+      case 42: // '*'
+         if ((active0 & 0x2000L) != 0L)
+            return jjStopAtPos(1, 13);
+         break;
+      case 45: // '-'
+         if ((active0 & 0x200L) != 0L)
+            return jjStopAtPos(1, 9);
+         break;
+      case 58: // ':'
+         if ((active0 & 0x8000L) != 0L)
+            return jjStopAtPos(1, 15);
+         break;
+      case 92: // backslash
+         if ((active0 & 0x40000L) != 0L)
+            return jjStopAtPos(1, 18);
+         break;
+      case 95: // '_'
+         if ((active0 & 0x400L) != 0L)
+            return jjStopAtPos(1, 10);
+         break;
+      case 123: // '{'
+         if ((active0 & 0x10000L) != 0L)
+            return jjStopAtPos(1, 16);
+         break;
+      case 124: // '|'
+         if ((active0 & 0x4000L) != 0L)
+            return jjStopAtPos(1, 14);
+         break;
+      case 125: // '}'
+         if ((active0 & 0x20000L) != 0L)
+            return jjStopAtPos(1, 17);
+         break;
+      case 126: // '~'
+         if ((active0 & 0x100L) != 0L)
+            return jjStopAtPos(1, 8);
+         break;
+      default :
+         break;
+   }
+   return jjStartNfa_2(0, active0); // no two-character literal matched; continue with the NFA
+}
+private final void jjCheckNAdd(int state)
+{
+   // Skip states already stamped with the current round number.
+   if (jjrounds[state] == jjround)
+      return;
+   jjstateSet[jjnewStateCnt++] = state;
+   jjrounds[state] = jjround;
+}
+private final void jjAddStates(int start, int end)
+{
+   do {
+      jjstateSet[jjnewStateCnt++] = jjnextStates[start];
+   } while (start++ != end);
+}
+private final void jjCheckNAddTwoStates(int state1, int state2)
+{
+   jjCheckNAdd(state1); // each state is appended only if not already stamped with the current round
+   jjCheckNAdd(state2);
+}
+private final void jjCheckNAddStates(int start, int end)
+{
+   do {
+      jjCheckNAdd(jjnextStates[start]);
+   } while (start++ != end);
+}
+private final void jjCheckNAddStates(int start)
+{
+   jjCheckNAdd(jjnextStates[start]); // two-entry form: adds jjnextStates[start] and jjnextStates[start + 1]
+   jjCheckNAdd(jjnextStates[start + 1]);
+}
+static final long[] jjbitVec0 = { // four 64-bit words; presumably a bitmap consulted by jjCanMove_0 (not visible here) - confirm
+   0xfffffffffffffffeL, 0xffffffffffffffffL, 0xffffffffffffffffL, 0xffffffffffffffffL // every bit set except bit 0 of the first word
+};
+static final long[] jjbitVec2 = { // same layout as jjbitVec0
+   0x0L, 0x0L, 0xffffffffffffffffL, 0xffffffffffffffffL // first two words empty, last two words full
+};
+private final int jjMoveNfa_2(int startState, int curPos)
+{
+   int[] nextStates; // never used in this method
+   int startsAt = 0; // lower bound of the region of jjstateSet being consumed
+   jjnewStateCnt = 3; // new states are appended from index 3 while the region starting at 0 is consumed
+   int i = 1; // one past the last state to process; the initial set holds just startState
+   jjstateSet[0] = startState;
+   int j, kind = 0x7fffffff; // j is never used; 0x7fffffff means "nothing accepted at this position"
+   for (;;) // one iteration per input character
+   {
+      if (++jjround == 0x7fffffff) // fresh round stamp for jjCheckNAdd; ReInitRounds() is called when the counter reaches its maximum
+         ReInitRounds();
+      if (curChar < 64) // characters 0..63: one bit per character
+      {
+         long l = 1L << curChar;
+         MatchLoop: do
+         {
+            switch(jjstateSet[--i])
+            {
+               case 0:
+               case 2:
+                  if ((0xffffffffffffd9ffL & l) == 0L) // mask clears bits 9, 10, 13: tab, line feed, carriage return are rejected
+                     break;
+                  if (kind > 20) // keep the smallest kind number accepted at this position
+                     kind = 20;
+                  jjCheckNAdd(2);
+                  break;
+               case 1:
+                  if ((0xfc00fffeffffd9ffL & l) != 0L && kind > 19) // mask also clears bit 32 (space) and bits 48..57 (digits)
+                     kind = 19;
+                  break;
+               default : break;
+            }
+         } while(i != startsAt);
+      }
+      else if (curChar < 128) // characters 64..127: bit index is curChar & 077 (octal 77 = 63)
+      {
+         long l = 1L << (curChar & 077);
+         MatchLoop: do
+         {
+            switch(jjstateSet[--i])
+            {
+               case 0:
+                  if ((0xd7ffffffefffffffL & l) != 0L) // mask clears bits 28, 59, 61: backslash, '{', '}' are rejected
+                  {
+                     if (kind > 20)
+                        kind = 20;
+                     jjCheckNAdd(2);
+                  }
+                  else if (curChar == 92) // backslash: move to state 1
+                     jjstateSet[jjnewStateCnt++] = 1;
+                  break;
+               case 1:
+                  if ((0xd0000001e8000001L & l) != 0L && kind > 19) // accepts '@', '[', ']', '^', '_', '`', '|', '~' and char 127
+                     kind = 19;
+                  break;
+               case 2:
+                  if ((0xd7ffffffefffffffL & l) == 0L) // same character class as state 0 above
+                     break;
+                  if (kind > 20)
+                     kind = 20;
+                  jjCheckNAdd(2);
+                  break;
+               default : break;
+            }
+         } while(i != startsAt);
+      }
+      else // characters >= 128: membership is decided by jjCanMove_0, defined outside this view
+      {
+         int hiByte = (int)(curChar >> 8);
+         int i1 = hiByte >> 6;
+         long l1 = 1L << (hiByte & 077);
+         int i2 = (curChar & 0xff) >> 6;
+         long l2 = 1L << (curChar & 077);
+         MatchLoop: do
+         {
+            switch(jjstateSet[--i])
+            {
+               case 0:
+               case 2:
+                  if (!jjCanMove_0(hiByte, i1, i2, l1, l2))
+                     break;
+                  if (kind > 20)
+                     kind = 20;
+                  jjCheckNAdd(2);
+                  break;
+               case 1:
+                  if (jjCanMove_0(hiByte, i1, i2, l1, l2) && kind > 19)
+                     kind = 19;
+                  break;
+               default : break;
+            }
+         } while(i != startsAt);
+      }
+      if (kind != 0x7fffffff) // some state accepted this character: record the match and reset for the next position
+      {
+         jjmatchedKind = kind;
+         jjmatchedPos = curPos;
+         kind = 0x7fffffff;
+      }
+      ++curPos;
+      if ((i = jjnewStateCnt) == (startsAt = 3 - (jjnewStateCnt = startsAt))) // swap the two regions (starting at 0 and at 3); equal means no new states were added
+         return curPos;
+      try { curChar = input_stream.readChar(); }
+      catch(java.io.IOException e) { return curPos; } // no further character could be read: stop with what has been matched
+   }
+}
+private final int jjStopStringLiteralDfa_1(int pos, long active0)
+{
+   // The result is -1 for every position: neither pos nor active0 is
+   // consulted.  jjStartNfa_1 passes this value on to jjMoveNfa_1 as
+   // its start state.  The parameters mirror the signature of
+   // jjStopStringLiteralDfa_2.
+   return -1;
+}
+private final int jjStartNfa_1(int pos, long active0)
+{
+   return jjMoveNfa_1(jjStopStringLiteralDfa_1(pos, active0), pos + 1); // start state comes from jjStopStringLiteralDfa_1 (always -1); the NFA continues at pos + 1
+}
+private final int jjStartNfaWithStates_1(int pos, int kind, int state)
+{
+   final int nextPos = pos + 1;                 // also the result when no further character can be read
+   jjmatchedPos = pos; jjmatchedKind = kind;    // record the literal matched so far
+   try { curChar = input_stream.readChar(); }
+   catch(java.io.IOException noMoreInput) { return nextPos; }
+   return jjMoveNfa_1(state, nextPos);          // try to extend the match from the given NFA state
+}
+private final int jjMoveStringLiteralDfa0_1()
+{
+   switch(curChar) // dispatch on the first character of the token
+   {
+      case 92: // backslash
+         jjmatchedKind = 1; // provisional match in case nothing longer is found
+         return jjMoveStringLiteralDfa1_1(0x4L); // mask bit 2: the only kind tested on the second character
+      case 123: // '{'
+         return jjStopAtPos(0, 6);
+      case 125: // '}'
+         return jjStopAtPos(0, 7);
+      default :
+         return jjMoveNfa_1(0, 0); // no literal starts with this character; run the NFA from state 0
+   }
+}
+private final int jjMoveStringLiteralDfa1_1(long active0)
+{
+   try { curChar = input_stream.readChar(); }
+   catch(java.io.IOException e) { // no second character could be read
+      jjStopStringLiteralDfa_1(0, active0); // result is discarded
+      return 1;
+   }
+   switch(curChar)
+   {
+      case 39: // apostrophe
+         if ((active0 & 0x4L) != 0L)
+            return jjStopAtPos(1, 2);
+         break;
+      default :
+         break;
+   }
+   return jjStartNfa_1(0, active0); // no two-character literal matched; continue with the NFA
+}
+private final int jjMoveNfa_1(int startState, int curPos)
+{
+   int[] nextStates; // never used in this method
+   int startsAt = 0; // lower bound of the region of jjstateSet being consumed
+   jjnewStateCnt = 2; // new states are appended from index 2 while the region starting at 0 is consumed
+   int i = 1; // one past the last state to process; the initial set holds just startState
+   jjstateSet[0] = startState;
+   int j, kind = 0x7fffffff; // j is never used; 0x7fffffff means "nothing accepted at this position"
+   for (;;) // one iteration per input character
+   {
+      if (++jjround == 0x7fffffff) // ReInitRounds() is called when the round counter reaches its maximum
+         ReInitRounds();
+      if (curChar < 64) // characters 0..63: one bit per character
+      {
+         long l = 1L << curChar;
+         MatchLoop: do
+         {
+            switch(jjstateSet[--i])
+            {
+               case 0:
+                  if ((0x3ff000000000000L & l) != 0L) // bits 48..57: digits '0'..'9'
+                     jjstateSet[jjnewStateCnt++] = 1;
+                  break;
+               case 1:
+                  if ((0x3ff000000000000L & l) != 0L && kind > 22) // a second digit completes kind 22
+                     kind = 22;
+                  break;
+               default : break;
+            }
+         } while(i != startsAt);
+      }
+      else if (curChar < 128) // characters 64..127: bit index is curChar & 077 (octal 77 = 63)
+      {
+         long l = 1L << (curChar & 077);
+         MatchLoop: do
+         {
+            switch(jjstateSet[--i])
+            {
+               case 0:
+                  if ((0x7e0000007eL & l) != 0L) // bits 1..6 and 33..38: letters 'A'..'F' and 'a'..'f'
+                     jjstateSet[jjnewStateCnt++] = 1;
+                  break;
+               case 1:
+                  if ((0x7e0000007eL & l) != 0L && kind > 22) // a second hex letter completes kind 22
+                     kind = 22;
+                  break;
+               default : break;
+            }
+         } while(i != startsAt);
+      }
+      else // characters >= 128: no state accepts them (the switch below has only a default)
+      {
+         int hiByte = (int)(curChar >> 8);
+         int i1 = hiByte >> 6;
+         long l1 = 1L << (hiByte & 077);
+         int i2 = (curChar & 0xff) >> 6;
+         long l2 = 1L << (curChar & 077);
+         MatchLoop: do
+         {
+            switch(jjstateSet[--i])
+            {
+               default : break;
+            }
+         } while(i != startsAt);
+      }
+      if (kind != 0x7fffffff) // some state accepted this character: record the match and reset for the next position
+      {
+         jjmatchedKind = kind;
+         jjmatchedPos = curPos;
+         kind = 0x7fffffff;
+      }
+      ++curPos;
+      if ((i = jjnewStateCnt) == (startsAt = 2 - (jjnewStateCnt = startsAt))) // swap the two regions (starting at 0 and at 2); equal means no new states were added
+         return curPos;
+      try { curChar = input_stream.readChar(); }
+      catch(java.io.IOException e) { return curPos; } // no further character could be read: stop with what has been matched
+   }
+}
+private final int jjStopStringLiteralDfa_0(int pos, long active0, long active1)
+{
+   switch (pos) // pos = index of the last character examined by the literal DFA; result is the NFA start state (0) or -1
+   {
+      case 0:
+         if ((active0 & 0x800b8000000L) != 0L)
+            return 0;
+         if ((active0 & 0xfffff7ff40000000L) != 0L || (active1 & 0x7fffL) != 0L)
+         {
+            jjmatchedKind = 80; // side effect: kind 80 is recorded as the match so far (its name is declared outside this view)
+            return 0;
+         }
+         return -1;
+      case 1:
+         if ((active0 & 0x650000000L) != 0L)
+            return 0;
+         if ((active0 & 0xfffffff980000000L) != 0L || (active1 & 0x7fffL) != 0L)
+         {
+            if (jjmatchedPos != 1) // only overwrite when no match has been recorded at this position
+            {
+               jjmatchedKind = 80;
+               jjmatchedPos = 1;
+            }
+            return 0;
+         }
+         return -1;
+      case 2:
+         if ((active0 & 0xb001c00000000L) != 0L)
+            return 0;
+         if ((active1 & 0x1L) != 0L)
+         {
+            if (jjmatchedPos < 1) // record a kind-80 match ending at position 1 unless a later position is already recorded
+            {
+               jjmatchedKind = 80;
+               jjmatchedPos = 1;
+            }
+            return -1;
+         }
+         if ((active0 & 0xfff4ffe180000000L) != 0L || (active1 & 0x7ffeL) != 0L)
+         {
+            jjmatchedKind = 80;
+            jjmatchedPos = 2;
+            return 0;
+         }
+         return -1;
+      case 3:
+         if ((active0 & 0x1401e000000000L) != 0L)
+            return 0;
+         if ((active1 & 0x3808L) != 0L)
+         {
+            if (jjmatchedPos < 2)
+            {
+               jjmatchedKind = 80;
+               jjmatchedPos = 2;
+            }
+            return -1;
+         }
+         if ((active1 & 0x1L) != 0L)
+         {
+            if (jjmatchedPos < 1)
+            {
+               jjmatchedKind = 80;
+               jjmatchedPos = 1;
+            }
+            return -1;
+         }
+         if ((active0 & 0xffe0fe0180000000L) != 0L || (active1 & 0x47f6L) != 0L)
+         {
+            if (jjmatchedPos != 3)
+            {
+               jjmatchedKind = 80;
+               jjmatchedPos = 3;
+            }
+            return 0;
+         }
+         return -1;
+      case 4:
+         if ((active0 & 0x100000000L) != 0L)
+            return 0;
+         if ((active1 & 0x3808L) != 0L)
+         {
+            if (jjmatchedPos < 2)
+            {
+               jjmatchedKind = 80;
+               jjmatchedPos = 2;
+            }
+            return -1;
+         }
+         if ((active1 & 0x4L) != 0L)
+         {
+            if (jjmatchedPos < 3)
+            {
+               jjmatchedKind = 80;
+               jjmatchedPos = 3;
+            }
+            return -1;
+         }
+         if ((active1 & 0x1L) != 0L)
+         {
+            if (jjmatchedPos < 1)
+            {
+               jjmatchedKind = 80;
+               jjmatchedPos = 1;
+            }
+            return -1;
+         }
+         if ((active0 & 0xffe0fe4080000000L) != 0L || (active1 & 0x47f2L) != 0L)
+         {
+            jjmatchedKind = 80;
+            jjmatchedPos = 4;
+            return 0;
+         }
+         return -1;
+      case 5:
+         if ((active0 & 0xe60060000000000L) != 0L)
+            return 0;
+         if ((active1 & 0x3808L) != 0L)
+         {
+            if (jjmatchedPos < 2)
+            {
+               jjmatchedKind = 80;
+               jjmatchedPos = 2;
+            }
+            return -1;
+         }
+         if ((active1 & 0x4L) != 0L)
+         {
+            if (jjmatchedPos < 3)
+            {
+               jjmatchedKind = 80;
+               jjmatchedPos = 3;
+            }
+            return -1;
+         }
+         if ((active1 & 0x702L) != 0L)
+         {
+            if (jjmatchedPos < 4)
+            {
+               jjmatchedKind = 80;
+               jjmatchedPos = 4;
+            }
+            return -1;
+         }
+         if ((active1 & 0x1L) != 0L)
+         {
+            if (jjmatchedPos < 1)
+            {
+               jjmatchedKind = 80;
+               jjmatchedPos = 1;
+            }
+            return -1;
+         }
+         if ((active0 & 0xf180f84080000000L) != 0L || (active1 & 0x40f0L) != 0L)
+         {
+            jjmatchedKind = 80;
+            jjmatchedPos = 5;
+            return 0;
+         }
+         return -1;
+      case 6:
+         if ((active0 & 0x3180084000000000L) != 0L)
+            return 0;
+         if ((active1 & 0x3808L) != 0L)
+         {
+            if (jjmatchedPos < 2)
+            {
+               jjmatchedKind = 80;
+               jjmatchedPos = 2;
+            }
+            return -1;
+         }
+         if ((active1 & 0x4L) != 0L)
+         {
+            if (jjmatchedPos < 3)
+            {
+               jjmatchedKind = 80;
+               jjmatchedPos = 3;
+            }
+            return -1;
+         }
+         if ((active1 & 0x702L) != 0L)
+         {
+            if (jjmatchedPos < 4)
+            {
+               jjmatchedKind = 80;
+               jjmatchedPos = 4;
+            }
+            return -1;
+         }
+         if ((active1 & 0x1L) != 0L)
+         {
+            if (jjmatchedPos < 1)
+            {
+               jjmatchedKind = 80;
+               jjmatchedPos = 1;
+            }
+            return -1;
+         }
+         if ((active0 & 0xc000f00080000000L) != 0L || (active1 & 0x40f0L) != 0L)
+         {
+            jjmatchedKind = 80;
+            jjmatchedPos = 6;
+            return 0;
+         }
+         return -1;
+      case 7:
+         if ((active0 & 0x300080000000L) != 0L)
+            return 0;
+         if ((active1 & 0x3808L) != 0L)
+         {
+            if (jjmatchedPos < 2)
+            {
+               jjmatchedKind = 80;
+               jjmatchedPos = 2;
+            }
+            return -1;
+         }
+         if ((active1 & 0x4L) != 0L)
+         {
+            if (jjmatchedPos < 3)
+            {
+               jjmatchedKind = 80;
+               jjmatchedPos = 3;
+            }
+            return -1;
+         }
+         if ((active1 & 0x702L) != 0L)
+         {
+            if (jjmatchedPos < 4)
+            {
+               jjmatchedKind = 80;
+               jjmatchedPos = 4;
+            }
+            return -1;
+         }
+         if ((active1 & 0x1L) != 0L)
+         {
+            if (jjmatchedPos < 1)
+            {
+               jjmatchedKind = 80;
+               jjmatchedPos = 1;
+            }
+            return -1;
+         }
+         if ((active0 & 0xc000c00000000000L) != 0L || (active1 & 0x40f0L) != 0L)
+         {
+            if (jjmatchedPos != 7)
+            {
+               jjmatchedKind = 80;
+               jjmatchedPos = 7;
+            }
+            return 0;
+         }
+         return -1;
+      case 8:
+         if ((active0 & 0x800000000000L) != 0L || (active1 & 0x4010L) != 0L)
+         {
+            jjmatchedKind = 80;
+            jjmatchedPos = 8;
+            return 0;
+         }
+         if ((active0 & 0xc000400000000000L) != 0L)
+            return 0;
+         if ((active1 & 0xe0L) != 0L)
+         {
+            if (jjmatchedPos < 7)
+            {
+               jjmatchedKind = 80;
+               jjmatchedPos = 7;
+            }
+            return -1;
+         }
+         if ((active1 & 0x1800L) != 0L)
+         {
+            if (jjmatchedPos < 2)
+            {
+               jjmatchedKind = 80;
+               jjmatchedPos = 2;
+            }
+            return -1;
+         }
+         if ((active1 & 0x702L) != 0L)
+         {
+            if (jjmatchedPos < 4)
+            {
+               jjmatchedKind = 80;
+               jjmatchedPos = 4;
+            }
+            return -1;
+         }
+         if ((active1 & 0x1L) != 0L)
+         {
+            if (jjmatchedPos < 1)
+            {
+               jjmatchedKind = 80;
+               jjmatchedPos = 1;
+            }
+            return -1;
+         }
+         return -1;
+      case 9:
+         if ((active0 & 0x800000000000L) != 0L)
+            return 0;
+         if ((active1 & 0x4010L) != 0L)
+         {
+            if (jjmatchedPos != 9)
+            {
+               jjmatchedKind = 80;
+               jjmatchedPos = 9;
+            }
+            return 0;
+         }
+         if ((active1 & 0xe0L) != 0L)
+         {
+            if (jjmatchedPos < 7)
+            {
+               jjmatchedKind = 80;
+               jjmatchedPos = 7;
+            }
+            return -1;
+         }
+         if ((active1 & 0x702L) != 0L)
+         {
+            if (jjmatchedPos < 4)
+            {
+               jjmatchedKind = 80;
+               jjmatchedPos = 4;
+            }
+            return -1;
+         }
+         return -1;
+      case 10:
+         if ((active1 & 0xe0L) != 0L)
+         {
+            if (jjmatchedPos < 7)
+            {
+               jjmatchedKind = 80;
+               jjmatchedPos = 7;
+            }
+            return -1;
+         }
+         if ((active1 & 0x300L) != 0L)
+         {
+            if (jjmatchedPos < 4)
+            {
+               jjmatchedKind = 80;
+               jjmatchedPos = 4;
+            }
+            return -1;
+         }
+         if ((active1 & 0x4010L) != 0L)
+         {
+            jjmatchedKind = 80;
+            jjmatchedPos = 10;
+            return 0;
+         }
+         return -1;
+      case 11:
+         if ((active1 & 0x60L) != 0L)
+         {
+            if (jjmatchedPos < 7)
+            {
+               jjmatchedKind = 80;
+               jjmatchedPos = 7;
+            }
+            return -1;
+         }
+         if ((active1 & 0x4010L) != 0L)
+         {
+            jjmatchedKind = 80;
+            jjmatchedPos = 11;
+            return 0;
+         }
+         return -1;
+      case 12:
+         if ((active1 & 0x40L) != 0L)
+         {
+            if (jjmatchedPos < 7)
+            {
+               jjmatchedKind = 80;
+               jjmatchedPos = 7;
+            }
+            return -1;
+         }
+         if ((active1 & 0x4010L) != 0L)
+         {
+            jjmatchedKind = 80;
+            jjmatchedPos = 12;
+            return 0;
+         }
+         return -1;
+      case 13:
+         if ((active1 & 0x10L) != 0L)
+            return 0;
+         if ((active1 & 0x4000L) != 0L)
+         {
+            jjmatchedKind = 80;
+            jjmatchedPos = 13;
+            return 0;
+         }
+         return -1;
+      default : // positions beyond 13 never yield a start state
+         return -1;
+   }
+}
+private final int jjStartNfa_0(int pos, long active0, long active1)
+{
+   return jjMoveNfa_0(jjStopStringLiteralDfa_0(pos, active0, active1), pos + 1); // start state comes from jjStopStringLiteralDfa_0 (which may also record a kind-80 match); the NFA continues at pos + 1
+}
+private final int jjStartNfaWithStates_0(int pos, int kind, int state)
+{
+   final int nextPos = pos + 1;                 // also the result when no further character can be read
+   jjmatchedPos = pos; jjmatchedKind = kind;    // record the literal matched so far
+   try { curChar = input_stream.readChar(); }
+   catch(java.io.IOException noMoreInput) { return nextPos; }
+   return jjMoveNfa_0(state, nextPos);          // try to extend the match from the given NFA state
+}
+private final int jjMoveStringLiteralDfa0_0()
+{
+   switch(curChar) // dispatch on the first character; the masks passed on select the literal kinds that can start with it
+   {
+      case 9: // tab
+         return jjStopAtPos(0, 26);
+      case 10: // line feed
+         return jjStopAtPos(0, 24);
+      case 13: // carriage return
+         return jjStopAtPos(0, 25);
+      case 32: // space
+         return jjStopAtPos(0, 23);
+      case 92: // backslash
+         jjmatchedKind = 1; // provisional match in case nothing longer is found
+         return jjMoveStringLiteralDfa1_0(0x4L, 0x0L);
+      case 97: // 'a'
+         return jjMoveStringLiteralDfa1_0(0x6000000000L, 0x0L);
+      case 98: // 'b'
+         return jjMoveStringLiteralDfa1_0(0x200000000000000L, 0x0L);
+      case 99: // 'c'
+         return jjMoveStringLiteralDfa1_0(0x100040000000L, 0xfL);
+      case 100: // 'd'
+         return jjMoveStringLiteralDfa1_0(0x8000000000L, 0x0L);
+      case 101: // 'e'
+         return jjMoveStringLiteralDfa1_0(0x1e0000000000000L, 0x0L);
+      case 102: // 'f'
+         jjmatchedKind = 29; // a lone 'f' is itself kind 29; longer literals are tried next
+         return jjMoveStringLiteralDfa1_0(0x80080000000L, 0x0L);
+      case 105: // 'i'
+         return jjMoveStringLiteralDfa1_0(0x10000000000L, 0x0L);
+      case 108: // 'l'
+         return jjMoveStringLiteralDfa1_0(0x5410400000000000L, 0x0L);
+      case 109: // 'm'
+         return jjMoveStringLiteralDfa1_0(0x800000000L, 0x0L);
+      case 112: // 'p'
+         return jjMoveStringLiteralDfa1_0(0x8240700000000L, 0x0L);
+      case 114: // 'r'
+         return jjMoveStringLiteralDfa1_0(0xa800021000000000L, 0x0L);
+      case 115: // 's'
+         return jjMoveStringLiteralDfa1_0(0x800000000000L, 0x4000L);
+      case 116: // 't'
+         return jjMoveStringLiteralDfa1_0(0x1000000000000L, 0x3ff0L);
+      case 117: // 'u'
+         jjmatchedKind = 27; // a lone 'u' is itself kind 27; longer literals are tried next
+         return jjMoveStringLiteralDfa1_0(0x10000000L, 0x0L);
+      case 122: // 'z'
+         return jjMoveStringLiteralDfa1_0(0x6000000000000L, 0x0L);
+      case 123: // '{'
+         return jjStopAtPos(0, 6);
+      case 125: // '}'
+         return jjStopAtPos(0, 7);
+      default :
+         return jjMoveNfa_0(1, 0); // no literal starts with this character; run the NFA from state 1
+   }
+}
+private final int jjMoveStringLiteralDfa1_0(long active0, long active1)
+{
+   try { curChar = input_stream.readChar(); }
+   catch(java.io.IOException e) { // no second character could be read
+      jjStopStringLiteralDfa_0(0, active0, active1); // return value discarded; the call can still record a kind-80 match
+      return 1;
+   }
+   switch(curChar) // second character of the literal
+   {
+      case 39: // apostrophe
+         if ((active0 & 0x4L) != 0L)
+            return jjStopAtPos(1, 2);
+         break;
+      case 97: // 'a'
+         return jjMoveStringLiteralDfa2_0(active0, 0x9000800000000L, active1, 0L);
+      case 99: // 'c'
+         if ((active0 & 0x10000000L) != 0L)
+            return jjStartNfaWithStates_0(1, 28, 0);
+         else if ((active0 & 0x200000000L) != 0L)
+         {
+            jjmatchedKind = 33; // kind 33 ends here, but longer literals are still tried below
+            jjmatchedPos = 1;
+         }
+         return jjMoveStringLiteralDfa2_0(active0, 0x480000000L, active1, 0L);
+      case 100: // 'd'
+         return jjMoveStringLiteralDfa2_0(active0, 0xc000000000000000L, active1, 0xf0L);
+      case 101: // 'e'
+         return jjMoveStringLiteralDfa2_0(active0, 0x28000000000L, active1, 0x4000L);
+      case 105: // 'i'
+         return jjMoveStringLiteralDfa2_0(active0, 0x10400000000000L, active1, 0L);
+      case 108: // 'l'
+         return jjMoveStringLiteralDfa2_0(active0, 0x100000000L, active1, 0xfL);
+      case 109: // 'm'
+         return jjMoveStringLiteralDfa2_0(active0, 0xa0000000000000L, active1, 0L);
+      case 110: // 'n'
+         return jjMoveStringLiteralDfa2_0(active0, 0x140256000000000L, active1, 0L);
+      case 111: // 'o'
+         return jjMoveStringLiteralDfa2_0(active0, 0x180000000000L, active1, 0L);
+      case 113: // 'q'
+         return jjMoveStringLiteralDfa2_0(active0, 0xc00000000000000L, active1, 0L);
+      case 114: // 'r'
+         return jjMoveStringLiteralDfa2_0(active0, 0L, active1, 0x3f00L);
+      case 115: // 's'
+         if ((active0 & 0x40000000L) != 0L)
+            return jjStartNfaWithStates_0(1, 30, 0);
+         break;
+      case 116: // 't'
+         return jjMoveStringLiteralDfa2_0(active0, 0x3000801000000000L, active1, 0L);
+      case 117: // 'u'
+         return jjMoveStringLiteralDfa2_0(active0, 0x200000000000000L, active1, 0L);
+      case 119: // 'w'
+         return jjMoveStringLiteralDfa2_0(active0, 0x6000000000000L, active1, 0L);
+      default :
+         break;
+   }
+   return jjStartNfa_0(0, active0, active1); // no literal continues with this character; fall back to the NFA
+}
+private final int jjMoveStringLiteralDfa2_0(long old0, long active0, long old1, long active1)
+{
+   if (((active0 &= old0) | (active1 &= old1)) == 0L) // keep only candidates that were still active before this character
+      return jjStartNfa_0(0, old0, old1); // none left: fall back to the NFA with the previous candidate sets
+   try { curChar = input_stream.readChar(); }
+   catch(java.io.IOException e) { // no third character could be read
+      jjStopStringLiteralDfa_0(1, active0, active1); // return value discarded; the call can still record a kind-80 match
+      return 2;
+   }
+   switch(curChar) // third character of the literal
+   {
+      case 70: // 'F'
+         return jjMoveStringLiteralDfa3_0(active0, 0L, active1, 0x1L);
+      case 78: // 'N'
+         return jjMoveStringLiteralDfa3_0(active0, 0L, active1, 0x4L);
+      case 97: // 'a'
+         if ((active0 & 0x400000000L) != 0L)
+            return jjStartNfaWithStates_0(2, 34, 0);
+         return jjMoveStringLiteralDfa3_0(active0, 0x100000000L, active1, 0L);
+      case 98: // 'b'
+         if ((active0 & 0x1000000000000L) != 0L)
+            return jjStartNfaWithStates_0(2, 48, 0);
+         return jjMoveStringLiteralDfa3_0(active0, 0xc000000000000000L, active1, 0L);
+      case 99: // 'c'
+         if ((active0 & 0x800000000L) != 0L)
+            return jjStartNfaWithStates_0(2, 35, 0);
+         return jjMoveStringLiteralDfa3_0(active0, 0L, active1, 0x4000L);
+      case 100: // 'd'
+         return jjMoveStringLiteralDfa3_0(active0, 0x60000000000000L, active1, 0L);
+      case 102: // 'f'
+         if ((active0 & 0x1000000000L) != 0L)
+            return jjStartNfaWithStates_0(2, 36, 0);
+         return jjMoveStringLiteralDfa3_0(active0, 0x18000000000L, active1, 0x7f2L);
+      case 104: // 'h'
+         return jjMoveStringLiteralDfa3_0(active0, 0x80000000L, active1, 0L);
+      case 106: // 'j'
+         if ((active0 & 0x2000000000000L) != 0L)
+            return jjStartNfaWithStates_0(2, 49, 0);
+         break;
+      case 108: // 'l'
+         return jjMoveStringLiteralDfa3_0(active0, 0x2200100000000000L, active1, 0L);
+      case 110: // 'n'
+         return jjMoveStringLiteralDfa3_0(active0, 0x14080000000000L, active1, 0L);
+      case 114: // 'r'
+         if ((active0 & 0x8000000000000L) != 0L)
+            return jjStartNfaWithStates_0(2, 51, 0);
+         return jjMoveStringLiteralDfa3_0(active0, 0x1000000000000000L, active1, 0L);
+      case 115: // 's'
+         return jjMoveStringLiteralDfa3_0(active0, 0x180606000000000L, active1, 0L);
+      case 116: // 't'
+         return jjMoveStringLiteralDfa3_0(active0, 0x40000000000L, active1, 0L);
+      case 117: // 'u'
+         return jjMoveStringLiteralDfa3_0(active0, 0xc00000000000000L, active1, 0L);
+      case 118: // 'v'
+         return jjMoveStringLiteralDfa3_0(active0, 0x20000000000L, active1, 0L);
+      case 119: // 'w'
+         return jjMoveStringLiteralDfa3_0(active0, 0L, active1, 0x3808L);
+      case 121: // 'y'
+         return jjMoveStringLiteralDfa3_0(active0, 0x800000000000L, active1, 0L);
+      default :
+         break;
+   }
+   return jjStartNfa_0(1, active0, active1); // no literal continues with this character; fall back to the NFA
+}
+private final int jjMoveStringLiteralDfa3_0(long old0, long active0, long old1, long active1)
+{
+   if (((active0 &= old0) | (active1 &= old1)) == 0L) // keep only candidates that were still active before this character
+      return jjStartNfa_0(1, old0, old1); // none left: fall back to the NFA with the previous candidate sets
+   try { curChar = input_stream.readChar(); }
+   catch(java.io.IOException e) { // no fourth character could be read
+      jjStopStringLiteralDfa_0(2, active0, active1); // return value discarded; the call can still record a kind-80 match
+      return 3;
+   }
+   switch(curChar) // fourth character of the literal
+   {
+      case 87: // 'W'
+         return jjMoveStringLiteralDfa4_0(active0, 0L, active1, 0x3808L);
+      case 97: // 'a'
+         return jjMoveStringLiteralDfa4_0(active0, 0x60000080000000L, active1, 0L);
+      case 101: // 'e'
+         if ((active0 & 0x10000000000000L) != 0L)
+            return jjStartNfaWithStates_0(3, 52, 0);
+         return jjMoveStringLiteralDfa4_0(active0, 0x240000000000L, active1, 0L);
+      case 102: // 'f'
+         if ((active0 & 0x8000000000L) != 0L)
+            return jjStartNfaWithStates_0(3, 39, 0);
+         break;
+      case 105: // 'i'
+         if ((active0 & 0x2000000000L) != 0L)
+         {
+            jjmatchedKind = 37; // kind 37 ends here, but longer literals are still tried below
+            jjmatchedPos = 3;
+         }
+         return jjMoveStringLiteralDfa4_0(active0, 0x4100000000L, active1, 0x1L);
+      case 106: // 'j'
+         if ((active0 & 0x4000000000000L) != 0L)
+            return jjStartNfaWithStates_0(3, 50, 0);
+         break;
+      case 108: // 'l'
+         return jjMoveStringLiteralDfa4_0(active0, 0xc200800000000000L, active1, 0L);
+      case 109: // 'm'
+         return jjMoveStringLiteralDfa4_0(active0, 0x3000000000000000L, active1, 0L);
+      case 111: // 'o'
+         if ((active0 & 0x10000000000L) != 0L)
+            return jjStartNfaWithStates_0(3, 40, 0);
+         return jjMoveStringLiteralDfa4_0(active0, 0xc00100000000000L, active1, 0x4L);
+      case 112: // 'p'
+         return jjMoveStringLiteralDfa4_0(active0, 0x180000000000000L, active1, 0L);
+      case 114: // 'r'
+         return jjMoveStringLiteralDfa4_0(active0, 0L, active1, 0xf0L);
+      case 116: // 't'
+         return jjMoveStringLiteralDfa4_0(active0, 0x4a0000000000L, active1, 0x4702L);
+      default :
+         break;
+   }
+   return jjStartNfa_0(2, active0, active1); // no literal continues with this character; fall back to the NFA
+}
+private final int jjMoveStringLiteralDfa4_0(long old0, long active0, long old1, long active1)
+{
+   if (((active0 &= old0) | (active1 &= old1)) == 0L) // keep only candidates that were still active before this character
+      return jjStartNfa_0(2, old0, old1); // none left: fall back to the NFA with the previous candidate sets
+   try { curChar = input_stream.readChar(); }
+   catch(java.io.IOException e) { // no fifth character could be read
+      jjStopStringLiteralDfa_0(3, active0, active1); // return value discarded; the call can still record a kind-80 match
+      return 4;
+   }
+   switch(curChar) // fifth character of the literal
+   {
+      case 87: // 'W'
+         return jjMoveStringLiteralDfa5_0(active0, 0L, active1, 0x4L);
+      case 97: // 'a'
+         return jjMoveStringLiteralDfa5_0(active0, 0x3180000000000000L, active1, 0L);
+      case 98: // 'b'
+         return jjMoveStringLiteralDfa5_0(active0, 0x20000000000L, active1, 0L);
+      case 99: // 'c'
+         return jjMoveStringLiteralDfa5_0(active0, 0x204000000000L, active1, 0L);
+      case 101: // 'e'
+         return jjMoveStringLiteralDfa5_0(active0, 0x200800000000000L, active1, 0L);
+      case 105: // 'i'
+         return jjMoveStringLiteralDfa5_0(active0, 0L, active1, 0x3808L);
+      case 109: // 'm'
+         return jjMoveStringLiteralDfa5_0(active0, 0L, active1, 0xf0L);
+      case 110: // 'n'
+         if ((active0 & 0x100000000L) != 0L)
+            return jjStartNfaWithStates_0(4, 32, 0);
+         break;
+      case 113: // 'q'
+         return jjMoveStringLiteralDfa5_0(active0, 0xc000000000000000L, active1, 0L);
+      case 114: // 'r'
+         return jjMoveStringLiteralDfa5_0(active0, 0x100080000000L, active1, 0L);
+      case 115: // 's'
+         return jjMoveStringLiteralDfa5_0(active0, 0x60000000000000L, active1, 0x4702L);
+      case 116: // 't'
+         return jjMoveStringLiteralDfa5_0(active0, 0xc00480000000000L, active1, 0x1L);
+      case 120: // 'x'
+         return jjMoveStringLiteralDfa5_0(active0, 0x40000000000L, active1, 0L);
+      default :
+         break;
+   }
+   return jjStartNfa_0(3, active0, active1); // no literal continues with this character; fall back to the NFA
+}
+private final int jjMoveStringLiteralDfa5_0(long old0, long active0, long old1, long active1) // step 5 of the generated string-literal matcher (_0 family)
+{
+   if (((active0 &= old0) | (active1 &= old1)) == 0L) // narrow both masks to the bits the caller still had set
+      return jjStartNfa_0(3, old0, old1); // no candidate bit survived: delegate with the caller's masks
+   try { curChar = input_stream.readChar(); }
+   catch(java.io.IOException e) { // readChar() failed; NOTE(review): presumably end of input - confirm in SimpleCharStream
+      jjStopStringLiteralDfa_0(4, active0, active1);
+      return 5;
+   }
+   switch(curChar) // labels are character codes; each branch forwards only the mask bits it selects
+   {
+      case 84: // 'T'
+         return jjMoveStringLiteralDfa6_0(active0, 0L, active1, 0x1L);
+      case 87: // 'W'
+         return jjMoveStringLiteralDfa6_0(active0, 0L, active1, 0x702L);
+      case 97: // 'a'
+         return jjMoveStringLiteralDfa6_0(active0, 0x400000000000L, active1, 0L);
+      case 98: // 'b'
+         return jjMoveStringLiteralDfa6_0(active0, 0x80000000000L, active1, 0L);
+      case 99: // 'c'
+         return jjMoveStringLiteralDfa6_0(active0, 0x180000000000000L, active1, 0L);
+      case 100: // 'd'
+         return jjMoveStringLiteralDfa6_0(active0, 0L, active1, 0x3808L);
+      case 101: // 'e'
+         if ((active0 & 0x400000000000000L) != 0L)
+            return jjStartNfaWithStates_0(5, 58, 0); // 58 = LQUOTE
+         else if ((active0 & 0x800000000000000L) != 0L)
+            return jjStartNfaWithStates_0(5, 59, 0); // 59 = RQUOTE
+         break;
+      case 104: // 'h'
+         if ((active0 & 0x20000000000000L) != 0L)
+            return jjStartNfaWithStates_0(5, 53, 0); // 53 = EMDASH
+         else if ((active0 & 0x40000000000000L) != 0L)
+            return jjStartNfaWithStates_0(5, 54, 0); // 54 = ENDASH
+         break;
+      case 108: // 'l'
+         if ((active0 & 0x20000000000L) != 0L)
+            return jjStartNfaWithStates_0(5, 41, 0); // 41 = REVTBL
+         return jjMoveStringLiteralDfa6_0(active0, 0x200000000000L, active1, 0L);
+      case 112: // 'p'
+         return jjMoveStringLiteralDfa6_0(active0, 0x4000000000L, active1, 0x4000L);
+      case 114: // 'r'
+         return jjMoveStringLiteralDfa6_0(active0, 0x3000000000000000L, active1, 0x4L);
+      case 115: // 's'
+         return jjMoveStringLiteralDfa6_0(active0, 0x800080000000L, active1, 0L);
+      case 116: // 't'
+         if ((active0 & 0x40000000000L) != 0L)
+            return jjStartNfaWithStates_0(5, 42, 0); // 42 = PNTEXT
+         else if ((active0 & 0x200000000000000L) != 0L)
+            return jjStartNfaWithStates_0(5, 57, 0); // 57 = BULLET
+         return jjMoveStringLiteralDfa6_0(active0, 0x100000000000L, active1, 0xf0L);
+      case 117: // 'u'
+         return jjMoveStringLiteralDfa6_0(active0, 0xc000000000000000L, active1, 0L);
+      default :
+         break;
+   }
+   return jjStartNfa_0(4, active0, active1); // character not handled above: delegate with the narrowed masks
+}
+private final int jjMoveStringLiteralDfa6_0(long old0, long active0, long old1, long active1) // step 6 of the generated string-literal matcher (_0 family)
+{
+   if (((active0 &= old0) | (active1 &= old1)) == 0L) // narrow both masks to the bits the caller still had set
+      return jjStartNfa_0(4, old0, old1); // no candidate bit survived: delegate with the caller's masks
+   try { curChar = input_stream.readChar(); }
+   catch(java.io.IOException e) { // readChar() failed; NOTE(review): presumably end of input - confirm in SimpleCharStream
+      jjStopStringLiteralDfa_0(5, active0, active1);
+      return 6;
+   }
+   switch(curChar) // labels are character codes; each branch forwards only the mask bits it selects
+   {
+      case 97: // 'a'
+         return jjMoveStringLiteralDfa7_0(active0, 0L, active1, 0x4L);
+      case 98: // 'b'
+         return jjMoveStringLiteralDfa7_0(active0, 0x500000000000L, active1, 0L);
+      case 101: // 'e'
+         if ((active0 & 0x80000000000000L) != 0L)
+            return jjStartNfaWithStates_0(6, 55, 0); // 55 = EMSPACE
+         else if ((active0 & 0x100000000000000L) != 0L)
+            return jjStartNfaWithStates_0(6, 56, 0); // 56 = ENSPACE
+         return jjMoveStringLiteralDfa7_0(active0, 0x80000000L, active1, 0x4001L);
+      case 103: // 'g'
+         if ((active0 & 0x4000000000L) != 0L)
+            return jjStartNfaWithStates_0(6, 38, 0); // 38 = ANSICPG
+         break;
+      case 104: // 'h'
+         return jjMoveStringLiteralDfa7_0(active0, 0x800000000000L, active1, 0L);
+      case 105: // 'i'
+         return jjMoveStringLiteralDfa7_0(active0, 0L, active1, 0x702L);
+      case 107: // 'k'
+         if ((active0 & 0x1000000000000000L) != 0L)
+            return jjStartNfaWithStates_0(6, 60, 0); // 60 = LTRMARK
+         else if ((active0 & 0x2000000000000000L) != 0L)
+            return jjStartNfaWithStates_0(6, 61, 0); // 61 = RTLMARK
+         break;
+      case 108: // 'l'
+         if ((active0 & 0x80000000000L) != 0L)
+            return jjStartNfaWithStates_0(6, 43, 0); // 43 = FONTTBL
+         break;
+      case 111: // 'o'
+         return jjMoveStringLiteralDfa7_0(active0, 0xc000000000000000L, active1, 0L);
+      case 116: // 't'
+         return jjMoveStringLiteralDfa7_0(active0, 0L, active1, 0x3808L);
+      case 118: // 'v'
+         return jjMoveStringLiteralDfa7_0(active0, 0x200000000000L, active1, 0L);
+      case 120: // 'x'
+         return jjMoveStringLiteralDfa7_0(active0, 0L, active1, 0xf0L);
+      default :
+         break;
+   }
+   return jjStartNfa_0(5, active0, active1); // character not handled above: delegate with the narrowed masks
+}
+private final int jjMoveStringLiteralDfa7_0(long old0, long active0, long old1, long active1) // step 7 of the generated string-literal matcher (_0 family)
+{
+   if (((active0 &= old0) | (active1 &= old1)) == 0L) // narrow both masks to the bits the caller still had set
+      return jjStartNfa_0(5, old0, old1); // no candidate bit survived: delegate with the caller's masks
+   try { curChar = input_stream.readChar(); }
+   catch(java.io.IOException e) { // readChar() failed; NOTE(review): presumably end of input - confirm in SimpleCharStream
+      jjStopStringLiteralDfa_0(6, active0, active1);
+      return 7;
+   }
+   switch(curChar) // labels are character codes; each branch forwards only the mask bits it selects
+   {
+      case 99: // 'c'
+         return jjMoveStringLiteralDfa8_0(active0, 0L, active1, 0x4000L);
+      case 100: // 'd'
+         return jjMoveStringLiteralDfa8_0(active0, 0L, active1, 0x702L);
+      case 101: // 'e'
+         return jjMoveStringLiteralDfa8_0(active0, 0x800000000000L, active1, 0L);
+      case 104: // 'h'
+         if ((active1 & 0x8L) != 0L)
+            return jjStopAtPos(7, 67); // 67 = CLWWIDTH
+         else if ((active1 & 0x2000L) != 0L)
+         {
+            jjmatchedKind = 77; // 77 = TRWWIDTH; recorded here, then matching continues below with bits 0x1800
+            jjmatchedPos = 7;
+         }
+         return jjMoveStringLiteralDfa8_0(active0, 0L, active1, 0x1800L);
+      case 108: // 'l'
+         if ((active0 & 0x100000000000L) != 0L)
+            return jjStartNfaWithStates_0(7, 44, 0); // 44 = COLORTBL
+         else if ((active0 & 0x200000000000L) != 0L)
+            return jjStartNfaWithStates_0(7, 45, 0); // 45 = PNSECLVL
+         return jjMoveStringLiteralDfa8_0(active0, 0x400000000000L, active1, 0L);
+      case 112: // 'p'
+         if ((active1 & 0x4L) != 0L)
+            return jjStopAtPos(7, 66); // 66 = CLNOWRAP
+         break;
+      case 116: // 't'
+         if ((active0 & 0x80000000L) != 0L)
+            return jjStartNfaWithStates_0(7, 31, 0); // 31 = FCHARSET
+         return jjMoveStringLiteralDfa8_0(active0, 0xc000000000000000L, active1, 0xf0L);
+      case 120: // 'x'
+         return jjMoveStringLiteralDfa8_0(active0, 0L, active1, 0x1L);
+      default :
+         break;
+   }
+   return jjStartNfa_0(6, active0, active1); // character not handled above: delegate with the narrowed masks
+}
+private final int jjMoveStringLiteralDfa8_0(long old0, long active0, long old1, long active1) // step 8 of the generated string-literal matcher (_0 family)
+{
+   if (((active0 &= old0) | (active1 &= old1)) == 0L) // narrow both masks to the bits the caller still had set
+      return jjStartNfa_0(6, old0, old1); // no candidate bit survived: delegate with the caller's masks
+   try { curChar = input_stream.readChar(); }
+   catch(java.io.IOException e) { // readChar() failed; NOTE(review): presumably end of input - confirm in SimpleCharStream
+      jjStopStringLiteralDfa_0(7, active0, active1);
+      return 8;
+   }
+   switch(curChar) // labels are character codes; each branch forwards only the mask bits it selects
+   {
+      case 65: // 'A'
+         if ((active1 & 0x800L) != 0L)
+            return jjStopAtPos(8, 75); // 75 = TRWWIDTHA
+         break;
+      case 66: // 'B'
+         if ((active1 & 0x1000L) != 0L)
+            return jjStopAtPos(8, 76); // 76 = TRWWIDTHB
+         return jjMoveStringLiteralDfa9_0(active0, 0L, active1, 0x10L);
+      case 76: // 'L'
+         return jjMoveStringLiteralDfa9_0(active0, 0L, active1, 0x20L);
+      case 82: // 'R'
+         return jjMoveStringLiteralDfa9_0(active0, 0L, active1, 0x40L);
+      case 84: // 'T'
+         return jjMoveStringLiteralDfa9_0(active0, 0L, active1, 0x80L);
+      case 101: // 'e'
+         if ((active0 & 0x400000000000L) != 0L)
+            return jjStartNfaWithStates_0(8, 46, 0); // 46 = LISTTABLE
+         else if ((active0 & 0x4000000000000000L) != 0L)
+            return jjStartNfaWithStates_0(8, 62, 0); // 62 = LDBLQUOTE
+         else if ((active0 & 0x8000000000000000L) != 0L)
+            return jjStartNfaWithStates_0(8, 63, 0); // 63 = RDBLQUOTE
+         return jjMoveStringLiteralDfa9_0(active0, 0x800000000000L, active1, 0L);
+      case 105: // 'i'
+         return jjMoveStringLiteralDfa9_0(active0, 0L, active1, 0x4000L);
+      case 116: // 't'
+         if ((active1 & 0x1L) != 0L)
+            return jjStopAtPos(8, 64); // 64 = CLFITTEXT
+         return jjMoveStringLiteralDfa9_0(active0, 0L, active1, 0x702L);
+      default :
+         break;
+   }
+   return jjStartNfa_0(7, active0, active1); // character not handled above: delegate with the narrowed masks
+}
+private final int jjMoveStringLiteralDfa9_0(long old0, long active0, long old1, long active1) // step 9 of the generated string-literal matcher (_0 family)
+{
+   if (((active0 &= old0) | (active1 &= old1)) == 0L) // narrow both masks to the bits the caller still had set
+      return jjStartNfa_0(7, old0, old1); // no candidate bit survived: delegate with the caller's masks
+   try { curChar = input_stream.readChar(); }
+   catch(java.io.IOException e) { // readChar() failed; NOTE(review): presumably end of input - confirm in SimpleCharStream
+      jjStopStringLiteralDfa_0(8, active0, active1);
+      return 9;
+   }
+   switch(curChar) // labels are character codes; each branch forwards only the mask bits it selects
+   {
+      case 101: // 'e'
+         return jjMoveStringLiteralDfa10_0(active0, 0L, active1, 0x20L);
+      case 102: // 'f'
+         return jjMoveStringLiteralDfa10_0(active0, 0L, active1, 0x4000L);
+      case 104: // 'h'
+         if ((active1 & 0x2L) != 0L)
+            return jjStopAtPos(9, 65); // 65 = CLFTSWIDTH
+         else if ((active1 & 0x400L) != 0L)
+         {
+            jjmatchedKind = 74; // 74 = TRFTSWIDTH; recorded here, then matching continues below with bits 0x300
+            jjmatchedPos = 9;
+         }
+         return jjMoveStringLiteralDfa10_0(active0, 0L, active1, 0x300L);
+      case 105: // 'i'
+         return jjMoveStringLiteralDfa10_0(active0, 0L, active1, 0x40L);
+      case 111: // 'o'
+         return jjMoveStringLiteralDfa10_0(active0, 0L, active1, 0x90L);
+      case 116: // 't'
+         if ((active0 & 0x800000000000L) != 0L)
+            return jjStartNfaWithStates_0(9, 47, 0); // 47 = STYLESHEET
+         break;
+      default :
+         break;
+   }
+   return jjStartNfa_0(8, active0, active1); // character not handled above: delegate with the narrowed masks
+}
+private final int jjMoveStringLiteralDfa10_0(long old0, long active0, long old1, long active1) // step 10 of the generated string-literal matcher (_0 family)
+{
+   if (((active0 &= old0) | (active1 &= old1)) == 0L) // narrow both masks to the bits the caller still had set
+      return jjStartNfa_0(8, old0, old1); // no candidate bit survived: delegate with the caller's masks
+   try { curChar = input_stream.readChar(); }
+   catch(java.io.IOException e) { // readChar() failed; NOTE(review): presumably end of input - confirm in SimpleCharStream
+      jjStopStringLiteralDfa_0(9, 0L, active1); // from here on only active1 is passed along; the first mask is always 0L
+      return 10;
+   }
+   switch(curChar) // labels are character codes
+   {
+      case 65: // 'A'
+         if ((active1 & 0x100L) != 0L)
+            return jjStopAtPos(10, 72); // 72 = TRFTSWIDTHA
+         break;
+      case 66: // 'B'
+         if ((active1 & 0x200L) != 0L)
+            return jjStopAtPos(10, 73); // 73 = TRFTSWIDTHB
+         break;
+      case 102: // 'f'
+         return jjMoveStringLiteralDfa11_0(active1, 0x20L);
+      case 103: // 'g'
+         return jjMoveStringLiteralDfa11_0(active1, 0x40L);
+      case 112: // 'p'
+         if ((active1 & 0x80L) != 0L)
+            return jjStopAtPos(10, 71); // 71 = TDFRMTXTTOP
+         break;
+      case 116: // 't'
+         return jjMoveStringLiteralDfa11_0(active1, 0x10L);
+      case 121: // 'y'
+         return jjMoveStringLiteralDfa11_0(active1, 0x4000L);
+      default :
+         break;
+   }
+   return jjStartNfa_0(9, 0L, active1); // character not handled above: delegate with the narrowed mask
+}
+private final int jjMoveStringLiteralDfa11_0(long old1, long active1) // step 11 of the generated string-literal matcher; only the second mask is tracked here
+{
+   if (((active1 &= old1)) == 0L) // narrow the mask to the bits the caller still had set
+      return jjStartNfa_0(9, 0L, old1); // no candidate bit survived: delegate with the caller's mask
+   try { curChar = input_stream.readChar(); }
+   catch(java.io.IOException e) { // readChar() failed; NOTE(review): presumably end of input - confirm in SimpleCharStream
+      jjStopStringLiteralDfa_0(10, 0L, active1);
+      return 11;
+   }
+   switch(curChar) // labels are character codes
+   {
+      case 103: // 'g'
+         return jjMoveStringLiteralDfa12_0(active1, 0x4000L);
+      case 104: // 'h'
+         return jjMoveStringLiteralDfa12_0(active1, 0x40L);
+      case 116: // 't'
+         if ((active1 & 0x20L) != 0L)
+            return jjStopAtPos(11, 69); // 69 = TDFRMTXTLEFT
+         return jjMoveStringLiteralDfa12_0(active1, 0x10L);
+      default :
+         break;
+   }
+   return jjStartNfa_0(10, 0L, active1); // character not handled above: delegate with the narrowed mask
+}
+private final int jjMoveStringLiteralDfa12_0(long old1, long active1) // step 12 of the generated string-literal matcher; only the second mask is tracked here
+{
+   if (((active1 &= old1)) == 0L) // narrow the mask to the bits the caller still had set
+      return jjStartNfa_0(10, 0L, old1); // no candidate bit survived: delegate with the caller's mask
+   try { curChar = input_stream.readChar(); }
+   catch(java.io.IOException e) { // readChar() failed; NOTE(review): presumably end of input - confirm in SimpleCharStream
+      jjStopStringLiteralDfa_0(11, 0L, active1);
+      return 12;
+   }
+   switch(curChar) // labels are character codes
+   {
+      case 101: // 'e'
+         return jjMoveStringLiteralDfa13_0(active1, 0x4000L);
+      case 111: // 'o'
+         return jjMoveStringLiteralDfa13_0(active1, 0x10L);
+      case 116: // 't'
+         if ((active1 & 0x40L) != 0L)
+            return jjStopAtPos(12, 70); // 70 = TDFRMTXTRIGHT
+         break;
+      default :
+         break;
+   }
+   return jjStartNfa_0(11, 0L, active1); // character not handled above: delegate with the narrowed mask
+}
+private final int jjMoveStringLiteralDfa13_0(long old1, long active1) // step 13 of the generated string-literal matcher; only the second mask is tracked here
+{
+   if (((active1 &= old1)) == 0L) // narrow the mask to the bits the caller still had set
+      return jjStartNfa_0(11, 0L, old1); // no candidate bit survived: delegate with the caller's mask
+   try { curChar = input_stream.readChar(); }
+   catch(java.io.IOException e) { // readChar() failed; NOTE(review): presumably end of input - confirm in SimpleCharStream
+      jjStopStringLiteralDfa_0(12, 0L, active1);
+      return 13;
+   }
+   switch(curChar) // labels are character codes
+   {
+      case 109: // 'm'
+         if ((active1 & 0x10L) != 0L)
+            return jjStartNfaWithStates_0(13, 68, 0); // 68 = TDFRMTXTBOTTOM
+         break;
+      case 110: // 'n'
+         return jjMoveStringLiteralDfa14_0(active1, 0x4000L);
+      default :
+         break;
+   }
+   return jjStartNfa_0(12, 0L, active1); // character not handled above: delegate with the narrowed mask
+}
+private final int jjMoveStringLiteralDfa14_0(long old1, long active1) // step 14, the last of the jjMoveStringLiteralDfa*_0 steps in this chain
+{
+   if (((active1 &= old1)) == 0L) // narrow the mask to the bits the caller still had set
+      return jjStartNfa_0(12, 0L, old1); // no candidate bit survived: delegate with the caller's mask
+   try { curChar = input_stream.readChar(); }
+   catch(java.io.IOException e) { // readChar() failed; NOTE(review): presumably end of input - confirm in SimpleCharStream
+      jjStopStringLiteralDfa_0(13, 0L, active1);
+      return 14;
+   }
+   switch(curChar) // labels are character codes
+   {
+      case 78: // 'N'
+         if ((active1 & 0x4000L) != 0L)
+            return jjStartNfaWithStates_0(14, 78, 0); // 78 = SECTSPECIFYGENN
+         break;
+      default :
+         break;
+   }
+   return jjStartNfa_0(13, 0L, active1); // character not handled above: delegate with the narrowed mask
+}
+private final int jjMoveNfa_0(int startState, int curPos) // generated NFA loop for the _0 family; returns the position reached when it stops
+{
+   int[] nextStates; // declared by the generator but never used in this method
+   int startsAt = 0;
+   jjnewStateCnt = 3;
+   int i = 1;
+   jjstateSet[0] = startState; // the work list starts with the single requested state
+   int j, kind = 0x7fffffff; // 0x7fffffff is the "nothing matched in this round" sentinel
+   for (;;)
+   {
+      if (++jjround == 0x7fffffff) // round counter about to reach its limit: reset the round bookkeeping
+         ReInitRounds();
+      if (curChar < 64) // characters 0..63 are tested against a 64-bit mask
+      {
+         long l = 1L << curChar;
+         MatchLoop: do
+         {
+            switch(jjstateSet[--i])
+            {
+               case 1:
+                  if ((0x3ff000000000000L & l) != 0L) // bits 48..57 are set, i.e. '0'..'9'
+                  {
+                     if (kind > 82)
+                        kind = 82; // 82 = CW_VAL
+                     jjCheckNAdd(2);
+                  }
+                  else if (curChar == 45) // '-'
+                     jjCheckNAdd(2);
+                  break;
+               case 2:
+                  if ((0x3ff000000000000L & l) == 0L) // not '0'..'9'
+                     break;
+                  kind = 82; // 82 = CW_VAL
+                  jjCheckNAdd(2);
+                  break;
+               default : break;
+            }
+         } while(i != startsAt);
+      }
+      else if (curChar < 128) // characters 64..127 use the low six bits as the mask index
+      {
+         long l = 1L << (curChar & 077);
+         MatchLoop: do
+         {
+            switch(jjstateSet[--i])
+            {
+               case 1:
+               case 0:
+                  if ((0x7fffffe00084004L & l) == 0L) // mask bits correspond to 'a'..'z' plus 'B', 'N' and 'S'
+                     break;
+                  kind = 80; // 80 = CONTROL_WORD
+                  jjCheckNAdd(0);
+                  break;
+               default : break;
+            }
+         } while(i != startsAt);
+      }
+      else // characters >= 128: the locals are computed but no state acts on them in this method
+      {
+         int hiByte = (int)(curChar >> 8);
+         int i1 = hiByte >> 6;
+         long l1 = 1L << (hiByte & 077);
+         int i2 = (curChar & 0xff) >> 6;
+         long l2 = 1L << (curChar & 077);
+         MatchLoop: do
+         {
+            switch(jjstateSet[--i])
+            {
+               default : break;
+            }
+         } while(i != startsAt);
+      }
+      if (kind != 0x7fffffff) // something matched this round: record it and reset the sentinel
+      {
+         jjmatchedKind = kind;
+         jjmatchedPos = curPos;
+         kind = 0x7fffffff;
+      }
+      ++curPos;
+      if ((i = jjnewStateCnt) == (startsAt = 3 - (jjnewStateCnt = startsAt))) // exchanges jjnewStateCnt/startsAt; equality means no new states were queued
+         return curPos;
+      try { curChar = input_stream.readChar(); }
+      catch(java.io.IOException e) { return curPos; } // readChar() failed: stop at the current position
+   }
+}
+static final int[] jjnextStates = { // empty table emitted by the generator
+};
+private static final boolean jjCanMove_0(int hiByte, int i1, int i2, long l1, long l2) // bit-vector membership test keyed on the character's high byte
+{
+   switch(hiByte)
+   {
+      case 0: // high byte 0: test bit l2 in word i2 of jjbitVec2
+         return ((jjbitVec2[i2] & l2) != 0L);
+      default : // any other high byte: test bit l1 in word i1 of jjbitVec0
+         if ((jjbitVec0[i1] & l1) != 0L)
+            return true;
+         return false;
+   }
+}
+public static final String[] jjstrLiteralImages = { // indexed by token kind; null entries make jjFillToken take the image from the input stream instead
+null, null, null, null, null, null, null, null, "\134\176", "\134\55", 
+"\134\137", "\134\12", "\134\15", "\134\52", "\134\174", "\134\72", "\134\173", 
+"\134\175", "\134\134", null, null, null, null, null, null, null, null, null, null, null, // \134 is the octal escape for a backslash; kinds 8..18 are the only fixed images
+null, null, null, null, null, null, null, null, null, null, null, null, null, null, 
+null, null, null, null, null, null, null, null, null, null, null, null, null, null, 
+null, null, null, null, null, null, null, null, null, null, null, null, null, null, 
+null, null, null, null, null, null, null, null, null, null, null, null, };
+public static final String[] lexStateNames = { // names of the lexical states, in index order 0, 1, 2
+   "CONTROL", 
+   "HEX", 
+   "DEFAULT", 
+};
+public static final int[] jjnewLexState = { // indexed by token kind: lexical state to switch to after a match; getNextToken treats -1 as "leave unchanged"
+   -1, 0, 1, -1, -1, -1, 2, 2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 2, 2, 2, 
+   2, 2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 
+   -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 
+   -1, -1, -1, -1, -1, -1, -1, -1, 2, 
+};
+static final long[] jjtoToken = { // bit set over token kinds (word = kind >> 6, bit = kind & 077): kinds getNextToken returns as tokens
+   0xfffffffff85fffc1L, 0x57fffL, 
+};
+static final long[] jjtoSkip = { // same layout: kinds getNextToken discards before starting over (bits 3-5, 23-26 and kind 83)
+   0x7800038L, 0x80000L, 
+};
+static final long[] jjtoMore = { // same layout: kinds 1 and 2, which are neither returned nor skipped; getNextToken keeps reading after them
+   0x6L, 0x0L, 
+};
+protected SimpleCharStream input_stream; // character source; set by the constructor and by ReInit
+private final int[] jjrounds = new int[3]; // per-state round markers, reset by ReInitRounds
+private final int[] jjstateSet = new int[6]; // work list of NFA states used by jjMoveNfa_0
+StringBuffer image; // reset to null for every token in getNextToken; filled lazily by TokenLexicalActions
+int jjimageLen; // reset to 0 per token; grows by the matched length when a match is neither returned nor skipped
+int lengthOfMatch; // not referenced in the visible part of this class
+protected char curChar; // the character currently being examined by the matcher
+public RTFParserTokenManager(SimpleCharStream stream){ // creates a token manager that reads from the given stream
+   if (SimpleCharStream.staticFlag) // refuse to pair this non-static lexer with a static character stream
+      throw new Error("ERROR: Cannot use a static CharStream class with a non-static lexical analyzer.");
+   input_stream = stream;
+}
+public RTFParserTokenManager(SimpleCharStream stream, int lexState){ // as the one-argument constructor, then starts in the given lexical state
+   this(stream);
+   SwitchTo(lexState); // throws TokenMgrError when lexState is outside 0..2
+}
+public void ReInit(SimpleCharStream stream) // re-targets this token manager at a new stream and resets its matching state
+{
+   jjmatchedPos = jjnewStateCnt = 0;
+   curLexState = defaultLexState; // back to the default lexical state
+   input_stream = stream;
+   ReInitRounds();
+}
+private final void ReInitRounds() // resets the round counter and every per-state round marker
+{
+   int i;
+   jjround = 0x80000001; // one above the marker value written below
+   for (i = 3; i-- > 0;) // visits indices 2, 1, 0
+      jjrounds[i] = 0x80000000; // as an int literal this is Integer.MIN_VALUE
+}
+public void ReInit(SimpleCharStream stream, int lexState) // ReInit(stream) followed by a switch to the given lexical state
+{
+   ReInit(stream);
+   SwitchTo(lexState); // throws TokenMgrError when lexState is outside 0..2
+}
+public void SwitchTo(int lexState) // selects the lexical state used for the next match
+{
+   if (lexState >= 3 || lexState < 0) // only states 0, 1 and 2 exist (see lexStateNames)
+      throw new TokenMgrError("Error: Ignoring invalid lexical state : " + lexState + ". State unchanged.", TokenMgrError.INVALID_LEXICAL_STATE);
+   else
+      curLexState = lexState;
+}
+
+protected Token jjFillToken() // builds a Token for the kind currently stored in jjmatchedKind
+{
+   Token t = Token.newToken(jjmatchedKind);
+   t.kind = jjmatchedKind;
+   String im = jjstrLiteralImages[jjmatchedKind]; // fixed image for this kind, or null
+   t.image = (im == null) ? input_stream.GetImage() : im; // no fixed image: use the text reported by the stream
+   t.beginLine = input_stream.getBeginLine(); // position information is copied from the stream
+   t.beginColumn = input_stream.getBeginColumn();
+   t.endLine = input_stream.getEndLine();
+   t.endColumn = input_stream.getEndColumn();
+   return t;
+}
+
+int curLexState = 2; // current lexical state; 2 is "DEFAULT" in lexStateNames
+int defaultLexState = 2; // state restored by ReInit
+int jjnewStateCnt; // state-set bookkeeping used by jjMoveNfa_0
+int jjround; // round counter, reset by ReInitRounds
+int jjmatchedPos; // position of the best match found so far
+int jjmatchedKind; // kind of the best match; 0x7fffffff means "none"
+
+public Token getNextToken() // returns the next token; throws TokenMgrError when nothing matches
+{
+  int kind;
+  Token specialToken = null;
+  Token matchedToken;
+  int curPos = 0;
+
+  EOFLoop :
+  for (;;) // one iteration per token attempt; skipped matches restart here
+  {   
+   try   
+   {     
+      curChar = input_stream.BeginToken();
+   }     
+   catch(java.io.IOException e) // BeginToken() failed; NOTE(review): presumably end of input - confirm in SimpleCharStream
+   {        
+      jjmatchedKind = 0; // 0 = EOF in RTFParserConstants
+      matchedToken = jjFillToken();
+      return matchedToken;
+   }
+   image = null;
+   jjimageLen = 0;
+
+   for (;;) // repeats while matches are neither returned nor skipped
+   {
+     switch(curLexState) // run the matcher of the current lexical state
+     {
+       case 0:
+         jjmatchedKind = 0x7fffffff; // sentinel: nothing matched yet
+         jjmatchedPos = 0;
+         curPos = jjMoveStringLiteralDfa0_0();
+         if (jjmatchedPos == 0 && jjmatchedKind > 83) // nothing better at position 0: fall back to kind 83
+         {
+            jjmatchedKind = 83;
+         }
+         break;
+       case 1:
+         jjmatchedKind = 0x7fffffff;
+         jjmatchedPos = 0;
+         curPos = jjMoveStringLiteralDfa0_1();
+         break;
+       case 2:
+         try { input_stream.backup(0);
+            while (curChar <= 13 && (0x2600L & (1L << curChar)) != 0L) // mask bits 9, 10, 13: tab, line feed, carriage return
+               curChar = input_stream.BeginToken();
+         }
+         catch (java.io.IOException e1) { continue EOFLoop; } // input ran out while skipping: restart the outer loop
+         jjmatchedKind = 0x7fffffff;
+         jjmatchedPos = 0;
+         curPos = jjMoveStringLiteralDfa0_2();
+         break;
+     }
+     if (jjmatchedKind != 0x7fffffff) // something matched
+     {
+        if (jjmatchedPos + 1 < curPos) // the matcher read past the match: give the extra characters back
+           input_stream.backup(curPos - jjmatchedPos - 1);
+        if ((jjtoToken[jjmatchedKind >> 6] & (1L << (jjmatchedKind & 077))) != 0L) // kind is in jjtoToken: return it
+        {
+           matchedToken = jjFillToken();
+           TokenLexicalActions(matchedToken);
+       if (jjnewLexState[jjmatchedKind] != -1) // -1 means the lexical state stays as it is
+         curLexState = jjnewLexState[jjmatchedKind];
+           return matchedToken;
+        }
+        else if ((jjtoSkip[jjmatchedKind >> 6] & (1L << (jjmatchedKind & 077))) != 0L) // kind is in jjtoSkip: drop it and start over
+        {
+         if (jjnewLexState[jjmatchedKind] != -1)
+           curLexState = jjnewLexState[jjmatchedKind];
+           continue EOFLoop;
+        }
+        jjimageLen += jjmatchedPos + 1; // neither returned nor skipped: account for the matched length and keep reading
+      if (jjnewLexState[jjmatchedKind] != -1)
+        curLexState = jjnewLexState[jjmatchedKind];
+        curPos = 0;
+        jjmatchedKind = 0x7fffffff;
+        try {
+           curChar = input_stream.readChar();
+           continue;
+        }
+        catch (java.io.IOException e1) { } // deliberately empty: control falls through to the error report below
+     }
+     int error_line = input_stream.getEndLine(); // from here on: build and throw the lexical error
+     int error_column = input_stream.getEndColumn();
+     String error_after = null;
+     boolean EOFSeen = false;
+     try { input_stream.readChar(); input_stream.backup(1); } // probe whether another character can be read
+     catch (java.io.IOException e1) {
+        EOFSeen = true;
+        error_after = curPos <= 1 ? "" : input_stream.GetImage();
+        if (curChar == '\n' || curChar == '\r') {
+           error_line++;
+           error_column = 0;
+        }
+        else
+           error_column++;
+     }
+     if (!EOFSeen) {
+        input_stream.backup(1);
+        error_after = curPos <= 1 ? "" : input_stream.GetImage();
+     }
+     throw new TokenMgrError(EOFSeen, curLexState, error_line, error_column, error_after, curChar, TokenMgrError.LEXICAL_ERROR);
+   }
+  }
+}
+
+void TokenLexicalActions(Token matchedToken) // per-kind action run by getNextToken: replaces the image of certain escape tokens
+{
+   switch(jjmatchedKind)
+   {
+      case 8 : // NON_BREAKING_SPACE
+        if (image == null) // only the allocation is conditional; despite the indentation, the append below always runs
+            image = new StringBuffer();
+            image.append(jjstrLiteralImages[8]);
+                                 matchedToken.image = "\u00a0"; // U+00A0
+         break;
+      case 9 : // OPTIONAL_HYPHEN
+        if (image == null)
+            image = new StringBuffer();
+            image.append(jjstrLiteralImages[9]);
+                                 matchedToken.image = "\u00ad"; // U+00AD
+         break;
+      case 10 : // NON_BREAKING_HYPHEN
+        if (image == null)
+            image = new StringBuffer();
+            image.append(jjstrLiteralImages[10]);
+                                 matchedToken.image = "\u2011"; // U+2011
+         break;
+      case 16 : // ESCAPED_LBRACE
+        if (image == null)
+            image = new StringBuffer();
+            image.append(jjstrLiteralImages[16]);
+                                matchedToken.image = "{";
+         break;
+      case 17 : // ESCAPED_RBRACE
+        if (image == null)
+            image = new StringBuffer();
+            image.append(jjstrLiteralImages[17]);
+                                matchedToken.image = "}";
+         break;
+      case 18 : // ESCAPED_BACKSLASH
+        if (image == null)
+            image = new StringBuffer();
+            image.append(jjstrLiteralImages[18]);
+                                matchedToken.image = "\\"; // a single backslash character
+         break;
+      default : 
+         break;
+   }
+}
+}
Index: src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParserDelegateImpl.java
===================================================================
--- src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParserDelegateImpl.java	(revision 747309)
+++ src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParserDelegateImpl.java	(working copy)
@@ -17,9 +17,6 @@
 
 package org.apache.nutch.parse.rtf;
 
-// RTF Parser imports
-import com.etranslate.tm.processing.rtf.RTFParserDelegate;
-
 // JDK imports
 import java.util.Arrays;
 import java.util.List;
Index: src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParserConstants.java
===================================================================
--- src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParserConstants.java	(revision 0)
+++ src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParserConstants.java	(revision 0)
@@ -0,0 +1,189 @@
+/* Generated By:JavaCC: Do not edit this line. RTFParserConstants.java */
+/*
+ *  Copyright 2005 Blandware (http://www.blandware.com)
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+package org.apache.nutch.parse.rtf;
+
+public interface RTFParserConstants { // token kinds, lexical-state ids and printable token names of the generated RTF parser
+
+  int EOF = 0; // token kinds follow; each value is also the index of its entry in tokenImage
+  int BACKSLASH = 1;
+  int HEX_ESCAPE = 2;
+  int LBRACE = 6; // kinds 3..5 have no named constant
+  int RBRACE = 7;
+  int NON_BREAKING_SPACE = 8;
+  int OPTIONAL_HYPHEN = 9;
+  int NON_BREAKING_HYPHEN = 10;
+  int ESCAPED_NEWLINE = 11;
+  int ESCAPED_CARRIAGE_RETURN = 12;
+  int IGNORABLE_DESTINATION = 13;
+  int FORMULA_CHARACTER = 14;
+  int INDEX_SUBENTRY = 15;
+  int ESCAPED_LBRACE = 16;
+  int ESCAPED_RBRACE = 17;
+  int ESCAPED_BACKSLASH = 18;
+  int CONTROL_SYM = 19;
+  int TEXT = 20;
+  int HEX_DIGIT = 21;
+  int HEX_CHAR = 22;
+  int U = 27; // kinds 23..26 have no named constant; 27..78 are named after their control-word text
+  int UC = 28;
+  int F = 29;
+  int CS = 30;
+  int FCHARSET = 31;
+  int PLAIN = 32;
+  int PC = 33;
+  int PCA = 34;
+  int MAC = 35;
+  int RTF = 36;
+  int ANSI = 37;
+  int ANSICPG = 38;
+  int DEFF = 39;
+  int INFO = 40;
+  int REVTBL = 41;
+  int PNTEXT = 42;
+  int FONTTBL = 43;
+  int COLORTBL = 44;
+  int PNSECLVL = 45;
+  int LISTTABLE = 46;
+  int STYLESHEET = 47;
+  int TAB = 48;
+  int ZWJ = 49;
+  int ZWNJ = 50;
+  int PAR = 51;
+  int LINE = 52;
+  int EMDASH = 53;
+  int ENDASH = 54;
+  int EMSPACE = 55;
+  int ENSPACE = 56;
+  int BULLET = 57;
+  int LQUOTE = 58;
+  int RQUOTE = 59;
+  int LTRMARK = 60;
+  int RTLMARK = 61;
+  int LDBLQUOTE = 62;
+  int RDBLQUOTE = 63;
+  int CLFITTEXT = 64;
+  int CLFTSWIDTH = 65;
+  int CLNOWRAP = 66;
+  int CLWWIDTH = 67;
+  int TDFRMTXTBOTTOM = 68;
+  int TDFRMTXTLEFT = 69;
+  int TDFRMTXTRIGHT = 70;
+  int TDFRMTXTTOP = 71;
+  int TRFTSWIDTHA = 72;
+  int TRFTSWIDTHB = 73;
+  int TRFTSWIDTH = 74;
+  int TRWWIDTHA = 75;
+  int TRWWIDTHB = 76;
+  int TRWWIDTH = 77;
+  int SECTSPECIFYGENN = 78;
+  int LC_LETTER = 79;
+  int CONTROL_WORD = 80;
+  int DIGIT = 81;
+  int CW_VAL = 82;
+
+  int CONTROL = 0; // lexical-state ids
+  int HEX = 1;
+  int DEFAULT = 2;
+
+  String[] tokenImage = { // printable name of every token kind, indexed by kind
+    "<EOF>", // index 0
+    "\"\\\\\"",
+    "\"\\\\\\\'\"",
+    "\"\\n\"", // indices 3..5 are the kinds without a named constant
+    "\"\\r\"",
+    "\"\\t\"",
+    "\"{\"",
+    "\"}\"",
+    "\"\\\\~\"",
+    "\"\\\\-\"",
+    "\"\\\\_\"",
+    "\"\\\\\\n\"",
+    "\"\\\\\\r\"",
+    "\"\\\\*\"",
+    "\"\\\\|\"",
+    "\"\\\\:\"",
+    "\"\\\\{\"",
+    "\"\\\\}\"",
+    "\"\\\\\\\\\"",
+    "<CONTROL_SYM>",
+    "<TEXT>",
+    "<HEX_DIGIT>",
+    "<HEX_CHAR>",
+    "\" \"", // indices 23..26 are the kinds without a named constant
+    "\"\\n\"",
+    "\"\\r\"",
+    "\"\\t\"",
+    "\"u\"", // index 27 = U
+    "\"uc\"",
+    "\"f\"",
+    "\"cs\"",
+    "\"fcharset\"",
+    "\"plain\"",
+    "\"pc\"",
+    "\"pca\"",
+    "\"mac\"",
+    "\"rtf\"",
+    "\"ansi\"",
+    "\"ansicpg\"",
+    "\"deff\"",
+    "\"info\"",
+    "\"revtbl\"",
+    "\"pntext\"",
+    "\"fonttbl\"",
+    "\"colortbl\"",
+    "\"pnseclvl\"",
+    "\"listtable\"",
+    "\"stylesheet\"",
+    "\"tab\"",
+    "\"zwj\"",
+    "\"zwnj\"",
+    "\"par\"",
+    "\"line\"",
+    "\"emdash\"",
+    "\"endash\"",
+    "\"emspace\"",
+    "\"enspace\"",
+    "\"bullet\"",
+    "\"lquote\"",
+    "\"rquote\"",
+    "\"ltrmark\"",
+    "\"rtlmark\"",
+    "\"ldblquote\"",
+    "\"rdblquote\"",
+    "\"clFitText\"",
+    "\"clftsWidth\"",
+    "\"clNoWrap\"",
+    "\"clwWidth\"",
+    "\"tdfrmtxtBottom\"",
+    "\"tdfrmtxtLeft\"",
+    "\"tdfrmtxtRight\"",
+    "\"tdfrmtxtTop\"",
+    "\"trftsWidthA\"",
+    "\"trftsWidthB\"",
+    "\"trftsWidth\"",
+    "\"trwWidthA\"",
+    "\"trwWidthB\"",
+    "\"trwWidth\"",
+    "\"sectspecifygenN\"",
+    "<LC_LETTER>",
+    "<CONTROL_WORD>",
+    "<DIGIT>",
+    "<CW_VAL>",
+    "<token of kind 83>", // index 83: one past the last named kind (CW_VAL = 82)
+  };
+
+}
Index: src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParser.java
===================================================================
--- src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParser.java	(revision 0)
+++ src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParser.java	(revision 0)
@@ -0,0 +1,1468 @@
+/* Generated By:JavaCC: Do not edit this line. RTFParser.java */
+/*
+ *  Copyright 2005 Blandware (http://www.blandware.com)
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+package org.apache.nutch.parse.rtf;
+
+import java.io.*;
+import java.util.*;
+
+/**
+ * <p>RTFParser</p>
+ * <p>Some methods of this are generated by javaCC</p>
+ * @author &lt;eric@etranslate.com&gt;
+ * @author Roman Puchkovskiy <a href="mailto:roman.puchkovskiy@blandware.com">
+ * &lt;roman.puchkovskiy@blandware.com&gt;</a>
+ * @version $Revision: 1.1 $ $Date: 2005/03/26 14:26:06 $
+ */
+public class RTFParser implements RTFParserDelegate, RTFParserConstants {
+
+  /* maps windows character sets (fcharset values 0..255) to java encoding names */
+  /* note: sparse array; values without a known mapping stay null */
+  private static final String[] CHARSET_ENCODING_TABLE = new String[256];
+  static {
+        CHARSET_ENCODING_TABLE[0] = "Cp1252";     // ANSI
+        CHARSET_ENCODING_TABLE[1] = "Cp1252";     // Default
+        CHARSET_ENCODING_TABLE[2] = "Cp1252";     // Symbol
+        CHARSET_ENCODING_TABLE[3] = null;         // Invalid
+        CHARSET_ENCODING_TABLE[77] = "MacRoman";  // Mac
+        CHARSET_ENCODING_TABLE[128] = "MS932";    // Shift JIS
+        CHARSET_ENCODING_TABLE[129] = "MS949";    // Hangul
+        CHARSET_ENCODING_TABLE[130] = "Johab";    // Johab
+        CHARSET_ENCODING_TABLE[134] = "MS936";    // GB2312
+        CHARSET_ENCODING_TABLE[136] = "MS950";    // Big5
+        CHARSET_ENCODING_TABLE[161] = "Cp1253";   // Greek
+        CHARSET_ENCODING_TABLE[162] = "Cp1254";   // Turkish
+        CHARSET_ENCODING_TABLE[163] = "Cp1258";   // Vietnamese
+        CHARSET_ENCODING_TABLE[177] = "Cp1255";   // Hebrew
+        CHARSET_ENCODING_TABLE[178] = "Cp1256";   // Arabic
+        CHARSET_ENCODING_TABLE[179] = "Cp1256";   // Arabic Traditional
+        CHARSET_ENCODING_TABLE[180] = "Cp1256";   // Arabic User
+        CHARSET_ENCODING_TABLE[181] = "Cp1255";   // Hebrew User
+        CHARSET_ENCODING_TABLE[186] = "Cp1257";   // Baltic
+        CHARSET_ENCODING_TABLE[204] = "Cp1251";   // Russian (Windows Cyrillic, not DOS Cp866)
+        CHARSET_ENCODING_TABLE[222] = "MS874";    // Thai
+        CHARSET_ENCODING_TABLE[238] = "Cp1250";   // East European
+        CHARSET_ENCODING_TABLE[254] = "Cp437";    // PC 437
+  }
+
+  /*
+   * These next two tables map windows codepages to java encoding names.
+   * The codepage ints are too large for a sparse array, so two parallel arrays
+   * are binary-searched for a common offset; RTF_CODEPAGE must stay sorted.
+   */
+
+  private static final int[] RTF_CODEPAGE = {
+        437, // United States IBM 
+
+        /*  Not supported by JDK 1.3.1
+        708, // Arabic (ASMO 708) 
+        709, // Arabic (ASMO 449+, BCON V4) 
+        710, // Arabic (transparent Arabic) 
+        711, // Arabic (Nafitha Enhanced) 
+        720, // Arabic (transparent ASMO) 
+        */
+
+        819, // Windows 3.1 (United States and Western Europe) 
+        850, // IBM multilingual 
+        852, // Eastern European 
+        860, // Portuguese 
+        862, // Hebrew 
+        863, // French Canadian 
+        864, // Arabic 
+        865, // Norwegian 
+        866, // Soviet Union 
+        874, // Thai 
+        932, // Japanese 
+        936, // Simplified Chinese 
+        949, // Korean 
+        950, // Traditional Chinese 
+        1250, // Windows 3.1 (Eastern European) 
+        1251, // Windows 3.1 (Cyrillic) 
+        1252, // Western European 
+        1253, // Greek 
+        1254, // Turkish 
+        1255, // Hebrew 
+        1256, // Arabic 
+        1257, // Baltic 
+        1258, // Vietnamese 
+        1361  // Johab
+  };
+
+  private static final String[] JAVA_ENCODINGS = { // entry i names the encoding for RTF_CODEPAGE[i]
+        "Cp437", // United States IBM 
+        /*  Not supported by JDK 1.3.1
+        "Cp708", // Arabic (ASMO 708) 
+        "Cp709", // Arabic (ASMO 449+, BCON V4) 
+        "Cp710", // Arabic (transparent Arabic) 
+        "Cp711", // Arabic (Nafitha Enhanced) 
+        "Cp720", // Arabic (transparent ASMO) 
+        */
+        "Cp819", // Windows 3.1 (United States and Western Europe) 
+        "Cp850", // IBM multilingual 
+        "Cp852", // Eastern European 
+        "Cp860", // Portuguese 
+        "Cp862", // Hebrew 
+        "Cp863", // French Canadian 
+        "Cp864", // Arabic 
+        "Cp865", // Norwegian 
+        "Cp866", // Soviet Union 
+        "MS874", // Thai 
+        "MS932", // Japanese 
+        "MS936", // Simplified Chinese 
+        "MS949", // Korean 
+        "MS950", // Traditional Chinese 
+        "Cp1250", // Windows 3.1 (Eastern European) 
+        "Cp1251", // Windows 3.1 (Cyrillic) 
+        "Cp1252", // Western European 
+        "Cp1253", // Greek 
+        "Cp1254", // Turkish 
+        "Cp1255", // Hebrew 
+        "Cp1256", // Arabic 
+        "Cp1257", // Baltic 
+        "Cp1258", // Vietnamese 
+        "Johab"  // Johab
+  };
+
+  /**
+   * Resolves an RTF codepage number to a java encoding name: the codepage
+   * is located in RTF_CODEPAGE by binary search and the entry at the same
+   * index of JAVA_ENCODINGS is returned, or null when it is not listed.
+   */
+  private static final String getJavaEncoding(int rtfCodepage) {
+    final int slot = Arrays.binarySearch(RTF_CODEPAGE, rtfCodepage);
+    return (slot >= 0) ? JAVA_ENCODINGS[slot] : null;
+  }
+
+  /* support for skipping bytes after a unicode character.
+   * TODO: handle \bin
+   */
+  // the default number of bytes to skip after a unicode character
+  private static final Integer DEFAULT_SKIP_STATE = new Integer(1);
+  // the current number of bytes to skip after a unicode character
+  private Integer _currentSkipState = DEFAULT_SKIP_STATE;
+  // skip states saved by lbrace() and restored by rbrace(), one per open group
+  private final Stack _ucSkipStates = new Stack();
+
+  // the default encoding for all RTF documents
+  private static final String DEFAULT_ENCODING = "Cp1252";
+  // the document encoding for this RTF document
+  private String _documentEncoding = DEFAULT_ENCODING;
+
+  /* support for parsing the \fonttbl to discover font codes and
+   * their assigned encodings
+   */
+  // this holds the font table key (\fN) while we're waiting for the
+  // charset (\fcharsetN) declaration in the font table.
+  private int _currentFontValue = 0;
+  // this maps font codes (\fN, as Integer) to the encoding name recorded
+  // when the matching \fcharsetN was seen in the fonttbl
+  private final Map _fontEncodingMap = new HashMap();
+
+  /* support for encoding changes via references to the font table */
+  // the current text encoding
+  private String _currentEncoding = DEFAULT_ENCODING;
+  // text encodings saved by lbrace() and restored by rbrace()
+  private final Stack _fontEncodingStack = new Stack();
+
+  private int _currentStyleValue = 0; // style number last seen by cs() inside the stylesheet
+  private final Map _styleMap = new HashMap(); // style number (Integer) -> text found for it in the stylesheet
+  private final Stack _styleStack = new Stack(); // styles saved by lbrace() and restored by rbrace()
+  private String _currentStyle = NO_STYLE; // style name passed to the delegate with each text run
+
+  private int _where = IN_DOCUMENT; // current location, one of the IN_* constants
+
+  private int _braceDepth = 0; // number of groups currently open
+  private String _newline; // appended for paragraph breaks; null until setNewLine() is called
+
+  // The delegate to which the parser forwards productions.
+  // Unless setDelegate is called, this will be the parser
+  // itself, which supplies a default implementation (see below).
+  // this enables us to avoid doing null checks in the delegate
+  // calls.
+
+  private RTFParserDelegate _delegate = this;
+
+  public static void main(String args[]) throws ParseException {
+    RTFParser.createParser(new InputStreamReader(System.in, // Latin-1 maps each byte to one char,
+        java.nio.charset.Charset.forName("ISO-8859-1"))).parse(); // which stringToBytes() relies on
+  }
+
+  public void reinitialize(Reader reader) { // NOTE(review): fields such as _braceDepth are not touched here
+    this.ReInit(reader);
+  }
+
+  public static RTFParser createParser(Reader reader) { // factory: wraps the RTFParser(Reader) constructor
+    return new RTFParser(reader);
+  }
+
+  public void parse() throws ParseException { // runs the document() production over the whole input
+    try {
+      document();
+    } catch (UnsupportedEncodingException uee) {
+      String msg = "Could not decode bytes in encoding: " + uee.getMessage();
+      throw (ParseException) new ParseException(msg).initCause(uee); // keep the original cause
+    }
+  }
+
+  public void setDelegate(RTFParserDelegate delegate) { // replaces the receiver of parser callbacks
+    this._delegate = delegate;
+  }
+
+  public String getNewLine() { // never null: an unset value would otherwise be appended as "null"
+    return (null == _newline) ? System.getProperty("line.separator") : _newline;
+  }
+
+  public void setNewLine(String newline) { // string appended for each paragraph break
+    this._newline = newline;
+  }
+
+  /**
+   * Looks up a font number whose fonttbl entry was assigned the given
+   * encoding.  The mapping is filled in while the RTF fonttbl is being
+   * parsed, so lookups made before that point find nothing.  When several
+   * fonts share the encoding, which of them is returned is unspecified.
+   *
+   * @param encoding the encoding name to search for
+   * @return a font control word value, or -1 when no font matches.
+   */
+  public int getFontForEncoding(String encoding) {
+    Iterator entries = _fontEncodingMap.entrySet().iterator();
+    while (entries.hasNext()) {
+      Map.Entry candidate = (Map.Entry) entries.next();
+      if (candidate.getValue().equals(encoding)) {
+        return ((Integer) candidate.getKey()).intValue();
+      }
+    }
+    return -1;
+  }
+
+  // built-in implementation of RTFParserDelegate interface, used when no
+  // delegate is set: every callback is a no-op except text(), which echoes.
+  public void text(String text, String style, int context) {
+      System.out.println(text);
+  }
+
+  public void controlSymbol(String controlSymbol, int context) {} // ignored by the built-in delegate
+
+  public void controlWord(String controlWord, int value, int context) {} // ignored by the built-in delegate
+
+  public void openGroup(int depth) {} // ignored by the built-in delegate
+
+  public void closeGroup(int depth) {} // ignored by the built-in delegate
+
+  public void styleList(List styles) {} // ignored by the built-in delegate
+
+  public void startDocument() {} // ignored by the built-in delegate
+
+  public void endDocument() {} // ignored by the built-in delegate
+
+  private void setCurrentEncoding(String encoding) { // a null encoding is rejected outright
+    if (encoding == null) {
+      throw new IllegalArgumentException("current encoding cannot be null");
+    }
+    this._currentEncoding = encoding;
+  }
+
+  private String getCurrentEncoding() {
+    // The per-group current encoding is only used while the parser is in
+    // the document body; in every other location the document-wide
+    // encoding is reported instead.
+    final boolean inBody = (_where == IN_DOCUMENT);
+    return inBody ? _currentEncoding : _documentEncoding;
+  }
+
+  private String getCurrentStyle() { // style name reported with each text run
+    return this._currentStyle;
+  }
+
+  private void setCurrentStyle(String style) { // stored as given, no null check
+    this._currentStyle = style;
+  }
+
+  private Integer getCurrentSkipState() { // count consumed by skip_after_unicode()
+    return this._currentSkipState;
+  }
+
+  private void setCurrentSkipState(Integer skipState) { // stored as given, no null check
+    this._currentSkipState = skipState;
+  }
+
+  private void setDocumentEncoding(String encoding) { // a null encoding is rejected outright
+    if (encoding == null) {
+      throw new IllegalArgumentException("document encoding cannot be null");
+    }
+    this._documentEncoding = encoding;
+  }
+
+  /**
+   * Narrows every char of str to its low-order byte, producing one byte
+   * per char; no charset decoding or encoding is attempted.
+   */
+  private byte[] stringToBytes(String str) {
+    final int length = str.length();
+    byte[] narrowed = new byte[length];
+    for (int pos = 0; pos < length; pos++) {
+      narrowed[pos] = (byte) str.charAt(pos);
+    }
+    return narrowed;
+  }
+
+// end of CONTROL lexical specification
+
+/**************************************/
+/* grammatical productions begin here */
+/**************************************/
+
+/**
+ *  Collects one run of text and hands it to the parser delegate as a
+ *  unicode string, together with the current style name and the location
+ *  in the document where it occurred.  Raw TEXT and HEX_CHAR bytes are
+ *  decoded with the current encoding and escapes are expanded here, so
+ *  the delegate doesn't need to concern itself with the various ways in
+ *  which RTF encodes non-ASCII strings.
+ */
+  final public void text() throws ParseException, UnsupportedEncodingException {
+  StringBuffer buf = new StringBuffer(); // decoded text accumulated for the delegate
+  StringBuffer cbuf = new StringBuffer(); // characters produced by the token being handled
+  ByteArrayOutputStream baos = new ByteArrayOutputStream(); // raw bytes awaiting decoding
+  byte b;
+  byte[] raw;
+    label_1:
+    while (true) {
+      switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
+      case NON_BREAKING_SPACE:
+      case OPTIONAL_HYPHEN:
+      case NON_BREAKING_HYPHEN:
+      case ESCAPED_NEWLINE:
+      case ESCAPED_CARRIAGE_RETURN:
+      case ESCAPED_LBRACE:
+      case ESCAPED_RBRACE:
+      case ESCAPED_BACKSLASH:
+      case U:
+      case TAB:
+      case ZWJ:
+      case ZWNJ:
+      case PAR:
+      case LINE:
+      case EMDASH:
+      case ENDASH:
+      case EMSPACE:
+      case ENSPACE:
+      case BULLET:
+      case LQUOTE:
+      case RQUOTE:
+      case LTRMARK:
+      case RTLMARK:
+      case LDBLQUOTE:
+      case RDBLQUOTE:
+        switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
+        case U:
+          u(cbuf);
+          raw = skip_after_unicode();
+        if (raw != null) { // part of a TEXT token survived the skip: decode and keep it
+          cbuf.append(new String(raw, getCurrentEncoding()));
+        }
+          break;
+        case ESCAPED_LBRACE:
+        case ESCAPED_RBRACE:
+        case ESCAPED_BACKSLASH:
+          escaped(cbuf);
+          break;
+        case ESCAPED_NEWLINE:
+        case ESCAPED_CARRIAGE_RETURN:
+        case TAB:
+        case ZWJ:
+        case ZWNJ:
+        case PAR:
+        case LINE:
+        case EMDASH:
+        case ENDASH:
+        case EMSPACE:
+        case ENSPACE:
+        case BULLET:
+        case LQUOTE:
+        case RQUOTE:
+        case LTRMARK:
+        case RTLMARK:
+        case LDBLQUOTE:
+        case RDBLQUOTE:
+          special_character(cbuf);
+          break;
+        case NON_BREAKING_SPACE:
+        case OPTIONAL_HYPHEN:
+        case NON_BREAKING_HYPHEN:
+          textual_control_symbol(cbuf);
+          break;
+        default:
+          jj_la1[0] = jj_gen;
+          jj_consume_token(-1);
+          throw new ParseException();
+        }
+      if (baos.size() > 0) { // decode pending raw bytes first so the output order is kept
+        buf.append(baos.toString(getCurrentEncoding()));
+        baos.reset();
+      }
+      buf.append(cbuf.toString());
+      cbuf.setLength(0);
+        break;
+      case HEX_CHAR:
+        b = hex();
+                       baos.write(b);
+        break;
+      case TEXT:
+        raw = raw_text();
+                       baos.write(raw,0,raw.length);
+        break;
+      default:
+        jj_la1[1] = jj_gen;
+        jj_consume_token(-1);
+        throw new ParseException();
+      }
+      switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { // keep looping while the next token is still text
+      case NON_BREAKING_SPACE:
+      case OPTIONAL_HYPHEN:
+      case NON_BREAKING_HYPHEN:
+      case ESCAPED_NEWLINE:
+      case ESCAPED_CARRIAGE_RETURN:
+      case ESCAPED_LBRACE:
+      case ESCAPED_RBRACE:
+      case ESCAPED_BACKSLASH:
+      case TEXT:
+      case HEX_CHAR:
+      case U:
+      case TAB:
+      case ZWJ:
+      case ZWNJ:
+      case PAR:
+      case LINE:
+      case EMDASH:
+      case ENDASH:
+      case EMSPACE:
+      case ENSPACE:
+      case BULLET:
+      case LQUOTE:
+      case RQUOTE:
+      case LTRMARK:
+      case RTLMARK:
+      case LDBLQUOTE:
+      case RDBLQUOTE:
+        ;
+        break;
+      default:
+        jj_la1[2] = jj_gen;
+        break label_1;
+      }
+    }
+    if (baos.size() > 0) { // decode whatever raw bytes are still pending
+      buf.append(baos.toString(getCurrentEncoding()));
+      baos.reset();
+    }
+    if (_where == IN_STYLESHEET) { // stylesheet text is recorded under the current style number
+      _styleMap.put(new Integer(_currentStyleValue), buf.toString());
+    }
+    _delegate.text(buf.toString(), getCurrentStyle(), _where);
+  }
+
+  final public byte[] raw_text() throws ParseException, UnsupportedEncodingException {
+    // The image of a TEXT token is returned as undecoded bytes, one per
+    // char; text() decodes them later with the encoding then in effect.
+    final Token textToken = jj_consume_token(TEXT);
+    return stringToBytes(textToken.image);
+  }
+
+  final public void escaped(StringBuffer buf) throws ParseException {
+    // Consumes one escaped backslash or brace token and appends the first
+    // character of its image to buf.  Any other upcoming token is
+    // reported as a parse error.
+    final int upcoming = (jj_ntk == -1) ? jj_ntk() : jj_ntk;
+    final Token matched;
+    if (upcoming == ESCAPED_BACKSLASH) {
+      matched = jj_consume_token(ESCAPED_BACKSLASH);
+    } else if (upcoming == ESCAPED_LBRACE) {
+      matched = jj_consume_token(ESCAPED_LBRACE);
+    } else if (upcoming == ESCAPED_RBRACE) {
+      matched = jj_consume_token(ESCAPED_RBRACE);
+    } else {
+      jj_la1[3] = jj_gen;
+      jj_consume_token(-1);
+      throw new ParseException();
+    }
+    buf.append(matched.image.charAt(0));
+  }
+
+  final public void textual_control_symbol(StringBuffer buf) throws ParseException {
+    // Consumes a non-breaking space, optional hyphen or non-breaking
+    // hyphen token and appends its whole image to buf.  Any other
+    // upcoming token is reported as a parse error.
+    final int upcoming = (jj_ntk == -1) ? jj_ntk() : jj_ntk;
+    final Token matched;
+    if (upcoming == NON_BREAKING_SPACE) {
+      matched = jj_consume_token(NON_BREAKING_SPACE);
+    } else if (upcoming == OPTIONAL_HYPHEN) {
+      matched = jj_consume_token(OPTIONAL_HYPHEN);
+    } else if (upcoming == NON_BREAKING_HYPHEN) {
+      matched = jj_consume_token(NON_BREAKING_HYPHEN);
+    } else {
+      jj_la1[4] = jj_gen;
+      jj_consume_token(-1);
+      throw new ParseException();
+    }
+    buf.append(matched.image);
+  }
+
+  final public byte hex() throws ParseException {
+    // The first two characters of a HEX_CHAR image are dropped and the
+    // remainder is parsed as a base-16 number, then narrowed to a byte.
+    final Token escape = jj_consume_token(HEX_CHAR);
+    final String digits = escape.image.substring(2);
+    return (byte) Integer.parseInt(digits, 16);
+  }
+
+  final public void special_character(StringBuffer buf) throws ParseException { // appends the text for one special token
+    switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
+    case LINE:
+      jj_consume_token(LINE);
+                       buf.append('\r'); // a line token yields a bare carriage return
+      break;
+    case TAB:
+      jj_consume_token(TAB);
+                       buf.append('\t');
+      break;
+    case EMDASH:
+      jj_consume_token(EMDASH);
+                       buf.append('\u2014');
+      break;
+    case ENDASH:
+      jj_consume_token(ENDASH);
+                       buf.append('\u2013');
+      break;
+    case EMSPACE:
+      jj_consume_token(EMSPACE);
+                       buf.append('\u2003');
+      break;
+    case ENSPACE:
+      jj_consume_token(ENSPACE);
+                       buf.append(' '); // an ordinary space, unlike the dedicated char used for EMSPACE
+      break;
+    case BULLET:
+      jj_consume_token(BULLET);
+                       buf.append('\u2022');
+      break;
+    case LQUOTE:
+      jj_consume_token(LQUOTE);
+                       buf.append('\u2018');
+      break;
+    case RQUOTE:
+      jj_consume_token(RQUOTE);
+                       buf.append('\u2019');
+      break;
+    case LDBLQUOTE:
+      jj_consume_token(LDBLQUOTE);
+                       buf.append('\u201c');
+      break;
+    case RDBLQUOTE:
+      jj_consume_token(RDBLQUOTE);
+                       buf.append('\u201d');
+      break;
+    case LTRMARK:
+      jj_consume_token(LTRMARK);
+                       buf.append('\u200e');
+      break;
+    case RTLMARK:
+      jj_consume_token(RTLMARK);
+                       buf.append('\u200f');
+      break;
+    case ZWJ:
+      jj_consume_token(ZWJ);
+                       buf.append('\u200d');
+      break;
+    case ZWNJ:
+      jj_consume_token(ZWNJ);
+                       buf.append('\u200c');
+      break;
+    case ESCAPED_NEWLINE:
+    case ESCAPED_CARRIAGE_RETURN:
+    case PAR:
+      switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { // all three are treated as a paragraph break
+      case PAR:
+        jj_consume_token(PAR);
+        break;
+      case ESCAPED_NEWLINE:
+        jj_consume_token(ESCAPED_NEWLINE);
+        break;
+      case ESCAPED_CARRIAGE_RETURN:
+        jj_consume_token(ESCAPED_CARRIAGE_RETURN);
+        break;
+      default:
+        jj_la1[5] = jj_gen;
+        jj_consume_token(-1);
+        throw new ParseException();
+      }
+            buf.append(getNewLine()); // paragraph breaks use the configured newline string
+      break;
+    default:
+      jj_la1[6] = jj_gen;
+      jj_consume_token(-1);
+      throw new ParseException();
+    }
+  }
+
+  final public void lbrace() throws ParseException {
+    jj_consume_token(LBRACE); // a new group opens: snapshot the state rbrace() restores
+    _styleStack.push(getCurrentStyle());
+    _ucSkipStates.push(getCurrentSkipState());
+    _fontEncodingStack.push(getCurrentEncoding());
+    _delegate.openGroup(_braceDepth += 1);
+  }
+
+  final public void rbrace() throws ParseException {
+    jj_consume_token(RBRACE); // the group closes: restore what lbrace() saved
+    setCurrentStyle((String) _styleStack.pop());
+    setCurrentEncoding((String) _fontEncodingStack.pop());
+    setCurrentSkipState((Integer) _ucSkipStates.pop());
+    _delegate.closeGroup(_braceDepth);
+    _braceDepth--;
+    final boolean leftTable = (_braceDepth == 1); // back at depth 1: a table just ended
+    if (leftTable && _where == IN_STYLESHEET) {
+      _delegate.styleList(new ArrayList(_styleMap.values()));
+    }
+    if (leftTable) { _where = IN_DOCUMENT; }
+  }
+
+  final public void table_declaration() throws ParseException { // consumes a table keyword and records the new location in _where
+    switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
+    case INFO:
+      jj_consume_token(INFO);
+                                 _where = IN_INFO;
+      break;
+    case FONTTBL:
+      jj_consume_token(FONTTBL);
+                                 _where = IN_FONTTBL;
+      break;
+    case COLORTBL:
+      jj_consume_token(COLORTBL);
+                                 _where = IN_COLORTBL;
+      break;
+    case STYLESHEET:
+      jj_consume_token(STYLESHEET);
+                                 _where = IN_STYLESHEET;
+      break;
+    case LISTTABLE:
+      jj_consume_token(LISTTABLE);
+                                 _where = IN_LISTTABLE;
+      break;
+    case REVTBL:
+      jj_consume_token(REVTBL);
+                                 _where = IN_REVTBL;
+      break;
+    case PNTEXT:
+      jj_consume_token(PNTEXT);
+      switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { // an optional numeric value may follow
+      case CW_VAL:
+        jj_consume_token(CW_VAL);
+        break;
+      default:
+        jj_la1[7] = jj_gen;
+        ;
+      }
+                                 _where = IN_PNTEXT;
+      break;
+    case PNSECLVL:
+      jj_consume_token(PNSECLVL);
+      switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { // an optional numeric value may follow
+      case CW_VAL:
+        jj_consume_token(CW_VAL);
+        break;
+      default:
+        jj_la1[8] = jj_gen;
+        ;
+      }
+                                 _where = IN_PNTEXT; // NOTE(review): same location as PNTEXT - confirm intended
+      break;
+    default:
+      jj_la1[9] = jj_gen;
+      jj_consume_token(-1);
+      throw new ParseException();
+    }
+  }
+
+  final public void control_symbol() throws ParseException {
+    // Four token kinds are handled alike: the token is consumed and its
+    // image is forwarded unchanged to the delegate together with the
+    // current location.  Any other upcoming token is reported as a
+    // parse error.
+    final int upcoming = (jj_ntk == -1) ? jj_ntk() : jj_ntk;
+    final Token symbol;
+    if (upcoming == CONTROL_SYM) {
+      symbol = jj_consume_token(CONTROL_SYM);
+    } else if (upcoming == IGNORABLE_DESTINATION) {
+      symbol = jj_consume_token(IGNORABLE_DESTINATION);
+    } else if (upcoming == FORMULA_CHARACTER) {
+      symbol = jj_consume_token(FORMULA_CHARACTER);
+    } else if (upcoming == INDEX_SUBENTRY) {
+      symbol = jj_consume_token(INDEX_SUBENTRY);
+    } else {
+      jj_la1[10] = jj_gen;
+      jj_consume_token(-1);
+      throw new ParseException();
+    }
+    _delegate.controlSymbol(symbol.image, _where);
+  }
+
+  final public Token mixed_case_control_word() throws ParseException { // consumes one mixed-case control word token and returns it
+  Token word = null;
+    switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
+    case CLFITTEXT:
+      word = jj_consume_token(CLFITTEXT);
+      break;
+    case CLFTSWIDTH:
+      word = jj_consume_token(CLFTSWIDTH);
+      break;
+    case CLNOWRAP:
+      word = jj_consume_token(CLNOWRAP);
+      break;
+    case CLWWIDTH:
+      word = jj_consume_token(CLWWIDTH);
+      break;
+    case TDFRMTXTBOTTOM:
+      word = jj_consume_token(TDFRMTXTBOTTOM);
+      break;
+    case TDFRMTXTLEFT:
+      word = jj_consume_token(TDFRMTXTLEFT);
+      break;
+    case TDFRMTXTRIGHT:
+      word = jj_consume_token(TDFRMTXTRIGHT);
+      break;
+    case TDFRMTXTTOP:
+      word = jj_consume_token(TDFRMTXTTOP);
+      break;
+    case TRFTSWIDTHA:
+      word = jj_consume_token(TRFTSWIDTHA);
+      break;
+    case TRFTSWIDTHB:
+      word = jj_consume_token(TRFTSWIDTHB);
+      break;
+    case TRFTSWIDTH:
+      word = jj_consume_token(TRFTSWIDTH);
+      break;
+    case TRWWIDTHA:
+      word = jj_consume_token(TRWWIDTHA);
+      break;
+    case TRWWIDTHB:
+      word = jj_consume_token(TRWWIDTHB);
+      break;
+    case TRWWIDTH:
+      word = jj_consume_token(TRWWIDTH);
+      break;
+    case SECTSPECIFYGENN:
+      word = jj_consume_token(SECTSPECIFYGENN);
+      break;
+    default: // none of the listed kinds: report a parse error
+      jj_la1[11] = jj_gen;
+      jj_consume_token(-1);
+      throw new ParseException();
+    }
+      {if (true) return word;}
+    throw new Error("Missing return statement in function");
+  }
+
+  final public void control_word() throws ParseException { // forwards one control word and its value to the delegate
+  Token word = null, val = null;
+    switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
+    case CONTROL_WORD:
+      word = jj_consume_token(CONTROL_WORD);
+      break;
+    case CLFITTEXT:
+    case CLFTSWIDTH:
+    case CLNOWRAP:
+    case CLWWIDTH:
+    case TDFRMTXTBOTTOM:
+    case TDFRMTXTLEFT:
+    case TDFRMTXTRIGHT:
+    case TDFRMTXTTOP:
+    case TRFTSWIDTHA:
+    case TRFTSWIDTHB:
+    case TRFTSWIDTH:
+    case TRWWIDTHA:
+    case TRWWIDTHB:
+    case TRWWIDTH:
+    case SECTSPECIFYGENN:
+      word = mixed_case_control_word();
+      break;
+    default:
+      jj_la1[12] = jj_gen;
+      jj_consume_token(-1);
+      throw new ParseException();
+    }
+    switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { // the numeric value is optional
+    case CW_VAL:
+      val = jj_consume_token(CW_VAL);
+      break;
+    default:
+      jj_la1[13] = jj_gen;
+      ;
+    }
+    int v = null == val ? 0 : Integer.parseInt(val.image); // a missing value is reported as 0
+    _delegate.controlWord(word.image, v, _where);
+  }
+
+  final public void u(StringBuffer buf) throws ParseException {
+    jj_consume_token(U);
+    final Token operand = jj_consume_token(CW_VAL);
+    final int codeUnit = Integer.parseInt(operand.image);
+    // RTF may write a unicode char value as a negative number.  The
+    // narrowing cast to char keeps only the low 16 bits of the int, so a
+    // negative value lands on exactly the same char that adding 65536
+    // before the cast would produce; no separate correction step is
+    // needed.
+    buf.append((char) codeUnit);
+  }
+
+  byte[] skip_after_unicode() throws ParseException, UnsupportedEncodingException {
+  // Drops the fallback characters trailing a unicode escape (HEX_CHAR = 1, TEXT = 1 per
+  // char).  Returns the unskipped tail of a TEXT token as raw bytes, or null if none.
+  for (int skip = getCurrentSkipState().intValue(); skip != 0; skip--) {
+    Token tok = getNextToken();
+    switch (tok.kind) {
+    case HEX_CHAR:
+      break; // one escaped byte dropped
+    case TEXT:
+      if (tok.image.length() > skip) {
+         byte[] tmp = stringToBytes(tok.image);
+         byte[] raw = new byte[ tmp.length - skip ];
+         System.arraycopy(tmp,skip,raw,0,raw.length);
+         return raw;
+      }
+      skip -= tok.image.length() - 1; // charge every char; the loop step takes the last one
+      break;
+    default:
+      throw new IllegalStateException("unexpected token while skipping");
+    }
+  }
+  return null;
+  }
+
+  final public void uc() throws ParseException {
+    jj_consume_token(UC);
+    Token val = jj_consume_token(CW_VAL);
+    int requested = null == val ? 0 : Integer.parseInt(val.image);
+    // clamp: a negative count would never let the skip loop count down to zero
+    setCurrentSkipState(new Integer(Math.max(0, requested)));
+  }
+
+  final public void fcharset() throws ParseException {
+    jj_consume_token(FCHARSET);
+    final Token operand = jj_consume_token(CW_VAL);
+    // The charset number is still parsed, so a malformed value fails here,
+    // but the result is unused: the CHARSET_ENCODING_TABLE lookup is
+    // disabled and the font is mapped to the document encoding instead.
+    if (operand != null) { Integer.parseInt(operand.image); }
+    if (IN_FONTTBL != _where) {
+      return; // this shouldn't happen -- forward onto delegate?
+    }
+    // Modified: always use _documentEncoding
+    _fontEncodingMap.put(new Integer(_currentFontValue), _documentEncoding);
+  }
+
+  final public void deff() throws ParseException {
+    // The DEFF keyword and its value are consumed so the grammar accepts
+    // them, but the value is discarded: nothing here stores or reads it.
+    jj_consume_token(DEFF);
+    jj_consume_token(CW_VAL);
+  }
+
+  final public void f() throws ParseException {
+    jj_consume_token(F);
+    final Token operand = jj_consume_token(CW_VAL);
+    final int fontNumber = (operand == null) ? 0 : Integer.parseInt(operand.image);
+    if (_where == IN_FONTTBL) {
+      _currentFontValue = fontNumber; // remember which font the table entry describes
+      return;
+    }
+    if (_where != IN_DOCUMENT) {
+      return; // consume this font event
+    }
+    final Object recorded = _fontEncodingMap.get(new Integer(fontNumber));
+    setCurrentEncoding(recorded == null ? DEFAULT_ENCODING : (String) recorded);
+  }
+
+  final public void cs() throws ParseException {
+    jj_consume_token(CS);
+    final Token operand = jj_consume_token(CW_VAL);
+    final int styleNumber = (operand == null) ? 0 : Integer.parseInt(operand.image);
+    if (_where == IN_STYLESHEET) {
+      _currentStyleValue = styleNumber; // text() files the style text under this number
+      return;
+    }
+    if (_where == IN_DOCUMENT) {
+      // the style becomes null when the number has no entry in the map
+      setCurrentStyle((String) _styleMap.get(new Integer(styleNumber)));
+    }
+  }
+
+  final public void plain() throws ParseException {
+    jj_consume_token(PLAIN); // the plain keyword drops the current style
+    this.setCurrentStyle(NO_STYLE);
+  }
+
+/* these productions identify the document encoding; note that they
+ * are almost always clobbered by an \ansicpg or by unicode characters */
+  final public void document_charset() throws ParseException {
+    // Exactly one of the four charset keywords must come next; each one
+    // selects the encoding that becomes the document encoding.
+    final int upcoming = (jj_ntk == -1) ? jj_ntk() : jj_ntk;
+    final String encoding;
+    if (upcoming == PC) {
+      jj_consume_token(PC);
+      encoding = getJavaEncoding(437);
+    } else if (upcoming == PCA) {
+      jj_consume_token(PCA);
+      encoding = getJavaEncoding(850);
+    } else if (upcoming == MAC) {
+      jj_consume_token(MAC);
+      encoding = "MacRoman";
+    } else if (upcoming == ANSI) {
+      jj_consume_token(ANSI);
+      encoding = getJavaEncoding(1252);
+    } else {
+      jj_la1[14] = jj_gen;
+      jj_consume_token(-1);
+      throw new ParseException();
+    }
+    setDocumentEncoding(encoding);
+  }
+
+/* specifies the ANSI codepage to use as the document's encoding. Subject
+ * to local overrides. */
+  final public void ansicpg() throws ParseException {
+    jj_consume_token(ANSICPG);
+    final Token operand = jj_consume_token(CW_VAL);
+    final int codepage = (operand == null) ? 0 : Integer.parseInt(operand.image);
+    // A codepage missing from the table resolves to null, which the
+    // setters below reject with an IllegalArgumentException.
+    final String encoding = getJavaEncoding(codepage);
+    setDocumentEncoding(encoding);
+    setCurrentEncoding(encoding); /* Modified: added this line */
+  }
+
+// TODO: consider collecting special characters in a buffer
+  final public void group() throws ParseException, UnsupportedEncodingException {
+    lbrace(); // consumes the opening brace and saves the per-group state
+    label_2:
+    while (true) {
+      switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { // dispatch on the next token: one group member per pass
+      case INFO:
+      case REVTBL:
+      case PNTEXT:
+      case FONTTBL:
+      case COLORTBL:
+      case PNSECLVL:
+      case LISTTABLE:
+      case STYLESHEET:
+        table_declaration();
+        break;
+      case UC:
+        uc();
+        break;
+      case F:
+        f();
+        break;
+      case FCHARSET:
+        fcharset();
+        break;
+      case CS:
+        cs();
+        break;
+      case PLAIN:
+        plain();
+        break;
+      case CLFITTEXT:
+      case CLFTSWIDTH:
+      case CLNOWRAP:
+      case CLWWIDTH:
+      case TDFRMTXTBOTTOM:
+      case TDFRMTXTLEFT:
+      case TDFRMTXTRIGHT:
+      case TDFRMTXTTOP:
+      case TRFTSWIDTHA:
+      case TRFTSWIDTHB:
+      case TRFTSWIDTH:
+      case TRWWIDTHA:
+      case TRWWIDTHB:
+      case TRWWIDTH:
+      case SECTSPECIFYGENN:
+      case CONTROL_WORD:
+        control_word();
+        break;
+      case IGNORABLE_DESTINATION:
+      case FORMULA_CHARACTER:
+      case INDEX_SUBENTRY:
+      case CONTROL_SYM:
+        control_symbol();
+        break;
+      case LBRACE:
+        group(); // nested group: recurse
+        break;
+      case NON_BREAKING_SPACE:
+      case OPTIONAL_HYPHEN:
+      case NON_BREAKING_HYPHEN:
+      case ESCAPED_NEWLINE:
+      case ESCAPED_CARRIAGE_RETURN:
+      case ESCAPED_LBRACE:
+      case ESCAPED_RBRACE:
+      case ESCAPED_BACKSLASH:
+      case TEXT:
+      case HEX_CHAR:
+      case U:
+      case TAB:
+      case ZWJ:
+      case ZWNJ:
+      case PAR:
+      case LINE:
+      case EMDASH:
+      case ENDASH:
+      case EMSPACE:
+      case ENSPACE:
+      case BULLET:
+      case LQUOTE:
+      case RQUOTE:
+      case LTRMARK:
+      case RTLMARK:
+      case LDBLQUOTE:
+      case RDBLQUOTE:
+        text();
+        break;
+      default: // a group needs at least one member: anything else is a parse error
+        jj_la1[15] = jj_gen;
+        jj_consume_token(-1);
+        throw new ParseException();
+      }
+      switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { // loop again only if the next token can start another member
+      case LBRACE:
+      case NON_BREAKING_SPACE:
+      case OPTIONAL_HYPHEN:
+      case NON_BREAKING_HYPHEN:
+      case ESCAPED_NEWLINE:
+      case ESCAPED_CARRIAGE_RETURN:
+      case IGNORABLE_DESTINATION:
+      case FORMULA_CHARACTER:
+      case INDEX_SUBENTRY:
+      case ESCAPED_LBRACE:
+      case ESCAPED_RBRACE:
+      case ESCAPED_BACKSLASH:
+      case CONTROL_SYM:
+      case TEXT:
+      case HEX_CHAR:
+      case U:
+      case UC:
+      case F:
+      case CS:
+      case FCHARSET:
+      case PLAIN:
+      case INFO:
+      case REVTBL:
+      case PNTEXT:
+      case FONTTBL:
+      case COLORTBL:
+      case PNSECLVL:
+      case LISTTABLE:
+      case STYLESHEET:
+      case TAB:
+      case ZWJ:
+      case ZWNJ:
+      case PAR:
+      case LINE:
+      case EMDASH:
+      case ENDASH:
+      case EMSPACE:
+      case ENSPACE:
+      case BULLET:
+      case LQUOTE:
+      case RQUOTE:
+      case LTRMARK:
+      case RTLMARK:
+      case LDBLQUOTE:
+      case RDBLQUOTE:
+      case CLFITTEXT:
+      case CLFTSWIDTH:
+      case CLNOWRAP:
+      case CLWWIDTH:
+      case TDFRMTXTBOTTOM:
+      case TDFRMTXTLEFT:
+      case TDFRMTXTRIGHT:
+      case TDFRMTXTTOP:
+      case TRFTSWIDTHA:
+      case TRFTSWIDTHB:
+      case TRFTSWIDTH:
+      case TRWWIDTHA:
+      case TRWWIDTHB:
+      case TRWWIDTH:
+      case SECTSPECIFYGENN:
+      case CONTROL_WORD:
+        ;
+        break;
+      default:
+        jj_la1[16] = jj_gen;
+        break label_2;
+      }
+    }
+    rbrace(); // consumes the closing brace and restores the saved state
+  }
+
+  final public void document() throws ParseException, UnsupportedEncodingException { // whole-document production; called by parse()
+    _delegate.startDocument(); // the delegate is told before any token is consumed
+    lbrace();
+    jj_consume_token(RTF); // the outermost group must start with the RTF token ...
+    jj_consume_token(CW_VAL); // ... followed directly by a CW_VAL token
+    document_charset();
+    label_3: // zero or more header controls, each one of: uc | ansicpg | deff
+    while (true) {
+      switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { // peek at the next token kind without consuming it
+      case UC:
+      case ANSICPG:
+      case DEFF:
+        ;
+        break;
+      default:
+        jj_la1[17] = jj_gen; // remember the generation at which this choice point was left
+        break label_3;
+      }
+      switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
+      case UC:
+        uc();
+        break;
+      case ANSICPG:
+        ansicpg();
+        break;
+      case DEFF:
+        deff();
+        break;
+      default:
+        jj_la1[18] = jj_gen;
+        jj_consume_token(-1); // throws the exception built by generateParseException() unless a token of kind -1 is next
+        throw new ParseException(); // reached only if the call above returned
+      }
+    }
+    label_4: // one or more body items: the dispatch below runs before the continuation check
+    while (true) {
+      switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { // dispatch on the kind of the next token
+      case UC:
+        uc();
+        break;
+      case F:
+        f();
+        break;
+      case CS:
+        cs();
+        break;
+      case PLAIN:
+        plain();
+        break;
+      case CLFITTEXT:
+      case CLFTSWIDTH:
+      case CLNOWRAP:
+      case CLWWIDTH:
+      case TDFRMTXTBOTTOM:
+      case TDFRMTXTLEFT:
+      case TDFRMTXTRIGHT:
+      case TDFRMTXTTOP:
+      case TRFTSWIDTHA:
+      case TRFTSWIDTHB:
+      case TRFTSWIDTH:
+      case TRWWIDTHA:
+      case TRWWIDTHB:
+      case TRWWIDTH:
+      case SECTSPECIFYGENN:
+      case CONTROL_WORD:
+        control_word();
+        break;
+      case IGNORABLE_DESTINATION:
+      case FORMULA_CHARACTER:
+      case INDEX_SUBENTRY:
+      case CONTROL_SYM:
+        control_symbol();
+        break;
+      case LBRACE:
+        group(); // nested group
+        break;
+      case NON_BREAKING_SPACE:
+      case OPTIONAL_HYPHEN:
+      case NON_BREAKING_HYPHEN:
+      case ESCAPED_NEWLINE:
+      case ESCAPED_CARRIAGE_RETURN:
+      case ESCAPED_LBRACE:
+      case ESCAPED_RBRACE:
+      case ESCAPED_BACKSLASH:
+      case TEXT:
+      case HEX_CHAR:
+      case U:
+      case TAB:
+      case ZWJ:
+      case ZWNJ:
+      case PAR:
+      case LINE:
+      case EMDASH:
+      case ENDASH:
+      case EMSPACE:
+      case ENSPACE:
+      case BULLET:
+      case LQUOTE:
+      case RQUOTE:
+      case LTRMARK:
+      case RTLMARK:
+      case LDBLQUOTE:
+      case RDBLQUOTE:
+        text();
+        break;
+      default:
+        jj_la1[19] = jj_gen;
+        jj_consume_token(-1); // same error path as in the header loop above
+        throw new ParseException();
+      }
+      switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { // continue only while the next token can start another body item
+      case LBRACE:
+      case NON_BREAKING_SPACE:
+      case OPTIONAL_HYPHEN:
+      case NON_BREAKING_HYPHEN:
+      case ESCAPED_NEWLINE:
+      case ESCAPED_CARRIAGE_RETURN:
+      case IGNORABLE_DESTINATION:
+      case FORMULA_CHARACTER:
+      case INDEX_SUBENTRY:
+      case ESCAPED_LBRACE:
+      case ESCAPED_RBRACE:
+      case ESCAPED_BACKSLASH:
+      case CONTROL_SYM:
+      case TEXT:
+      case HEX_CHAR:
+      case U:
+      case UC:
+      case F:
+      case CS:
+      case PLAIN:
+      case TAB:
+      case ZWJ:
+      case ZWNJ:
+      case PAR:
+      case LINE:
+      case EMDASH:
+      case ENDASH:
+      case EMSPACE:
+      case ENSPACE:
+      case BULLET:
+      case LQUOTE:
+      case RQUOTE:
+      case LTRMARK:
+      case RTLMARK:
+      case LDBLQUOTE:
+      case RDBLQUOTE:
+      case CLFITTEXT:
+      case CLFTSWIDTH:
+      case CLNOWRAP:
+      case CLWWIDTH:
+      case TDFRMTXTBOTTOM:
+      case TDFRMTXTLEFT:
+      case TDFRMTXTRIGHT:
+      case TDFRMTXTTOP:
+      case TRFTSWIDTHA:
+      case TRFTSWIDTHB:
+      case TRFTSWIDTH:
+      case TRWWIDTHA:
+      case TRWWIDTHB:
+      case TRWWIDTH:
+      case SECTSPECIFYGENN:
+      case CONTROL_WORD:
+        ;
+        break;
+      default:
+        jj_la1[20] = jj_gen;
+        break label_4;
+      }
+    }
+    rbrace(); // closes the group opened by lbrace() above
+    _delegate.endDocument(); // the delegate is told only after the closing brace was parsed
+  }
+
+  public RTFParserTokenManager token_source; // lexer from which tokens are pulled on demand
+  SimpleCharStream jj_input_stream; // character stream behind token_source; left unset by the token-manager constructor
+  public Token token, jj_nt; // token: last consumed token; jj_nt: value of token.next as seen on entry to jj_ntk()
+  private int jj_ntk; // kind of the look-ahead token, or -1 when it has not been determined yet
+  private int jj_gen; // incremented each time a token is consumed successfully
+  final private int[] jj_la1 = new int[21]; // per choice point: jj_gen recorded when its default branch was taken (-1 initially)
+  static private int[] jj_la1_0; // per choice point: bit mask over token kinds 0-31
+  static private int[] jj_la1_1; // per choice point: bit mask over token kinds 32-63
+  static private int[] jj_la1_2; // per choice point: bit mask over token kinds 64 and up
+  static {
+      jj_la1_0();
+      jj_la1_1();
+      jj_la1_2();
+   }
+   private static void jj_la1_0() { // fills the mask table for kinds 0-31 (21 entries, one per choice point)
+      jj_la1_0 = new int[] {0x8071f00,0x8571f00,0x8571f00,0x70000,0x700,0x1800,0x1800,0x0,0x0,0x0,0x8e000,0x0,0x0,0x0,0x0,0xf85fff40,0xf85fff40,0x10000000,0x10000000,0x785fff40,0x785fff40,};
+   }
+   private static void jj_la1_1() { // fills the mask table for kinds 32-63
+      jj_la1_1 = new int[] {0xffff0000,0xffff0000,0xffff0000,0x0,0x0,0x80000,0xffff0000,0x0,0x0,0xff00,0x0,0x0,0x0,0x0,0x2e,0xffffff01,0xffffff01,0xc0,0xc0,0xffff0001,0xffff0001,};
+   }
+   private static void jj_la1_2() { // fills the mask table for kinds 64 and up
+      jj_la1_2 = new int[] {0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x40000,0x40000,0x0,0x0,0x7fff,0x17fff,0x40000,0x0,0x17fff,0x17fff,0x0,0x0,0x17fff,0x17fff,};
+   }
+
+  public RTFParser(java.io.InputStream stream) { // byte-stream constructor without an explicit encoding
+     this(stream, null); // null encoding is passed through to SimpleCharStream unchanged
+  }
+  public RTFParser(java.io.InputStream stream, String encoding) { // an unsupported encoding surfaces as RuntimeException
+    try { jj_input_stream = new SimpleCharStream(stream, encoding, 1, 1); } catch(java.io.UnsupportedEncodingException e) { throw new RuntimeException(e); }
+    token_source = new RTFParserTokenManager(jj_input_stream);
+    java.util.Arrays.fill(jj_la1, -1); // no choice point has been visited yet
+    jj_gen = 0;
+    jj_ntk = -1; // look-ahead kind not determined
+    token = new Token(); // placeholder that heads the token chain
+  }
+
+  public void ReInit(java.io.InputStream stream) { // re-targets the parser at a new byte stream, no explicit encoding
+     ReInit(stream, null); // null encoding is passed through to SimpleCharStream unchanged
+  }
+  public void ReInit(java.io.InputStream stream, String encoding) { // an unsupported encoding surfaces as RuntimeException
+    try { jj_input_stream.ReInit(stream, encoding, 1, 1); } catch(java.io.UnsupportedEncodingException e) { throw new RuntimeException(e); }
+    token_source.ReInit(jj_input_stream);
+    java.util.Arrays.fill(jj_la1, -1); // forget every recorded choice point
+    jj_gen = 0;
+    jj_ntk = -1; // look-ahead kind not determined
+    token = new Token(); // fresh head of the token chain
+  }
+
+  public RTFParser(java.io.Reader stream) { // character-stream constructor; positions start at line 1, column 1
+    jj_input_stream = new SimpleCharStream(stream, 1, 1);
+    token_source = new RTFParserTokenManager(jj_input_stream);
+    java.util.Arrays.fill(jj_la1, -1); // no choice point has been visited yet
+    jj_gen = 0;
+    jj_ntk = -1; // look-ahead kind not determined
+    token = new Token(); // placeholder that heads the token chain
+  }
+
+  public void ReInit(java.io.Reader stream) { // re-targets the existing char stream and lexer at a new reader
+    jj_input_stream.ReInit(stream, 1, 1);
+    token_source.ReInit(jj_input_stream);
+    java.util.Arrays.fill(jj_la1, -1); // forget every recorded choice point
+    jj_gen = 0;
+    jj_ntk = -1; // look-ahead kind not determined
+    token = new Token(); // fresh head of the token chain
+  }
+
+  public RTFParser(RTFParserTokenManager tm) { // uses a caller-supplied lexer; jj_input_stream is not assigned here
+    token_source = tm;
+    java.util.Arrays.fill(jj_la1, -1); // no choice point has been visited yet
+    jj_gen = 0;
+    jj_ntk = -1; // look-ahead kind not determined
+    token = new Token(); // placeholder that heads the token chain
+  }
+
+  public void ReInit(RTFParserTokenManager tm) { // swaps in a caller-supplied lexer and resets the parse state
+    token_source = tm;
+    java.util.Arrays.fill(jj_la1, -1); // forget every recorded choice point
+    jj_gen = 0;
+    jj_ntk = -1; // look-ahead kind not determined
+    token = new Token(); // fresh head of the token chain
+  }
+
+  final private Token jj_consume_token(int kind) throws ParseException {
+    Token oldToken;
+    if ((oldToken = token).next != null) token = token.next;
+    else token = token.next = token_source.getNextToken();
+    jj_ntk = -1;
+    if (token.kind == kind) {
+      jj_gen++;
+      return token;
+    }
+    token = oldToken;
+    jj_kind = kind;
+    throw generateParseException();
+  }
+
+  final public Token getNextToken() {
+    if (token.next != null) token = token.next;
+    else token = token.next = token_source.getNextToken();
+    jj_ntk = -1;
+    jj_gen++;
+    return token;
+  }
+
+  final public Token getToken(int index) {
+    Token t = token;
+    for (int i = 0; i < index; i++) {
+      if (t.next != null) t = t.next;
+      else t = t.next = token_source.getNextToken();
+    }
+    return t;
+  }
+
+  final private int jj_ntk() {
+    if ((jj_nt=token.next) == null)
+      return (jj_ntk = (token.next=token_source.getNextToken()).kind);
+    else
+      return (jj_ntk = jj_nt.kind);
+  }
+
+  private java.util.Vector jj_expentries = new java.util.Vector(); // int[] entries collected by generateParseException()
+  private int[] jj_expentry; // the entry most recently added to jj_expentries
+  private int jj_kind = -1; // kind requested by the failing jj_consume_token() call, or -1 when none is pending
+
+  /**
+   * Builds the ParseException describing the current failure: the token
+   * kinds that would have been accepted here, plus the last consumed token.
+   */
+  public ParseException generateParseException() {
+    jj_expentries.removeAllElements();
+    // One flag per token kind; a fresh boolean[] is already all-false.
+    final boolean[] expectedKinds = new boolean[84];
+    // The kind that jj_consume_token() was asked for, if any, is expected.
+    if (jj_kind >= 0) {
+      expectedKinds[jj_kind] = true;
+      jj_kind = -1;
+    }
+    // Add the kinds accepted by every choice point that was left at the
+    // current generation, decoding the three 32-bit masks bit by bit.
+    for (int choice = 0; choice < 21; choice++) {
+      if (jj_la1[choice] != jj_gen) continue;
+      for (int bit = 0; bit < 32; bit++) {
+        final int mask = 1 << bit;
+        if ((jj_la1_0[choice] & mask) != 0) expectedKinds[bit] = true;
+        if ((jj_la1_1[choice] & mask) != 0) expectedKinds[32 + bit] = true;
+        if ((jj_la1_2[choice] & mask) != 0) expectedKinds[64 + bit] = true;
+      }
+    }
+    // Turn each flagged kind into a one-element expected sequence.
+    for (int kind = 0; kind < 84; kind++) {
+      if (expectedKinds[kind]) {
+        jj_expentry = new int[] { kind };
+        jj_expentries.addElement(jj_expentry);
+      }
+    }
+    final int count = jj_expentries.size();
+    final int[][] exptokseq = new int[count][];
+    for (int i = 0; i < count; i++) {
+      exptokseq[i] = (int[]) jj_expentries.elementAt(i);
+    }
+    return new ParseException(token, exptokseq, tokenImage);
+  }
+
+  final public void enable_tracing() { // intentionally empty: this parser contains no tracing code
+  }
+
+  final public void disable_tracing() { // intentionally empty: this parser contains no tracing code
+  }
+
+}
Index: src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/ParseException.java
===================================================================
--- src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/ParseException.java	(revision 0)
+++ src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/ParseException.java	(revision 0)
@@ -0,0 +1,207 @@
+/* Generated By:JavaCC: Do not edit this line. ParseException.java Version 3.0 */
+/*
+ *  Copyright 2005 Blandware (http://www.blandware.com)
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+package org.apache.nutch.parse.rtf;
+
+/**
+ * This exception is thrown when parse errors are encountered.
+ * You can explicitly create objects of this exception type by
+ * calling the method generateParseException in the generated
+ * parser.
+ *
+ * You can modify this class to customize your error reporting
+ * mechanisms so long as you retain the public fields.
+ */
+public class ParseException extends Exception {
+
+  /**
+   * This constructor is used by the method "generateParseException"
+   * in the generated parser.  Calling this constructor generates
+   * a new object of this type with the fields "currentToken",
+   * "expectedTokenSequences", and "tokenImage" set.  The boolean
+   * flag "specialConstructor" is also set to true to indicate that
+   * this constructor was used to create this object.
+   * This constructor calls its super class with the empty string
+   * to force the "toString" method of parent class "Throwable" to
+   * print the error message in the form:
+   *     ParseException: <result of getMessage>
+   */
+  public ParseException(Token currentTokenVal,
+                        int[][] expectedTokenSequencesVal,
+                        String[] tokenImageVal
+                       )
+  {
+    super("");
+    specialConstructor = true;
+    currentToken = currentTokenVal;
+    expectedTokenSequences = expectedTokenSequencesVal;
+    tokenImage = tokenImageVal;
+  }
+
+  /**
+   * The following constructors are for use by you for whatever
+   * purpose you can think of.  Constructing the exception in this
+   * manner makes the exception behave in the normal way - i.e., as
+   * documented in the class "Throwable".  The fields "errorToken",
+   * "expectedTokenSequences", and "tokenImage" do not contain
+   * relevant information.  The JavaCC generated code does not use
+   * these constructors.
+   */
+
+  public ParseException() {
+    super();
+    specialConstructor = false;
+  }
+
+  public ParseException(String message) {
+    super(message);
+    specialConstructor = false;
+  }
+
+  /**
+   * This variable determines which constructor was used to create
+   * this object and thereby affects the semantics of the
+   * "getMessage" method (see below).
+   */
+  protected boolean specialConstructor;
+
+  /**
+   * This is the last token that has been consumed successfully.  If
+   * this object has been created due to a parse error, the token
+   * followng this token will (therefore) be the first error token.
+   */
+  public Token currentToken;
+
+  /**
+   * Each entry in this array is an array of integers.  Each array
+   * of integers represents a sequence of tokens (by their ordinal
+   * values) that is expected at this point of the parse.
+   */
+  public int[][] expectedTokenSequences;
+
+  /**
+   * This is a reference to the "tokenImage" array of the generated
+   * parser within which the parse error occurred.  This array is
+   * defined in the generated ...Constants interface.
+   */
+  public String[] tokenImage;
+
+  /**
+   * This method has the standard behavior when this object has been
+   * created using the standard constructors.  Otherwise, it uses
+   * "currentToken" and "expectedTokenSequences" to generate a parse
+   * error message and returns it.  If this object has been created
+   * due to a parse error, and you do not catch it (it gets thrown
+   * from the parser), then this method is called during the printing
+   * of the final stack trace, and hence the correct error message
+   * gets displayed.
+   */
+  public String getMessage() {
+    if (!specialConstructor) {
+      return super.getMessage();
+    }
+    StringBuffer expected = new StringBuffer();
+    int maxSize = 0;
+    for (int i = 0; i < expectedTokenSequences.length; i++) {
+      if (maxSize < expectedTokenSequences[i].length) {
+        maxSize = expectedTokenSequences[i].length;
+      }
+      for (int j = 0; j < expectedTokenSequences[i].length; j++) {
+        expected.append(tokenImage[expectedTokenSequences[i][j]]).append(" ");
+      }
+      if (expectedTokenSequences[i][expectedTokenSequences[i].length - 1] != 0) {
+        expected.append("...");
+      }
+      expected.append(eol).append("    ");
+    }
+    String retval = "Encountered \"";
+    Token tok = currentToken.next;
+    for (int i = 0; i < maxSize; i++) {
+      if (i != 0) retval += " ";
+      if (tok.kind == 0) {
+        retval += tokenImage[0];
+        break;
+      }
+      retval += add_escapes(tok.image);
+      tok = tok.next; 
+    }
+    retval += "\" at line " + currentToken.next.beginLine + ", column " + currentToken.next.beginColumn;
+    retval += "." + eol;
+    if (expectedTokenSequences.length == 1) {
+      retval += "Was expecting:" + eol + "    ";
+    } else {
+      retval += "Was expecting one of:" + eol + "    ";
+    }
+    retval += expected.toString();
+    return retval;
+  }
+
+  /**
+   * The end of line string for this machine.
+   */
+  protected String eol = System.getProperty("line.separator", "\n");
+ 
+  /**
+   * Used to convert raw characters to their escaped version
+   * when these raw version cannot be used as part of an ASCII
+   * string literal.
+   */
+  protected String add_escapes(String str) {
+      StringBuffer retval = new StringBuffer();
+      char ch;
+      for (int i = 0; i < str.length(); i++) {
+        switch (str.charAt(i))
+        {
+           case 0 :
+              continue;
+           case '\b':
+              retval.append("\\b");
+              continue;
+           case '\t':
+              retval.append("\\t");
+              continue;
+           case '\n':
+              retval.append("\\n");
+              continue;
+           case '\f':
+              retval.append("\\f");
+              continue;
+           case '\r':
+              retval.append("\\r");
+              continue;
+           case '\"':
+              retval.append("\\\"");
+              continue;
+           case '\'':
+              retval.append("\\\'");
+              continue;
+           case '\\':
+              retval.append("\\\\");
+              continue;
+           default:
+              if ((ch = str.charAt(i)) < 0x20 || ch > 0x7e) {
+                 String s = "0000" + Integer.toString(ch, 16);
+                 retval.append("\\u" + s.substring(s.length() - 4, s.length()));
+              } else {
+                 retval.append(ch);
+              }
+              continue;
+        }
+      }
+      return retval.toString();
+   }
+
+}
Index: src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParser.jj
===================================================================
--- src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParser.jj	(revision 0)
+++ src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParser.jj	(revision 0)
@@ -0,0 +1,1045 @@
+options {
+  STATIC = false;
+  //DEBUG_PARSER = true;
+  //DEBUG_TOKEN_MANAGER=true;
+  UNICODE_INPUT = true;
+}
+ 
+PARSER_BEGIN(RTFParser)
+
+/*
+ *  Copyright 2005 Blandware (http://www.blandware.com)
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+package org.apache.nutch.parse.rtf;
+
+import java.io.*;
+import java.util.*;
+
+/**
+ * <p>RTFParser</p>
+ * <p>Some methods of this are generated by javaCC</p>
+ * @author &lt;eric@etranslate.com&gt;
+ * @author Roman Puchkovskiy <a href="mailto:roman.puchkovskiy@blandware.com">
+ * &lt;roman.puchkovskiy@blandware.com&gt;</a>
+ * @version $Revision: 1.1 $ $Date: 2005/03/26 14:26:06 $
+ */
+public class RTFParser implements RTFParserDelegate {
+
+  /* maps windows character sets (\fcharsetN, N in 0..255) to java encoding names; sparse array */
+  private static final String[] CHARSET_ENCODING_TABLE = new String[256];
+  static {
+        CHARSET_ENCODING_TABLE[0] = "Cp1252";     // ANSI
+        CHARSET_ENCODING_TABLE[1] = "Cp1252";     // Default
+        CHARSET_ENCODING_TABLE[2] = "Cp1252";     // Symbol
+        CHARSET_ENCODING_TABLE[3] = null;         // Invalid
+        CHARSET_ENCODING_TABLE[77] = "MacRoman";  // Mac
+        CHARSET_ENCODING_TABLE[128] = "MS932";    // Shift JIS
+        CHARSET_ENCODING_TABLE[129] = "MS949";    // Hangul
+        CHARSET_ENCODING_TABLE[130] = "Johab";    // Johab
+        CHARSET_ENCODING_TABLE[134] = "MS936";    // GB2312
+        CHARSET_ENCODING_TABLE[136] = "MS950";    // Big5
+        CHARSET_ENCODING_TABLE[161] = "Cp1253";   // Greek
+        CHARSET_ENCODING_TABLE[162] = "Cp1254";   // Turkish
+        CHARSET_ENCODING_TABLE[163] = "Cp1258";   // Vietnamese
+        CHARSET_ENCODING_TABLE[177] = "Cp1255";   // Hebrew
+        CHARSET_ENCODING_TABLE[178] = "Cp1256";   // Arabic
+        CHARSET_ENCODING_TABLE[179] = "Cp1256";   // Arabic Traditional
+        CHARSET_ENCODING_TABLE[180] = "Cp1256";   // Arabic User
+        CHARSET_ENCODING_TABLE[181] = "Cp1255";   // Hebrew User
+        CHARSET_ENCODING_TABLE[186] = "Cp1257";   // Baltic
+        CHARSET_ENCODING_TABLE[204] = "Cp1251";   // Russian (code page 1251 per the RTF charset table)
+        CHARSET_ENCODING_TABLE[222] = "MS874";    // Thai
+        CHARSET_ENCODING_TABLE[238] = "Cp1250";   // East European
+        CHARSET_ENCODING_TABLE[254] = "Cp437";    // PC 437
+        CHARSET_ENCODING_TABLE[255] = "Cp850";    // OEM (code page 850 per the RTF charset table)
+  }
+
+  /*
+   * These next two tables map windows codepages to java encoding names.
+   * The codepage ints are too large to do a sparse array, so we have
+   * two parallel arrays and do a binary search to find the common offset.
+   */
+
+  private static final int[] RTF_CODEPAGE = { // must stay sorted ascending: searched with Arrays.binarySearch
+        437, // United States IBM 
+
+        /*  Not supported by JDK 1.3.1
+        708, // Arabic (ASMO 708) 
+        709, // Arabic (ASMO 449+, BCON V4) 
+        710, // Arabic (transparent Arabic) 
+        711, // Arabic (Nafitha Enhanced) 
+        720, // Arabic (transparent ASMO) 
+        */
+
+        819, // Windows 3.1 (United States and Western Europe) 
+        850, // IBM multilingual 
+        852, // Eastern European 
+        860, // Portuguese 
+        862, // Hebrew 
+        863, // French Canadian 
+        864, // Arabic 
+        865, // Norwegian 
+        866, // Soviet Union 
+        874, // Thai 
+        932, // Japanese 
+        936, // Simplified Chinese 
+        949, // Korean 
+        950, // Traditional Chinese 
+        1250, // Windows 3.1 (Eastern European) 
+        1251, // Windows 3.1 (Cyrillic) 
+        1252, // Western European 
+        1253, // Greek 
+        1254, // Turkish 
+        1255, // Hebrew 
+        1256, // Arabic 
+        1257, // Baltic 
+        1258, // Vietnamese 
+        1361  // Johab
+  };
+
+  private static final String[] JAVA_ENCODINGS = { // entry i names the encoding for RTF_CODEPAGE[i]
+        "Cp437", // United States IBM 
+        /*  Not supported by JDK 1.3.1
+        "Cp708", // Arabic (ASMO 708) 
+        "Cp709", // Arabic (ASMO 449+, BCON V4) 
+        "Cp710", // Arabic (transparent Arabic) 
+        "Cp711", // Arabic (Nafitha Enhanced) 
+        "Cp720", // Arabic (transparent ASMO) 
+        */
+        "Cp819", // Windows 3.1 (United States and Western Europe) 
+        "Cp850", // IBM multilingual 
+        "Cp852", // Eastern European 
+        "Cp860", // Portuguese 
+        "Cp862", // Hebrew 
+        "Cp863", // French Canadian 
+        "Cp864", // Arabic 
+        "Cp865", // Norwegian 
+        "Cp866", // Soviet Union 
+        "MS874", // Thai 
+        "MS932", // Japanese 
+        "MS936", // Simplified Chinese 
+        "MS949", // Korean 
+        "MS950", // Traditional Chinese 
+        "Cp1250", // Windows 3.1 (Eastern European) 
+        "Cp1251", // Windows 3.1 (Cyrillic) 
+        "Cp1252", // Western European 
+        "Cp1253", // Greek 
+        "Cp1254", // Turkish 
+        "Cp1255", // Hebrew 
+        "Cp1256", // Arabic 
+        "Cp1257", // Baltic 
+        "Cp1258", // Vietnamese 
+        "Johab"  // Johab
+  };
+
+  /**
+   * Searches RTF_CODEPAGE table for the offset of rtfCodepage and returns
+   * the corresponding encoding name from the JAVA_ENCODINGS table, or
+   * null if none is present.
+   */
+  private static final String getJavaEncoding(int rtfCodepage) {
+    int offset = Arrays.binarySearch(RTF_CODEPAGE, rtfCodepage);
+    return offset < 0 ? null : JAVA_ENCODINGS[offset]; // a negative result means the codepage is not listed
+  }
+
+  /* support for skipping bytes after a unicode character.
+   * TODO: handle \bin
+   */
+  // the default number of bytes to skip after a unicode character
+  private static final Integer DEFAULT_SKIP_STATE = new Integer(1);
+  // the current number of bytes to skip after a unicode character
+  private Integer _currentSkipState = DEFAULT_SKIP_STATE;
+  // a stack of skip states for bytes following a unicode character
+  private final Stack _ucSkipStates = new Stack();
+
+  // the default encoding for all RTF documents
+  private static final String DEFAULT_ENCODING = "Cp1252";
+  // the document encoding for this RTF document
+  private String _documentEncoding = DEFAULT_ENCODING;
+
+  /* support for parsing the \fonttbl to discover font codes and
+   * their assigned encodings
+   */
+  // this holds the font table key (\fN) while we're waiting for the
+  // charset (\fcharsetN) declaration in the font table.
+  private int _currentFontValue = 0;
+  // this maps font codes (\fN) to the encodings assigned (\fcharsetN)
+  // in the fonttbl
+  private final Map _fontEncodingMap = new HashMap();
+
+  /** support for encoding changes via references to the font table */
+  // the current text encoding
+  private String _currentEncoding = DEFAULT_ENCODING;
+  // a stack of text encodings across groups
+  private final Stack _fontEncodingStack = new Stack();
+
+  private int _currentStyleValue = 0;
+  private final Map _styleMap = new HashMap();
+  private final Stack _styleStack = new Stack();
+  private String _currentStyle = NO_STYLE;
+
+  private int _where = IN_DOCUMENT;
+
+  private int _braceDepth = 0;
+  private String _newline;
+
+  // The delegate to which the parser forwards productions.
+  // Unless setDelegate is called, this will be the parser
+  // itself, which supplies a default implementation (see below).
+  // this enables us to avoid doing null checks in the delegate
+  // calls.
+
+  private RTFParserDelegate _delegate = this;
+
+  public static void main(String args[]) throws ParseException { // parses System.in; with no delegate set, text goes to System.out
+    RTFParser parser = RTFParser.createParser(new InputStreamReader(System.in));
+    parser.parse();
+  }
+
+  public void reinitialize(Reader reader) { // NOTE(review): resets only the ReInit state; encodings, maps and stacks above keep their values - confirm intended
+    ReInit(reader);
+  }
+
+  public static RTFParser createParser(Reader reader) {
+    return new RTFParser(reader);
+  }
+
+  public void parse() throws ParseException { // parses one document; encoding failures are reported as ParseException
+    try {
+      document();
+    } catch (UnsupportedEncodingException uee) {
+      throw new ParseException("Could not decode bytes in encoding: " +
+                               uee.getMessage());
+    }
+  }
+
+  public void setDelegate(RTFParserDelegate delegate) { // no null check: passing null would break the null-free delegate calls
+    _delegate = delegate;
+  }
+
+  public String getNewLine() { // null until setNewLine has been called
+    return _newline;
+  }
+
+  public void setNewLine(String newline) {
+    _newline = newline;
+  }
+
+  /**
+   * Returns a numbered font which supports the encoding.
+   * This data is gleaned from the RTF fonttbl, and so
+   * is not available until after the fonttbl has been
+   * parsed.  No guarantees are made about which font
+   * will be returned if multiple fonts support the
+   * encoding.
+   *
+   * @return a font control word value, or -1 if no font uses the encoding.
+   */
+  public int getFontForEncoding(String encoding) {
+    for (Iterator i = _fontEncodingMap.entrySet().iterator(); i.hasNext();) {
+        Map.Entry entry = (Map.Entry)i.next();
+        if (entry.getValue().equals(encoding)) {
+           return ((Integer)entry.getKey()).intValue();
+        }
+    }
+    return -1;
+  }
+
+  // default implementation of RTFParserDelegate, used when no delegate is
+  // set: text is echoed to System.out, every other callback is a no-op.
+  public void text(String text, String style, int context) {
+      System.out.println(text);
+  }
+
+  public void controlSymbol(String controlSymbol, int context) {}
+
+  public void controlWord(String controlWord, int value, int context) {}
+
+  public void openGroup(int depth) {}
+
+  public void closeGroup(int depth) {}
+
+  public void styleList(List styles) {}
+
+  public void startDocument() {}
+
+  public void endDocument() {}
+
+  private void setCurrentEncoding(String encoding) {
+    if (null == encoding) {
+       throw new IllegalArgumentException("current encoding cannot be null");
+    }
+    _currentEncoding = encoding;
+  }
+
+  private String getCurrentEncoding() { // the font-derived encoding applies only while in the document body
+    if (_where == IN_DOCUMENT) {
+      return _currentEncoding;
+    } else {
+      return _documentEncoding;
+    }
+  }
+
+  private String getCurrentStyle() {
+    return _currentStyle;
+  }
+
+  private void setCurrentStyle(String style) {
+    _currentStyle = style;
+  }
+
+  private Integer getCurrentSkipState() {
+    return _currentSkipState;
+  }
+
+  private void setCurrentSkipState(Integer skipState) {
+    _currentSkipState = skipState;
+  }
+
+  private void setDocumentEncoding(String encoding) {
+    if (null == encoding) {
+       throw new IllegalArgumentException("document encoding cannot be null");
+    }
+    _documentEncoding = encoding;
+  }
+
+  /**
+   * convenience method which downcasts the chars in str to a byte
+   * array without attempting to decode them.
+   */
+  private byte[] stringToBytes(String str) {
+    char[] cbuf = str.toCharArray();
+    byte[] buf = new byte[cbuf.length];
+    for (int i = 0; i < cbuf.length; i++) {
+      buf[i] = (byte)cbuf[i]; // keeps only the low 8 bits of each char
+    }
+    return buf;
+  }
+}
+ 
+PARSER_END(RTFParser)
+
+/*************************************/
+/* lexical specification begins here */
+/*************************************/
+
+// backslash introduces a control, sending us into that lexical state.
+// backslash followed by single quote introduces a hex-encoded character,
+// which we process in the HEX state.  This allows us to distinguish
+// hex-escaped characters from RTF controls on the basis of a string
+// literal (using the parser's DFA) rather than a regular expression
+// (which uses the parser's NFA).
+// see <http://www.cs.albany.edu/~sreeni/JavaCC/lexertips.html>
+// for details on this topic.
+<*>
+MORE:
+{
+  <BACKSLASH: "\\"> : CONTROL  // MORE: the backslash is kept as a prefix of whatever is matched next
+}
+
+<*>
+MORE:
+{
+  <HEX_ESCAPE: "\\'"> : HEX  // longer literal than BACKSLASH, so it is the one matched when a quote follows
+}
+
+// newlines and tab literals are ignored in the default state.
+// Note that space characters are *not* ignored, since they are text.
+<DEFAULT>
+SKIP :
+{
+  "\n"  // no state change: the lexer stays in DEFAULT
+| "\r"
+| "\t"
+}
+
+// braces begin/end a group and put us into the DEFAULT lexical state
+<*>
+TOKEN:
+{
+  <LBRACE: "{"> : DEFAULT  // recognised in every lexical state, not only in DEFAULT
+| <RBRACE: "}"> : DEFAULT
+}
+
+// apart from {, }, and \, everything else (less skipped whitespace) in 
+// the DEFAULT state is (1) text; or (2) a control symbol.
+
+// these control symbol literals are escaped special characters
+<DEFAULT>
+TOKEN:
+{
+  <NON_BREAKING_SPACE: "\\~">  { matchedToken.image = "\u00A0"; }  // image becomes U+00A0 (no-break space)
+| <OPTIONAL_HYPHEN: "\\-">     { matchedToken.image = "\u00ad"; }  // image becomes U+00AD (soft hyphen)
+| <NON_BREAKING_HYPHEN: "\\_"> { matchedToken.image = "\u2011"; }  // image becomes U+2011 (non-breaking hyphen)
+}
+
+// the RTF spec allows writers to emit a backslash (newline|carriage return)
+// token and requires us to treat it as a \par token.  See the fine
+// print on page 89 of the 1.6 spec.
+// Since \par is user-configurable (see setNewLine()), we cannot change
+// the matched value here; rather, this is done in the parser, which has
+// access to the user's EOL String.
+<DEFAULT>
+TOKEN:
+{
+  <ESCAPED_NEWLINE: "\\\n">  // backslash + line feed; image is left as matched
+| <ESCAPED_CARRIAGE_RETURN: "\\\r">  // backslash + carriage return; image is left as matched
+}
+
+// these control symbol literals are not handled in this parser (except
+// to pass them onto the delegate), but we specify them as literals
+// so that they can be matched quickly.
+<DEFAULT>
+TOKEN:
+{
+  <IGNORABLE_DESTINATION: "\\*">  // backslash + asterisk
+| <FORMULA_CHARACTER: "\\|">  // backslash + vertical bar
+| <INDEX_SUBENTRY: "\\:">  // backslash + colon
+}
+
+// escaped braces and backslashes are text
+<DEFAULT>
+TOKEN:
+{
+  <ESCAPED_LBRACE: "\\{">     { matchedToken.image = "{"; }  // image is replaced by the bare character
+| <ESCAPED_RBRACE: "\\}">     { matchedToken.image = "}"; }
+| <ESCAPED_BACKSLASH: "\\\\"> { matchedToken.image = "\\"; }  // image becomes a single backslash
+}
+
+// the patterns for matching control symbols that we forward to the delegate
+// and text.
+<DEFAULT>
+TOKEN:
+{
+  <CONTROL_SYM: "\\" ~["a"-"z", "A"-"Z", "0"-"9", " ",
+                       "\n", "\r", "\t", "}", "{", "\\"]>
+| <TEXT: (~["\\","{","}","\n","\r", "\t"])+>
+}
+
+// end of DEFAULT lexical state specification
+
+// we handle hex characters in their own lexical state, with a single
+// pattern.  When matched, this gets combined with the MORE \' match
+// that sent us into this state to begin with.  We don't use the
+// CONTROL state because the hex characters mean something else there.
+<HEX>
+TOKEN:
+{
+  <#HEX_DIGIT: ["0"-"9","a"-"f","A"-"F"]>
+| <HEX_CHAR: <HEX_DIGIT> <HEX_DIGIT>> : DEFAULT
+}
+
+// end of HEX lexical state specification
+
+// In the CONTROL state, whitespace is semantically meaningless; 
+// syntactically, however, it marks the end of whatever control we're
+// lexing, putting us back in the DEFAULT state.
+//
+// For example: given input like this: "\control1 \control2" the intervening
+// space is not semantically part of either control token, but it does
+// delimit them.
+//
+// N.B. This input - "\control1    \control2" - is different. The first
+// intervening space is ignorable.  The subsequent spaces, however, are
+// text and must be accumulated in a TEXT token in the DEFAULT state.
+<CONTROL>
+SKIP:
+{
+  " " : DEFAULT
+| "\n" : DEFAULT
+| "\r" : DEFAULT
+| "\t" : DEFAULT
+}
+ 
+/* Unicode character value control word literal */
+<CONTROL>
+TOKEN:
+{
+  <U: "u">
+}
+
+/* Unicode skipping directive control word literal */
+<CONTROL>
+TOKEN:
+{
+ <UC: "uc">
+}
+
+/* style, font, and font charset control word literals */
+<CONTROL>
+TOKEN:
+{
+  <F: "f">
+| <CS: "cs">
+| <FCHARSET: "fcharset">
+| <PLAIN: "plain">
+}
+
+/* Document encoding control word literals */
+<CONTROL>
+TOKEN:
+{
+  <PC: "pc">
+| <PCA: "pca">
+| <MAC: "mac">
+| <RTF: "rtf">
+| <ANSI: "ansi">
+| <ANSICPG: "ansicpg">
+| <DEFF: "deff">
+}
+
+/* Document table declaration control word literals */
+<CONTROL>
+TOKEN:
+{
+  <INFO: "info">
+| <REVTBL: "revtbl">
+| <PNTEXT: "pntext">
+| <FONTTBL: "fonttbl">
+| <COLORTBL: "colortbl">
+| <PNSECLVL: "pnseclvl">
+| <LISTTABLE: "listtable">
+| <STYLESHEET: "stylesheet">
+}
+
+/* control word literals which designate special characters */
+<CONTROL>
+TOKEN:
+{
+  <TAB: "tab"> 
+| <ZWJ: "zwj"> 
+| <ZWNJ: "zwnj"> 
+| <PAR: "par"> 
+| <LINE: "line">
+| <EMDASH: "emdash"> 
+| <ENDASH: "endash"> 
+| <EMSPACE: "emspace"> 
+| <ENSPACE: "enspace"> 
+| <BULLET: "bullet"> 
+| <LQUOTE: "lquote"> 
+| <RQUOTE: "rquote"> 
+| <LTRMARK: "ltrmark"> 
+| <RTLMARK: "rtlmark"> 
+| <LDBLQUOTE: "ldblquote"> 
+| <RDBLQUOTE: "rdblquote"> 
+}
+
+/* the "exceptional" set of control words - these are the words that
+ * have mixed case, despite the requirement in RTF that control words
+ * be lower case ascii.  */
+<CONTROL>
+TOKEN:
+{
+  <CLFITTEXT: "clFitText">
+| <CLFTSWIDTH: "clftsWidth">
+| <CLNOWRAP: "clNoWrap">
+| <CLWWIDTH: "clwWidth">
+| <TDFRMTXTBOTTOM: "tdfrmtxtBottom">
+| <TDFRMTXTLEFT: "tdfrmtxtLeft">
+| <TDFRMTXTRIGHT: "tdfrmtxtRight">
+| <TDFRMTXTTOP: "tdfrmtxtTop">
+| <TRFTSWIDTHA: "trftsWidthA">
+| <TRFTSWIDTHB: "trftsWidthB">
+| <TRFTSWIDTH: "trftsWidth">
+| <TRWWIDTHA: "trwWidthA">
+| <TRWWIDTHB: "trwWidthB">
+| <TRWWIDTH: "trwWidth">
+| <SECTSPECIFYGENN: "sectspecifygenN">
+}
+
+/* control words which we don't handle (but which we forward to the
+ * delegate nonetheless).
+ */
+<CONTROL>
+TOKEN:
+{
+  <#LC_LETTER: ["a"-"z"]>
+| <CONTROL_WORD: (<LC_LETTER> | ["S","B","N"])+>
+}
+
+/* control parameters: note that they may be negative values */
+<CONTROL>
+TOKEN:
+{
+  <#DIGIT: ["0"-"9"]>
+| <CW_VAL: (["-"])? (<DIGIT>)+>
+}
+
+/* any character which wasn't matched as part of a control word or its
+ * value terminates the control (sending us back into the DEFAULT state)
+ * but is not actually part of the control (and so can be skipped).
+ */
+<CONTROL>
+SKIP:
+{
+ <~[]> : DEFAULT
+}
+
+// end of CONTROL lexical specification
+
+/**************************************/
+/* grammatical productions begin here */
+/**************************************/
+
+/**
+ *  Collects one run of text and passes it to the delegate as a String,
+ *  together with the current style and the document location (_where).
+ *  Hex escapes and raw text are gathered as bytes and decoded with the
+ *  current encoding; "u" characters, escapes and special characters are
+ *  appended as already-decoded chars.  While in the stylesheet, the text
+ *  is also stored in _styleMap under the current style number.
+ */
+void text() throws UnsupportedEncodingException :
+{
+  StringBuffer buf = new StringBuffer();
+  StringBuffer cbuf = new StringBuffer();
+  ByteArrayOutputStream baos = new ByteArrayOutputStream();
+  byte b;
+  byte[] raw;
+}
+{
+  (
+    (
+        u(cbuf) raw=skip_after_unicode()
+      {
+        if (raw != null) {
+          cbuf.append(new String(raw, getCurrentEncoding()));
+        }
+      }
+      | escaped(cbuf)
+      | special_character(cbuf)
+      | textual_control_symbol(cbuf)
+    ) {
+      if (baos.size() > 0) {
+        buf.append(baos.toString(getCurrentEncoding()));
+        baos.reset();
+      }
+      buf.append(cbuf.toString());
+      cbuf.setLength(0);
+    }
+    | b=hex()        { baos.write(b); }
+    | raw=raw_text() { baos.write(raw,0,raw.length); }
+  )+ 
+  {
+    if (baos.size() > 0) {
+      buf.append(baos.toString(getCurrentEncoding()));
+      baos.reset();
+    }
+    if (_where == IN_STYLESHEET) {
+      _styleMap.put(new Integer(_currentStyleValue), buf.toString());
+    }
+    _delegate.text(buf.toString(), getCurrentStyle(), _where);
+  }
+}
+
+// Matches one TEXT token and returns its image converted by stringToBytes().
+byte[] raw_text() throws UnsupportedEncodingException :
+{ Token chunk; }
+{
+  chunk=<TEXT>
+  { return stringToBytes(chunk.image); }
+}
+
+// Appends to buf the first character of the matched token's image, which the
+// lexer has already replaced with the unescaped backslash or brace.
+void escaped(StringBuffer buf) :
+{ Token literal; }
+{
+  (
+      literal=<ESCAPED_BACKSLASH>
+    | literal=<ESCAPED_LBRACE>
+    | literal=<ESCAPED_RBRACE>
+  ) {
+    buf.append(literal.image.charAt(0));
+  }
+}
+
+// Appends to buf the image of a non-breaking space, optional hyphen or
+// non-breaking hyphen symbol; the lexer has already set that image.
+void textual_control_symbol(StringBuffer buf) :
+{ Token symbol; }
+{
+  (
+      symbol=<NON_BREAKING_SPACE>
+    | symbol=<OPTIONAL_HYPHEN>
+    | symbol=<NON_BREAKING_HYPHEN>
+  ) {
+    buf.append(symbol.image);
+  }
+}
+
+// Converts a HEX_CHAR token into a byte: everything after the first two
+// characters of the image (the "\'" prefix) is parsed as a base-16 number.
+byte hex() :
+{ Token esc; }
+{
+  esc=<HEX_CHAR>
+  {
+    String digits = esc.image.substring(2);
+    return (byte)Integer.parseInt(digits, 16);
+  }
+}
+
+// Appends to buf the character a special-character control word stands for.
+// "par" and an escaped newline/carriage return all append getNewLine().
+void special_character(StringBuffer buf) :
+{
+}
+{
+  (
+      <LINE>       { buf.append('\r'); }
+    | <TAB>        { buf.append('\t'); }
+    | <EMDASH>     { buf.append('\u2014'); }
+    | <ENDASH>     { buf.append('\u2013'); }
+    | <EMSPACE>    { buf.append('\u2003'); }
+    | <ENSPACE>    { buf.append('\u2002'); } // EN SPACE; was U+0020, a plain space
+    | <BULLET>     { buf.append('\u2022'); }
+    | <LQUOTE>     { buf.append('\u2018'); }
+    | <RQUOTE>     { buf.append('\u2019'); }
+    | <LDBLQUOTE>  { buf.append('\u201c'); }
+    | <RDBLQUOTE>  { buf.append('\u201d'); }
+    | <LTRMARK>    { buf.append('\u200e'); }
+    | <RTLMARK>    { buf.append('\u200f'); }
+    | <ZWJ>        { buf.append('\u200d'); }
+    | <ZWNJ>       { buf.append('\u200c'); }
+    | (   <PAR>
+        | <ESCAPED_NEWLINE>
+        | <ESCAPED_CARRIAGE_RETURN>
+      ) { buf.append(getNewLine()); }
+  )
+}
+
+// Opens a group: saves encoding, skip state and style, then notifies the delegate.
+void lbrace() :
+{}
+{
+  <LBRACE>
+  {
+    _fontEncodingStack.push(getCurrentEncoding());
+    _ucSkipStates.push(getCurrentSkipState());
+    _styleStack.push(getCurrentStyle());
+    _delegate.openGroup(++_braceDepth); // depth is incremented before the call
+  }
+}
+
+// Closes a group: restores the skip state, encoding and style from their stacks.
+void rbrace() :
+{}
+{
+  <RBRACE>
+  {
+    setCurrentSkipState((Integer)_ucSkipStates.pop());
+    setCurrentEncoding((String)_fontEncodingStack.pop());
+    setCurrentStyle((String)_styleStack.pop());
+    _delegate.closeGroup(_braceDepth); // reported with the depth being closed
+    if (--_braceDepth == 1) { // back at depth 1: leaving a table
+      if (IN_STYLESHEET == _where) {
+        _delegate.styleList(new ArrayList(_styleMap.values()));
+      }
+      _where = IN_DOCUMENT;
+    }
+  }
+}
+
+// Sets _where for the table control word just matched (pnseclvl shares IN_PNTEXT).
+void table_declaration() :
+{}
+{
+  (
+      <INFO>                   { _where = IN_INFO; }
+    | <FONTTBL>                { _where = IN_FONTTBL; }
+    | <COLORTBL>               { _where = IN_COLORTBL; }
+    | <STYLESHEET>             { _where = IN_STYLESHEET; }
+    | <LISTTABLE>              { _where = IN_LISTTABLE; }
+    | <REVTBL>                 { _where = IN_REVTBL; }
+    | <PNTEXT> [ <CW_VAL> ]    { _where = IN_PNTEXT; }
+    | <PNSECLVL> [ <CW_VAL> ]  { _where = IN_PNTEXT; }
+  )
+}
+
+// Reports a control symbol to the delegate together with the current location.
+void control_symbol() :
+{ Token symbol = null; }
+{
+  (
+      symbol=<CONTROL_SYM>
+    | symbol=<IGNORABLE_DESTINATION>
+    | symbol=<FORMULA_CHARACTER>
+    | symbol=<INDEX_SUBENTRY>
+  )
+  {
+    _delegate.controlSymbol(symbol.image, _where);
+  }
+}
+
+// Matches any one of the mixed-case control word literals and returns the
+// matched token, so control_word() can treat it like a CONTROL_WORD.
+Token mixed_case_control_word() :
+{ Token word = null; }
+{
+  (
+      word=<CLFITTEXT>
+    | word=<CLFTSWIDTH>
+    | word=<CLNOWRAP>
+    | word=<CLWWIDTH>
+    | word=<TDFRMTXTBOTTOM>
+    | word=<TDFRMTXTLEFT>
+    | word=<TDFRMTXTRIGHT>
+    | word=<TDFRMTXTTOP>
+    | word=<TRFTSWIDTHA>
+    | word=<TRFTSWIDTHB>
+    | word=<TRFTSWIDTH>
+    | word=<TRWWIDTHA>
+    | word=<TRWWIDTHB>
+    | word=<TRWWIDTH>
+    | word=<SECTSPECIFYGENN>
+  ) { return word; }
+}
+
+// Forwards a control word that the grammar does not handle itself to the
+// delegate, with its numeric parameter (0 when none was given) and _where.
+void control_word() :
+{ Token word = null, val = null; }
+{
+  ( word=<CONTROL_WORD> | word=mixed_case_control_word() )
+  [ val=<CW_VAL> ]
+  {
+    int param = (val != null) ? Integer.parseInt(val.image) : 0;
+    _delegate.controlWord(word.image, param, _where);
+  }
+}
+
+// Handles the "u" control word: appends to buf the char whose code is the
+// control's numeric parameter.
+void u(StringBuffer buf) :
+{ Token val; }
+{
+  <U>
+  val=<CW_VAL>
+  {
+    int parsed = Integer.parseInt(val.image);
+    // RTF may write the value as a negative number; such values are
+    // corrected by adding 65536 before the cast to char
+    int codeUnit = parsed < 0 ? parsed + 65536 : parsed;
+    buf.append((char)codeUnit);
+  }
+}
+
+// Skips the fallback characters after a "u" control; returns leftover TEXT bytes or null.
+JAVACODE
+byte[] skip_after_unicode() throws UnsupportedEncodingException {
+  for (int skip = getCurrentSkipState().intValue(); skip > 0; ) { // negative: skip nothing
+    Token tok = getNextToken();
+    switch (tok.kind) {
+    case HEX_CHAR:
+      skip--; // one hex escape is one skipped character
+      break;
+    case TEXT:
+      if (tok.image.length() > skip) { // surplus text is handed back to the caller
+        byte[] tmp = stringToBytes(tok.image);
+        byte[] raw = new byte[tmp.length - skip];
+        System.arraycopy(tmp, skip, raw, 0, raw.length);
+        return raw;
+      }
+      skip -= tok.image.length(); // count every consumed character, not just one
+      break;
+    default:
+      throw new IllegalStateException("unexpected token while skipping");
+    }
+  }
+  return null;
+}
+
+// Handles the "uc" control word: stores its parameter as the current skip state.
+void uc() :
+{
+  Token val;
+}
+{
+  <UC> val=<CW_VAL>
+  {
+    // CW_VAL may carry a minus sign; a negative skip count is clamped to 0
+    setCurrentSkipState(new Integer(Math.max(0, Integer.parseInt(val.image))));
+  }
+}
+
+// Handles "fcharset": inside the font table, maps the current font number
+// to the document encoding; elsewhere the control is ignored.
+void fcharset() :
+{ Token val = null; }
+{
+  <FCHARSET> val=<CW_VAL>
+  {
+    // still parsed, but no longer used to pick the encoding (see below)
+    int charset = (val == null) ? 0 : Integer.parseInt(val.image);
+    if (_where == IN_FONTTBL) {
+      // Modified: always use _documentEncoding
+      _fontEncodingMap.put(new Integer(_currentFontValue),
+                           /*CHARSET_ENCODING_TABLE[charset]*/_documentEncoding);
+    } else {
+      // this shouldn't happen -- forward onto delegate?
+    }
+  }
+}
+
+// Consumes "deff" and its parameter.  The default font is not recorded at
+// present; the assignment below is kept commented out.
+void deff() :
+{ Token val = null; }
+{
+  <DEFF>
+  val=<CW_VAL>
+  {
+    // _defaultFont = null == val ? 0 : Integer.parseInt(val.image);
+    // need to figure out if this really has to be handled.
+  }
+}
+
+// Handles "f": in the font table it selects the font being defined; in the
+// document body it switches the current encoding to that font's encoding.
+void f() :
+{ Token val = null; }
+{
+  <F> val=<CW_VAL>
+  {
+    int font = (val == null) ? 0 : Integer.parseInt(val.image);
+    if (_where == IN_FONTTBL) {
+      _currentFontValue = font;
+    } else if (_where == IN_DOCUMENT) {
+      String encoding = (String)_fontEncodingMap.get(new Integer(font));
+      // fonts without a recorded encoding fall back to DEFAULT_ENCODING
+      setCurrentEncoding(encoding != null ? encoding : DEFAULT_ENCODING);
+    } else {
+      // consume this font event
+    }
+  }
+}
+
+// Handles "cs": in the stylesheet it selects the style being defined; in the
+// document body it makes the style stored under that number current.
+void cs() :
+{ Token val = null; }
+{
+  <CS> val=<CW_VAL>
+  {
+    int style = (val == null) ? 0 : Integer.parseInt(val.image);
+    if (_where == IN_STYLESHEET) {
+      _currentStyleValue = style;
+    } else if (_where == IN_DOCUMENT) {
+      // the looked-up name is passed on as-is, even if the map has no entry
+      setCurrentStyle((String)_styleMap.get(new Integer(style)));
+    } else {
+      // consume this style event
+    }
+  }
+}
+
+// On "plain", the current style is set to NO_STYLE.
+void plain() :
+{}
+{
+  <PLAIN> { setCurrentStyle(NO_STYLE); }
+}
+
+
+/* these productions identify the document encoding; note that they
+ * are almost always clobbered by an \ansicpg or by unicode characters */
+// Sets the document encoding from the charset keyword (pc, pca, mac or ansi).
+void document_charset() :
+{}
+{
+  (
+      <PC>   { setDocumentEncoding(getJavaEncoding(437)); }
+    | <PCA>  { setDocumentEncoding(getJavaEncoding(850)); }
+    | <MAC>  { setDocumentEncoding("MacRoman"); }
+    | <ANSI> { setDocumentEncoding(getJavaEncoding(1252)); }
+  )
+}
+
+/* specifies the ANSI codepage to use as the document's encoding. Subject
+ * to local overrides. */
+// Handles "ansicpg": looks up the Java encoding for the given code page and
+// installs it as both the document encoding and the current encoding.
+void ansicpg() :
+{ Token val = null; }
+{
+  <ANSICPG> val=<CW_VAL>
+  {
+    // must be a value in the map - we should throw if it isn't there.
+    int codePage = (val == null) ? 0 : Integer.parseInt(val.image);
+    setDocumentEncoding(getJavaEncoding(codePage));
+    // Modified: the current encoding is switched as well
+    setCurrentEncoding(getJavaEncoding(codePage));
+  }
+}
+
+// TODO: consider collecting special characters in a buffer
+
+// A brace-delimited group: lbrace(), one or more of the items below, rbrace().
+void group() throws UnsupportedEncodingException :
+{}
+{
+  lbrace()
+  (
+      table_declaration() // fonttbl, filetbl, info, stylesheet, etc.
+    | uc()
+    | f()
+    | fcharset()
+    | cs()
+    | plain()
+    | control_word() // catch-all for the controls that the grammar
+                     // does not handle explicitly.
+    | control_symbol()
+    | group()
+    | text()
+  )+
+  rbrace()
+}
+
+// Top-level production: the outermost group, bracketed by start/endDocument().
+void document() throws UnsupportedEncodingException :
+{}
+{
+  { _delegate.startDocument(); }
+  lbrace()
+  <RTF> <CW_VAL>
+  document_charset()
+
+  // Maddeningly, Word behaves inconsistently with respect to
+  // the placement of these next two productions, even though the
+  // RTF spec is quite clear on the subject.  So, we are forced
+  // to put them into this "anything goes" grouping, even though
+  // ansicpg() and deff() should appear 0..1 times at the beginning
+  // of the document.
+  (
+      uc()
+    | ansicpg()
+    | deff()
+  )*
+  (
+      uc()
+    | f()
+    | cs()
+    | plain()
+    | control_word() // catch-all for the controls that the grammar
+                     // does not handle explicitly.
+    | control_symbol()
+    | group()
+    | text()
+  )+
+  rbrace()
+  //<EOF>               // Modified: commented out to tolerate malformed RTF files
+  { _delegate.endDocument(); }
+}
Index: src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParseFactory.java
===================================================================
--- src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParseFactory.java	(revision 747309)
+++ src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParseFactory.java	(working copy)
@@ -28,18 +28,13 @@
 import org.apache.nutch.metadata.DublinCore;
 import org.apache.nutch.metadata.Metadata;
 import org.apache.nutch.parse.OutlinkExtractor;
-import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseResult;
 import org.apache.nutch.parse.ParseData;
 import org.apache.nutch.parse.ParseImpl;
 import org.apache.nutch.parse.ParseStatus;
 import org.apache.nutch.parse.Parser;
 import org.apache.nutch.protocol.Content;
 
-// RTF Parser imports
-import com.etranslate.tm.processing.rtf.ParseException;
-import com.etranslate.tm.processing.rtf.RTFParser;
-
-
 /**
  * A parser for RTF documents
  * 
@@ -49,7 +44,7 @@
 
   private Configuration conf;
 
-  public Parse getParse(Content content) {
+  public ParseResult getParse(Content content) {
     byte[] raw = content.getContent();
     Reader reader = new InputStreamReader(new ByteArrayInputStream(raw));
     RTFParserDelegateImpl delegate = new RTFParserDelegateImpl();
@@ -61,9 +56,8 @@
     try {
       rtfParser.parse();
     } catch (ParseException e) {
-        return new ParseStatus(ParseStatus.FAILED,
-                               ParseStatus.FAILED_EXCEPTION,
-                               e.toString()).getEmptyParse(conf);
+      return new ParseStatus(ParseStatus.FAILED, "Can't be handled as RTF document. " + e)
+                 .getEmptyParseResult(content.getUrl(), conf);
     }
 
     Metadata metadata = new Metadata();
@@ -78,13 +72,14 @@
 
     String text = delegate.getText();
 
-    return new ParseImpl(text,
-                         new ParseData(ParseStatus.STATUS_SUCCESS,
-                                       title,
-                                       OutlinkExtractor
-        .                              getOutlinks(text, this.conf),
-                                       content.getMetadata(),
-                                       metadata));
+    ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS,
+                                        title,
+                                        OutlinkExtractor.getOutlinks(text, this.conf),
+                                        content.getMetadata(),
+                                        metadata);
+                                       
+    return ParseResult.createParseResult(content.getUrl(), 
+                                         new ParseImpl(text, parseData));
   }
 
   public void setConf(Configuration conf) {
Index: src/plugin/parse-rtf/build.xml
===================================================================
--- src/plugin/parse-rtf/build.xml	(revision 747281)
+++ src/plugin/parse-rtf/build.xml	(working copy)
@@ -31,35 +31,10 @@
   </target>
   
   
-  <property name="rtf-src.jar" value="tmp/rtf_parser_src.jar"/>
-  <property name="rtf-parser.jar" value="lib/rtf-parser.jar"/>
-   
-  <available file="${rtf-src.jar}" property="rtf-src.jar.available"/>
-  <available file="${rtf-parser.jar}" property="rtf-parser.available"/>
-   
-  <target name="download-rtf-src" unless="rtf-src.jar.available">
-      <mkdir dir="tmp"/>
-    <get src="http://www.cobase.cs.ucla.edu/pub/javacc/rtf_parser_src.jar"
-              dest="tmp/rtf_parser_src.jar"/>    
-  </target>
-   
-  <target name="build-rtf-parser" 
-          depends="download-rtf-src"
-          unless="rtf-parser.available">
-      <unjar src="${rtf-src.jar}"
-            dest="tmp"/>
-      <javacc target="tmp/rtf/RTFParser.jj"
+  <target name="build-rtf-parser">
+      <javacc target="src/java/org/apache/nutch/parse/rtf/RTFParser.jj"
         javacchome="${javacc.home}">
       </javacc>
-   
-      <mkdir dir="tmp/classes"/>
-      <javac srcdir="tmp" destdir="tmp/classes"/>
-      <jar destfile="${rtf-parser.jar}" basedir="tmp/classes"/>
-        <delete>
-          <fileset dir="tmp">
-            <exclude name="*.jar"/>
-          </fileset>
-        </delete>
   </target>
    
   <target name="init-plugin" depends="build-rtf-parser">