Index: conf/nutch-default.xml
===================================================================
--- conf/nutch-default.xml	(revision 179145)
+++ conf/nutch-default.xml	(working copy)
@@ -578,7 +578,7 @@
 
 <property>
   <name>plugin.includes</name>
-  <value>protocol-http|urlfilter-regex|parse-(text|html)|index-basic|query-(basic|site|url)</value>
+  <value>protocol-httpclient|urlfilter-regex|parse-(text|html|js)|index-basic|query-(basic|site|url)</value>
   <description>Regular expression naming plugin directory names to
   include.  Any plugin not matching this expression is excluded.  By
   default Nutch includes crawling just HTML and plain text via HTTP,
@@ -600,6 +600,14 @@
   is available</description>
 </property>
 
+<property>
+  <name>parser.html.impl</name>
+  <value>neko</value>
+  <description>HTML Parser implementation. Currently the following keywords
+  are recognized: "neko" uses NekoHTML, "tagsoup" uses TagSoup.
+  </description>
+</property>
+
 <!-- urlfilter plugin properties -->
 
 <property>
Index: src/test/org/apache/nutch/fetcher/TestFetcherOutput.java
===================================================================
--- src/test/org/apache/nutch/fetcher/TestFetcherOutput.java	(revision 179145)
+++ src/test/org/apache/nutch/fetcher/TestFetcherOutput.java	(working copy)
@@ -19,6 +19,8 @@
 import java.io.*;
 import org.apache.nutch.io.*;
 import org.apache.nutch.pagedb.*;
+import org.apache.nutch.protocol.ProtocolStatus;
+
 import junit.framework.TestCase;
 
 /** Unit tests for FetcherOutput. */
@@ -32,7 +34,7 @@
     FetcherOutput o =
       new FetcherOutput(new FetchListEntry(true, TestPage.getTestPage(),
                                            anchors),
-                        TestMD5Hash.getTestHash(), FetcherOutput.SUCCESS);
+                        TestMD5Hash.getTestHash(), ProtocolStatus.STATUS_SUCCESS);
                         
     TestWritable.testWritable(o);
 
Index: src/test/org/apache/nutch/tools/TestSegmentMergeTool.java
===================================================================
--- src/test/org/apache/nutch/tools/TestSegmentMergeTool.java	(revision 179145)
+++ src/test/org/apache/nutch/tools/TestSegmentMergeTool.java	(working copy)
@@ -22,17 +22,17 @@
 
 import org.apache.nutch.db.Page;
 import org.apache.nutch.fetcher.FetcherOutput;
-import org.apache.nutch.io.ArrayFile;
 import org.apache.nutch.io.MD5Hash;
 import org.apache.nutch.fs.*;
 import org.apache.nutch.segment.SegmentReader;
 import org.apache.nutch.segment.SegmentWriter;
-import org.apache.nutch.util.*;
 import org.apache.nutch.pagedb.FetchListEntry;
 import org.apache.nutch.parse.Outlink;
 import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.parse.ParseStatus;
 import org.apache.nutch.parse.ParseText;
 import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ProtocolStatus;
 
 import junit.framework.TestCase;
 
@@ -81,7 +81,7 @@
       }
       url += "/example.html";
       FetchListEntry fle = new FetchListEntry(true, new Page(url, 1.0f), new String[] { "test" + rnd });
-      FetcherOutput fo = new FetcherOutput(fle, MD5Hash.digest(url), FetcherOutput.SUCCESS);
+      FetcherOutput fo = new FetcherOutput(fle, MD5Hash.digest(url), ProtocolStatus.STATUS_SUCCESS);
       StringBuffer content = new StringBuffer("<html><body><h1>Hello from Page " + i + "</h1>");
       if (unique) {
         content.append("<p>Created at epoch time: " + System.currentTimeMillis() + ", " + r.nextLong() + "</p>");
@@ -95,7 +95,7 @@
       meta.setProperty("Host", "http://localhost");
       meta.setProperty("Connection", "Keep-alive, close");
       Content co = new Content(url, "http://www.example.com", content.toString().getBytes("UTF-8"), "text/html", meta);
-      ParseData pd = new ParseData("Hello from Page " + i, new Outlink[0], meta);
+      ParseData pd = new ParseData(ParseStatus.STATUS_SUCCESS, "Hello from Page " + i, new Outlink[0], meta);
       StringBuffer text = new StringBuffer("Hello from Page" + i);
       if (unique) {
         text.append("\nCreated at epoch time: " + System.currentTimeMillis() + ", " + r.nextLong());
Index: src/test/org/apache/nutch/parse/TestParseData.java
===================================================================
--- src/test/org/apache/nutch/parse/TestParseData.java	(revision 179145)
+++ src/test/org/apache/nutch/parse/TestParseData.java	(working copy)
@@ -40,7 +40,7 @@
     metaData.put("Language", "en/us");
     metaData.put("Charset", "UTF-8");
 
-    ParseData r = new ParseData(title, outlinks, metaData);
+    ParseData r = new ParseData(ParseStatus.STATUS_SUCCESS, title, outlinks, metaData);
                         
     TestWritable.testWritable(r);
   }
Index: src/java/org/apache/nutch/fetcher/Fetcher.java
===================================================================
--- src/java/org/apache/nutch/fetcher/Fetcher.java	(revision 179145)
+++ src/java/org/apache/nutch/fetcher/Fetcher.java	(working copy)
@@ -20,6 +20,7 @@
 import java.io.File;
 import java.util.Properties;
 
+import org.apache.nutch.net.URLFilters;
 import org.apache.nutch.pagedb.FetchListEntry;
 import org.apache.nutch.io.*;
 import org.apache.nutch.db.*;
@@ -68,6 +69,10 @@
 
   private int threadCount =                       // max number of threads
     NutchConf.get().getInt("fetcher.threads.fetch", 10);
+  private static final float NEW_INJECTED_PAGE_SCORE =
+    NutchConf.get().getFloat("db.score.injected", 2.0f);
+  private static final int MAX_REDIRECT =
+    NutchConf.get().getInt("http.redirect.max", 3);
 
   // All threads (FetcherThread or thread started by it) belong to
   // group "fetcher". Each FetcherThread is named as "fetcherXX",
@@ -110,45 +115,84 @@
           if (!fle.getFetch()) {                  // should we fetch this page?
             if (LOG.isLoggable(Level.FINE))
               LOG.fine("not fetching " + url);
-            handleNoFetch(fle, FetcherOutput.SUCCESS);
+            handleNoFetch(fle, ProtocolStatus.STATUS_NOTFETCHING);
             continue;
           }
 
-          LOG.info("fetching " + url);            // fetch the page
-
-          Protocol protocol = ProtocolFactory.getProtocol(url);
-          Content content = protocol.getContent(url);
-
-          handleFetch(url, fle, content);
-
-          synchronized (Fetcher.this) {           // update status
-            pages++;
-            bytes += content.getContent().length;
-            if ((pages % 100) == 0) {             // show status every 100pp
-              status();
+          // support multiple redirects, if requested by protocol
+          // or content meta-tags (the latter requires running Fetcher
+          // in parsing mode). Protocol-level redirects take precedence over
+          // content-level redirects. Some plugins can handle redirects
+          // automatically, so that only the final success or failure will be
+          // shown here.
+          boolean refetch = false;
+          int redirCnt = 0;
+          do {
+            LOG.fine("redirCnt=" + redirCnt);
+            refetch = false;
+            LOG.info("fetching " + url);            // fetch the page
+            Protocol protocol = ProtocolFactory.getProtocol(url);
+            ProtocolOutput output = protocol.getProtocolOutput(fle);
+            ProtocolStatus pstat = output.getStatus();
+            Content content = output.getContent();
+            switch(pstat.getCode()) {
+              case ProtocolStatus.SUCCESS:
+                if (content != null) {
+                  synchronized (Fetcher.this) {           // update status
+                    pages++;
+                    bytes += content.getContent().length;
+                    if ((pages % 100) == 0) {             // show status every 100pp
+                      status();
+                    }
+                  }
+                  ParseStatus ps = handleFetch(url, fle, output);
+                  if (ps != null && ps.getMinorCode() == ParseStatus.SUCCESS_REDIRECT) {
+                    url = ps.getMessage();
+                    url = URLFilters.filter(url);
+                    if (url != null) {
+                      refetch = true;
+                      redirCnt++;
+                      fle = new FetchListEntry(true, new Page(url, NEW_INJECTED_PAGE_SCORE), new String[0]);
+                      LOG.info(" - content redirect to " + url);
+                    }
+                  }
+                }
+                break;
+              case ProtocolStatus.MOVED: // try to redirect immediately
+              case ProtocolStatus.TEMP_MOVED: // try to redirect immediately
+                // record the redirect. perhaps the DB will want to know this.
+                handleNoFetch(fle, pstat);
+                url = pstat.getMessage();
+                if (url != null) url = URLFilters.filter(url); // same filtering as content-level redirects
+                if (url != null) {                // create new entry for the accepted redirect target
+                  refetch = true;
+                  redirCnt++;
+                  fle = new FetchListEntry(true, new Page(url, NEW_INJECTED_PAGE_SCORE), new String[0]);
+                  LOG.info(" - protocol redirect to " + url);
+                }
+                break;
+              case ProtocolStatus.GONE:
+              case ProtocolStatus.NOT_FOUND:
+              case ProtocolStatus.ACCESS_DENIED:
+              case ProtocolStatus.ROBOTS_DENIED:
+              case ProtocolStatus.RETRY:
+              case ProtocolStatus.NOTMODIFIED:
+                handleNoFetch(fle, pstat);
+                break;
+              case ProtocolStatus.EXCEPTION:
+                logError(url, fle, new Exception(pstat.getMessage()));                // retry?
+                handleNoFetch(fle, pstat);
+              break;
+              default:
+                LOG.warning("Unknown ProtocolStatus: " + pstat.getCode());
+                handleNoFetch(fle, pstat);
             }
-          }
-        } catch (ResourceGone e) {                // don't retry
-          logError(url, fle, e);
-          handleNoFetch(fle, FetcherOutput.NOT_FOUND);
+          } while (refetch && (redirCnt < MAX_REDIRECT));
 
-        // dealt with in handleFetch() below
-        //} catch (ParseException e) {              // don't retry
-        //  logError(url, fle, e);
-        //  handleNoFetch(fle, FetcherOutput.CANT_PARSE);
-
-        } catch (RetryLater e) {                  // explicit retry
-          logError(url, fle, e);
-          handleNoFetch(fle, FetcherOutput.RETRY);
-
-        } catch (ProtocolException e) {           // implicit retry
-          logError(url, fle, e);
-          handleNoFetch(fle, FetcherOutput.RETRY);
-
         } catch (Throwable t) {                   // an unchecked exception
           if (fle != null) {
             logError(url, fle, t);                // retry?
-            handleNoFetch(fle, FetcherOutput.RETRY);
+            handleNoFetch(fle, new ProtocolStatus(t));
           }
         }
       }
@@ -176,36 +220,44 @@
       }
     }
 
-    private void handleFetch(String url, FetchListEntry fle, Content content) {
+    private ParseStatus handleFetch(String url, FetchListEntry fle, ProtocolOutput output) {
+      Content content = output.getContent();
+      ProtocolStatus protocolStatus = output.getStatus();
       if (!Fetcher.this.parsing) {
         outputPage(new FetcherOutput(fle, MD5Hash.digest(content.getContent()),
-                                    FetcherOutput.SUCCESS),
+                protocolStatus),
                 content, null, null);
-        return;
+        return null;
       }
 
-      try {
         String contentType = content.getContentType();
-        Parser parser = ParserFactory.getParser(contentType, url);
-        Parse parse = parser.getParse(content);
-        outputPage(new FetcherOutput(fle, MD5Hash.digest(content.getContent()),
-                                    FetcherOutput.SUCCESS),
-                content, new ParseText(parse.getText()), parse.getData());
-      } catch (ParseException e) {
-        // 20041026, xing
-        // If fetching succeeds, but parsing fails, content should be saved
-        // so that we can try to parse again in separate pass, possibly
-        // using better/alternative parser.
-        LOG.info("fetch okay, but can't parse " + url + ", reason: "
-          + e.getMessage());
-        outputPage(new FetcherOutput(fle, MD5Hash.digest(content.getContent()),
-                                    FetcherOutput.CANT_PARSE),
-                content, new ParseText(""),
-                new ParseData("", new Outlink[0], new Properties()));
-      }
+        Parser parser = null;
+        Parse parse = null;
+        ParseStatus status = null;
+        try {
+          parser = ParserFactory.getParser(contentType, url);
+          parse = parser.getParse(content);
+          status = parse.getData().getStatus();
+        } catch (Exception e) {                   // failure is carried in the parse status below
+          LOG.log(Level.WARNING, "error parsing " + url, e);
+          status = new ParseStatus(e);
+        }
+        if (status.isSuccess()) {
+          outputPage(new FetcherOutput(fle, MD5Hash.digest(content.getContent()),
+                  protocolStatus),
+                  content, new ParseText(parse.getText()), parse.getData());
+        } else {
+          LOG.info("fetch okay, but can't parse " + url + ", reason: "
+                  + status.toString());
+          outputPage(new FetcherOutput(fle, MD5Hash.digest(content.getContent()),
+                  protocolStatus),
+                  content, new ParseText(""),
+                  new ParseData(status, "", new Outlink[0], new Properties()));
+        }
+        return status;
     }
 
-    private void handleNoFetch(FetchListEntry fle, int status) {
+    private void handleNoFetch(FetchListEntry fle, ProtocolStatus status) {
       String url = fle.getPage().getURL().toString();
       MD5Hash hash = MD5Hash.digest(url);
 
@@ -213,7 +265,7 @@
         outputPage(new FetcherOutput(fle, hash, status),
                    new Content(url, url, new byte[0], "", new Properties()),
                    new ParseText(""),
-                   new ParseData("", new Outlink[0], new Properties()));
+                   new ParseData(ParseStatus.STATUS_NOTPARSED, "", new Outlink[0], new Properties()));
       } else {
         outputPage(new FetcherOutput(fle, hash, status),
                    new Content(url, url, new byte[0], "", new Properties()),
@@ -234,6 +286,7 @@
         }
       } catch (Throwable t) {
         LOG.severe("error writing output:" + t.toString());
+        t.printStackTrace();
       }
     }
                                        
@@ -429,7 +482,7 @@
     }
 
     // set log level
-    fetcher.setLogLevel(Level.parse(logLevel.toUpperCase()));
+    setLogLevel(Level.parse(logLevel.toUpperCase()));
 
     if (showThreadID) {
       LogFormatter.setShowThreadIDs(showThreadID);
Index: src/java/org/apache/nutch/fetcher/FetcherOutput.java
===================================================================
--- src/java/org/apache/nutch/fetcher/FetcherOutput.java	(revision 179145)
+++ src/java/org/apache/nutch/fetcher/FetcherOutput.java	(working copy)
@@ -26,6 +26,8 @@
 import org.apache.nutch.pagedb.FetchListEntry;
 import org.apache.nutch.tools.UpdateDatabaseTool;
 import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.ParseStatus;
+import org.apache.nutch.protocol.ProtocolStatus;
 
 /*********************************************
  * An entry in the fetcher's output.  This includes all of the fetcher output
@@ -50,25 +52,34 @@
   public static final String DONE_NAME = "fetcher.done";
   public static final String ERROR_NAME = "fetcher.error";
 
-  private final static byte VERSION = 4;
+  private final static byte VERSION = 5;
 
-  public final static byte RETRY = 0;
-  public final static byte SUCCESS = 1;
-  public final static byte NOT_FOUND = 2;
-  public final static byte CANT_PARSE = 4; // fetched, but can't be parsed
+  // backwards compatibility codes
+  private final static byte RETRY = 0;
+  private final static byte SUCCESS = 1;
+  private final static byte NOT_FOUND = 2;
+  private final static byte CANT_PARSE = 4; // fetched, but can't be parsed
+  
+  private static final byte[] oldToNewMap = {
+          ProtocolStatus.RETRY,
+          ProtocolStatus.SUCCESS,
+          ProtocolStatus.NOT_FOUND,
+          ProtocolStatus.FAILED,
+          ProtocolStatus.RETRY
+  };
 
   private FetchListEntry fetchListEntry;
   private MD5Hash md5Hash;
-  private int status;
+  private ProtocolStatus protocolStatus;
   private long fetchDate;
 
   public FetcherOutput() {}
 
   public FetcherOutput(FetchListEntry fetchListEntry,
-                       MD5Hash md5Hash, int status) {
+                       MD5Hash md5Hash, ProtocolStatus protocolStatus) {
     this.fetchListEntry = fetchListEntry;
     this.md5Hash = md5Hash;
-    this.status = status;
+    this.protocolStatus = protocolStatus;
     this.fetchDate = System.currentTimeMillis();
   }
 
@@ -78,7 +89,12 @@
     byte version = in.readByte();                 // read version
     fetchListEntry = FetchListEntry.read(in);
     md5Hash = MD5Hash.read(in);
-    status = in.readByte();
+    if (version < 5) {
+      int status = in.readByte();
+      protocolStatus = new ProtocolStatus(oldToNewMap[status]);
+    } else {
+      protocolStatus = ProtocolStatus.read(in);
+    }
 
     if (version < 4) {
       UTF8.readString(in);                        // read & ignore title
@@ -95,7 +111,7 @@
     out.writeByte(VERSION);                       // store current version
     fetchListEntry.write(out);
     md5Hash.write(out);
-    out.writeByte(status);
+    protocolStatus.write(out);
     out.writeLong(fetchDate);
   }
 
@@ -110,8 +126,8 @@
   //
   public FetchListEntry getFetchListEntry() { return fetchListEntry; }
   public MD5Hash getMD5Hash() { return md5Hash; }
-  public int getStatus() { return status; }
-  public void setStatus(int status) { this.status = status; }
+  public ProtocolStatus getProtocolStatus() { return protocolStatus; }
+  public void setProtocolStatus(ProtocolStatus protocolStatus) { this.protocolStatus = protocolStatus; }
   public long getFetchDate() { return fetchDate; }
   public void setFetchDate(long fetchDate) { this.fetchDate = fetchDate; }
 
@@ -126,7 +142,7 @@
     return
       this.fetchListEntry.equals(other.fetchListEntry) &&
       this.md5Hash.equals(other.md5Hash) &&
-      (this.status == other.status);
+      this.protocolStatus.equals(other.protocolStatus);
   }
 
 
@@ -134,7 +150,7 @@
     StringBuffer buffer = new StringBuffer();
     buffer.append("FetchListEntry: " + fetchListEntry + "Fetch Result:\n" );
     buffer.append("MD5Hash: " + md5Hash + "\n" );
-    buffer.append("Status: " + status + "\n" );
+    buffer.append("ProtocolStatus: " + protocolStatus + "\n" );
     buffer.append("FetchDate: " + new Date(fetchDate) + "\n" );
     return buffer.toString();
   }
Index: src/java/org/apache/nutch/indexer/IndexSegment.java
===================================================================
--- src/java/org/apache/nutch/indexer/IndexSegment.java	(revision 179145)
+++ src/java/org/apache/nutch/indexer/IndexSegment.java	(working copy)
@@ -134,7 +134,7 @@
             if (!sr.next(fetcherOutput, null, parseText, parseData)) continue;
 
               // only index the page if it was fetched correctly
-              if (fetcherOutput.getStatus() != FetcherOutput.SUCCESS) {
+              if (!fetcherOutput.getProtocolStatus().isSuccess()) {
                   continue;                              
               }
 
Index: src/java/org/apache/nutch/tools/UpdateDatabaseTool.java
===================================================================
--- src/java/org/apache/nutch/tools/UpdateDatabaseTool.java	(revision 179145)
+++ src/java/org/apache/nutch/tools/UpdateDatabaseTool.java	(working copy)
@@ -29,6 +29,7 @@
 import org.apache.nutch.pagedb.*;
 import org.apache.nutch.fetcher.*;
 import org.apache.nutch.parse.*;
+import org.apache.nutch.protocol.ProtocolStatus;
 import org.apache.nutch.util.*;
 
 
@@ -108,14 +109,14 @@
             if (!fle.getFetch()) {                // didn't fetch
               pageContentsUnchanged(fo);          // treat as unchanged
 
-            } else if (fo.getStatus() == fo.SUCCESS) { // fetch succeed
+            } else if (fo.getProtocolStatus().isSuccess()) { // fetch succeed
               if (fo.getMD5Hash().equals(page.getMD5())) {
                 pageContentsUnchanged(fo);        // contents unchanged
               } else {
                 pageContentsChanged(fo, pd);      // contents changed
               }
 
-            } else if (fo.getStatus() == fo.RETRY &&
+            } else if (fo.getProtocolStatus().getCode() == ProtocolStatus.RETRY &&
                        page.getRetriesSinceFetch() < MAX_RETRIES) {
 
               pageRetry(fo);                      // retry later
Index: src/java/org/apache/nutch/tools/ParseSegment.java
===================================================================
--- src/java/org/apache/nutch/tools/ParseSegment.java	(revision 179145)
+++ src/java/org/apache/nutch/tools/ParseSegment.java	(working copy)
@@ -178,14 +178,15 @@
           // safe guard against mismatched files
           if (!url.equals(content.getUrl())) {
             LOG.severe("Mismatched entries under "
-              + FetcherOutput.DIR_NAME_NP + " and " + Content.DIR_NAME);
+              + FetcherOutput.DIR_NAME_NP + " (" + url +
+              ") and " + Content.DIR_NAME + " (" + content.getUrl() + ")");
             continue;
           }
 
           // if fetch was successful or
           // previously unable to parse (so try again)
-          if (fetcherOutput.getStatus() == FetcherOutput.SUCCESS ||
-              fetcherOutput.getStatus() == FetcherOutput.CANT_PARSE) {
+          ProtocolStatus ps = fetcherOutput.getProtocolStatus();
+          if (ps.isSuccess()) {
             handleContent(url, content);
             synchronized (ParseSegment.this) {
               pages++;                    // record successful parse
@@ -195,18 +196,18 @@
             }
           } else {
             // errored at fetch step
-            logError(url, new ProtocolException("Error at fetch stage"));
-            handleNoContent(ParserOutput.NOFETCH);
+            logError(url, new ProtocolException("Error at fetch stage: " + ps));
+            handleNoContent(new ParseStatus(ParseStatus.FAILED_MISSING_CONTENT));
           }
 
         } catch (ParseException e) {
           logError(url, e);
-          handleNoContent(ParserOutput.FAILURE);
+          handleNoContent(new ParseStatus(e));
 
         } catch (Throwable t) {                   // an unchecked exception
           if (fle != null) {
             logError(url, t);
-            handleNoContent(ParserOutput.UNKNOWN);
+            handleNoContent(new ParseStatus(t));
           } else {
             LOG.severe("Unexpected exception");
           }
@@ -238,27 +239,26 @@
       Parse parse = parser.getParse(content);
 
       outputPage
-        (new ParseText(parse.getText()), parse.getData(),ParserOutput.SUCCESS);
+        (new ParseText(parse.getText()), parse.getData());
     }
 
-    private void handleNoContent(int status) {
+    private void handleNoContent(ParseStatus status) {
       if (ParseSegment.this.dryRun) {
         LOG.info("To be handled as no content");
         return;
       }
       outputPage(new ParseText(""),
-                 new ParseData("", new Outlink[0], new Properties()),
-                 status);
+                 new ParseData(status, "", new Outlink[0], new Properties()));
     }
       
     private void outputPage
-      (ParseText parseText, ParseData parseData, int status) {
+      (ParseText parseText, ParseData parseData) {
       try {
         t3 = System.currentTimeMillis();
         synchronized (parserOutputWriter) {
           t4 = System.currentTimeMillis();
           parserOutputWriter.append(new LongWritable(myEntry),
-            new ParserOutput(parseData, parseText, status));
+            new ParserOutput(parseData, parseText));
           t5 = System.currentTimeMillis();
           if (LOG.isLoggable(Level.FINE))
             LOG.fine("Entry: "+myEntry
@@ -274,30 +274,21 @@
   }
 
   /**
-   * Inner class ParserOutput: ParseData + ParseText + status
+   * Inner class ParserOutput: ParseData + ParseText
    */
   private class ParserOutput extends VersionedWritable {
     public static final String DIR_NAME = "parser";
 
-    private final static byte VERSION = 1;
+    private final static byte VERSION = 2;
 
-    // could be more detailed
-    public final static byte UNKNOWN = (byte)0; // unknown problem in parsing
-    public final static byte SUCCESS = (byte)1; // parsing succeeded
-    public final static byte FAILURE = (byte)2; // parsing failed
-    public final static byte NOFETCH = (byte)3; // fetch was not a SUCCESS
-
-    private int status;
-
     private ParseData parseData = new ParseData();
     private ParseText parseText = new ParseText();
 
     public ParserOutput() {}
     
-    public ParserOutput(ParseData parseData, ParseText parseText, int status) {
+    public ParserOutput(ParseData parseData, ParseText parseText) {
       this.parseData = parseData;
       this.parseText = parseText;
-      this.status = status;
     }
 
     public byte getVersion() { return VERSION; }
@@ -310,13 +301,8 @@
       return this.parseText;
     }
 
-    public int getStatus() {
-      return this.status;
-    }
-
     public final void readFields(DataInput in) throws IOException {
       super.readFields(in);                         // check version
-      status = in.readByte();
       parseData.readFields(in);
       parseText.readFields(in);
       return;
@@ -324,7 +310,6 @@
 
     public final void write(DataOutput out) throws IOException {
       super.write(out);                             // write version
-      out.writeByte(status);
       parseData.write(out);
       parseText.write(out);
       return;
@@ -523,19 +508,6 @@
         if (fetcherNPReader.key() != key.get())
           throw new IOException("Mismatch between entries under "
             + FetcherOutput.DIR_NAME_NP + " and in " + sortedFile.getName());
-        // reset status in fo (FetcherOutput), using status in ParserOutput
-        switch (val.getStatus()) {
-        case ParserOutput.SUCCESS:
-          fo.setStatus(FetcherOutput.SUCCESS);
-          break;
-        case ParserOutput.UNKNOWN:
-        case ParserOutput.FAILURE:
-          fo.setStatus(FetcherOutput.CANT_PARSE);
-          break;
-        case ParserOutput.NOFETCH:
-        default:
-          // do not reset
-        }
         fetcherWriter.append(fo);
         parseDataWriter.append(val.getParseData());
         parseTextWriter.append(val.getParseText());
Index: src/java/org/apache/nutch/protocol/Protocol.java
===================================================================
--- src/java/org/apache/nutch/protocol/Protocol.java	(revision 179145)
+++ src/java/org/apache/nutch/protocol/Protocol.java	(working copy)
@@ -18,13 +18,21 @@
 
 import java.io.IOException;
 
+import org.apache.nutch.pagedb.FetchListEntry;
+
 /** A retriever of url content.  Implemented by protocol extensions. */
 public interface Protocol {
   /** The name of the extension point. */
   public final static String X_POINT_ID = Protocol.class.getName();
 
-  /** Returns the {@link Content} for a url.
+  /** Returns the {@link ProtocolOutput} for a url. This method may be
+   * more limited than {@link #getProtocolOutput(FetchListEntry)}.
-   * @throws IOException for any errors.
+   * No checked exception is declared; failures are reported via its status.
    */
-  Content getContent(String url) throws ProtocolException;
+  ProtocolOutput getProtocolOutput(String url);
+
+  /** Returns the {@link ProtocolOutput} for a fetchlist entry.
+   * No checked exception is declared; failures are reported via its status.
+   */
+  ProtocolOutput getProtocolOutput(FetchListEntry fle);
 }
Index: src/java/org/apache/nutch/parse/ParserChecker.java
===================================================================
--- src/java/org/apache/nutch/parse/ParserChecker.java	(revision 179145)
+++ src/java/org/apache/nutch/parse/ParserChecker.java	(working copy)
@@ -67,7 +67,7 @@
     LOG.info("fetching: "+url);
 
     Protocol protocol = ProtocolFactory.getProtocol(url);
-    Content content = protocol.getContent(url);
+    Content content = protocol.getProtocolOutput(url).getContent();
 
     if (force) {
       content.setContentType(contentType);
Index: src/java/org/apache/nutch/parse/ParseData.java
===================================================================
--- src/java/org/apache/nutch/parse/ParseData.java	(revision 179145)
+++ src/java/org/apache/nutch/parse/ParseData.java	(working copy)
@@ -21,7 +21,6 @@
 
 import org.apache.nutch.io.*;
 import org.apache.nutch.fs.*;
-import org.apache.nutch.util.*;
 import org.apache.nutch.tools.UpdateDatabaseTool;
 
 
@@ -31,15 +30,17 @@
 public final class ParseData extends VersionedWritable {
   public static final String DIR_NAME = "parse_data";
 
-  private final static byte VERSION = 1;
+  private final static byte VERSION = 2;
 
   private String title;
   private Outlink[] outlinks;
   private Properties metadata;
+  private ParseStatus status;
 
   public ParseData() {}
 
-  public ParseData(String title, Outlink[] outlinks, Properties metadata) {
+  public ParseData(ParseStatus status, String title, Outlink[] outlinks, Properties metadata) {
+    this.status = status;
     this.title = title;
     this.outlinks = outlinks;
     this.metadata = metadata;
@@ -49,6 +50,9 @@
   // Accessor methods
   //
 
+  /** The status of parsing the page. */
+  public ParseStatus getStatus() { return status; }
+  
   /** The title of the page. */
   public String getTitle() { return title; }
 
@@ -70,8 +74,12 @@
   public byte getVersion() { return VERSION; }
 
   public final void readFields(DataInput in) throws IOException {
-    super.readFields(in);                         // check version
 
+    byte version = in.readByte();                 // read & check version
+    if (version < 1 || version > VERSION)
+      throw new IOException("Unsupported ParseData version: " + version);
+    // version 1 records carry no status; treat them as successful parses
+    status = (version > 1) ? ParseStatus.read(in) : ParseStatus.STATUS_SUCCESS;
     title = UTF8.readString(in);                   // read title
 
     int totalOutlinks = in.readInt();             // read outlinks
@@ -94,8 +102,8 @@
   }
 
   public final void write(DataOutput out) throws IOException {
-    super.write(out);                             // write version
-
+    out.writeByte(VERSION);                             // write version
+    status.write(out);                       // write status
     UTF8.writeString(out, title);                 // write title
 
     out.writeInt(outlinks.length);                // write outlinks
@@ -127,6 +135,7 @@
       return false;
     ParseData other = (ParseData)o;
     return
+      this.status.equals(other.status) &&
       this.title.equals(other.title) &&
       Arrays.equals(this.outlinks, other.outlinks) &&
       this.metadata.equals(other.metadata);
@@ -135,6 +144,7 @@
   public String toString() {
     StringBuffer buffer = new StringBuffer();
 
+    buffer.append("Status: " + status + "\n" );
     buffer.append("Title: " + title + "\n" );
 
     buffer.append("Outlinks: " + outlinks.length + "\n" );
Index: src/java/org/apache/nutch/parse/HtmlParseFilter.java
===================================================================
--- src/java/org/apache/nutch/parse/HtmlParseFilter.java	(revision 179145)
+++ src/java/org/apache/nutch/parse/HtmlParseFilter.java	(working copy)
@@ -30,6 +30,5 @@
 
   /** Adds metadata or otherwise modifies a parse of HTML content, given
    * the DOM tree of a page. */
-  Parse filter(Content content, Parse parse, DocumentFragment doc)
-    throws ParseException;
+  Parse filter(Content content, Parse parse, HTMLMetaTags metaTags, DocumentFragment doc);
 }
Index: src/java/org/apache/nutch/parse/HtmlParseFilters.java
===================================================================
--- src/java/org/apache/nutch/parse/HtmlParseFilters.java	(revision 179145)
+++ src/java/org/apache/nutch/parse/HtmlParseFilters.java	(working copy)
@@ -45,11 +45,11 @@
   private  HtmlParseFilters() {}                  // no public ctor
 
   /** Run all defined filters. */
-  public static Parse filter(Content content,Parse parse,DocumentFragment doc)
-    throws ParseException {
+  public static Parse filter(Content content, Parse parse, HTMLMetaTags metaTags, DocumentFragment doc) {
 
     for (int i = 0 ; i < CACHE.length; i++) {
-      parse = CACHE[i].filter(content, parse, doc);
+      parse = CACHE[i].filter(content, parse, metaTags, doc);
+      if (!parse.getData().getStatus().isSuccess()) break;
     }
 
     return parse;
Index: src/java/org/apache/nutch/parse/Parse.java
===================================================================
--- src/java/org/apache/nutch/parse/Parse.java	(revision 179145)
+++ src/java/org/apache/nutch/parse/Parse.java	(working copy)
@@ -20,6 +20,7 @@
  * @see Parser#getParse(FetcherOutput,Content)
  */
 public interface Parse {
+  
   /** The textual content of the page. This is indexed, searched, and used when
    * generating snippets.*/ 
   String getText();
Index: src/java/org/apache/nutch/parse/Parser.java
===================================================================
--- src/java/org/apache/nutch/parse/Parser.java	(revision 179145)
+++ src/java/org/apache/nutch/parse/Parser.java	(working copy)
@@ -27,5 +27,5 @@
   public final static String X_POINT_ID = Parser.class.getName();
 
   /** Creates the parse for some content. */
-  Parse getParse(Content c) throws ParseException;
+  Parse getParse(Content c);
 }
Index: src/plugin/parse-pdf/src/test/org/apache/nutch/parse/pdf/TestPdfParser.java
===================================================================
--- src/plugin/parse-pdf/src/test/org/apache/nutch/parse/pdf/TestPdfParser.java	(revision 179145)
+++ src/plugin/parse-pdf/src/test/org/apache/nutch/parse/pdf/TestPdfParser.java	(working copy)
@@ -64,7 +64,7 @@
       urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
 
       protocol = ProtocolFactory.getProtocol(urlString);
-      content = protocol.getContent(urlString);
+      content = protocol.getProtocolOutput(urlString).getContent();
 
       parser = ParserFactory.getParser(content.getContentType(), urlString);
       parse = parser.getParse(content);
Index: src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java
===================================================================
--- src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java	(revision 179145)
+++ src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java	(working copy)
@@ -27,6 +27,7 @@
 
 import org.apache.nutch.protocol.Content;
 import org.apache.nutch.util.LogFormatter;
+import org.apache.nutch.parse.ParseStatus;
 import org.apache.nutch.parse.Parser;
 import org.apache.nutch.parse.Parse;
 import org.apache.nutch.parse.ParseData;
@@ -79,13 +80,13 @@
     rootLogger.addAppender(appender);
   }
 
-  public Parse getParse(Content content) throws ParseException {
+  public Parse getParse(Content content) {
 
     // check that contentType is one we can handle
     String contentType = content.getContentType();
     if (contentType != null && !contentType.startsWith("application/pdf"))
-      throw new ParseException(
-        "Content-Type not application/pdf: "+contentType);
+      return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_INVALID_FORMAT,
+        "Content-Type not application/pdf: " + contentType).getEmptyParse();
 
     // in memory representation of pdf file
     PDDocument pdf = null;
@@ -100,8 +101,9 @@
       String contentLength = content.get("Content-Length");
       if (contentLength != null
             && raw.length != Integer.parseInt(contentLength)) {
-          throw new ParseException("Content truncated at "+raw.length
-            +" bytes. Parser can't handle incomplete pdf file.");
+          return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_TRUNCATED,
+                  "Content truncated at "+raw.length
+            +" bytes. Parser can't handle incomplete pdf file.").getEmptyParse();
       }
 
       PDFParser parser = new PDFParser(
@@ -134,14 +136,15 @@
       // formatDate(info.getCreationDate())
       // formatDate(info.getModificationDate())
 
-    } catch (ParseException e) {
-      throw e;
     } catch (CryptographyException e) {
-      throw new ParseException("Error decrypting document. "+e);
+      return new ParseStatus(ParseStatus.FAILED,
+              "Error decrypting document. " + e).getEmptyParse();
     } catch (InvalidPasswordException e) {
-      throw new ParseException("Can't decrypt document. "+e);
+      return new ParseStatus(ParseStatus.FAILED,
+              "Can't decrypt document - invalid password. " + e).getEmptyParse();
     } catch (Exception e) { // run time exception
-      throw new ParseException("Can't be handled as pdf document. "+e);
+      return new ParseStatus(ParseStatus.FAILED,
+              "Can't be handled as pdf document. " + e).getEmptyParse();
     } finally {
       try {
         if (pdf != null)
@@ -164,7 +167,7 @@
     Properties metadata = new Properties();
     metadata.putAll(content.getMetadata()); // copy through
 
-    ParseData parseData = new ParseData(title, outlinks, metadata);
+    ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title, outlinks, metadata);
     return new ParseImpl(text, parseData);
     // any filter?
     //return HtmlParseFilters.filter(content, parse, root);
Index: src/plugin/parse-msword/src/test/org/apache/nutch/parse/msword/TestMSWordParser.java
===================================================================
--- src/plugin/parse-msword/src/test/org/apache/nutch/parse/msword/TestMSWordParser.java	(revision 179145)
+++ src/plugin/parse-msword/src/test/org/apache/nutch/parse/msword/TestMSWordParser.java	(working copy)
@@ -64,7 +64,7 @@
       urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
 
       protocol = ProtocolFactory.getProtocol(urlString);
-      content = protocol.getContent(urlString);
+      content = protocol.getProtocolOutput(urlString).getContent();
 
       parser = ParserFactory.getParser(content.getContentType(), urlString);
       parse = parser.getParse(content);
Index: src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/MSWordParser.java
===================================================================
--- src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/MSWordParser.java	(revision 179145)
+++ src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/MSWordParser.java	(working copy)
@@ -18,6 +18,7 @@
 
 import org.apache.nutch.protocol.Content;
 import org.apache.nutch.util.LogFormatter;
+import org.apache.nutch.parse.ParseStatus;
 import org.apache.nutch.parse.Parser;
 import org.apache.nutch.parse.Parse;
 import org.apache.nutch.parse.ParseData;
@@ -52,13 +53,13 @@
 
   public MSWordParser () {}
 
-  public Parse getParse(Content content) throws ParseException {
+  public Parse getParse(Content content) {
 
     // check that contentType is one we can handle
     String contentType = content.getContentType();
     if (contentType != null && !contentType.startsWith("application/msword"))
-      throw new ParseException(
-        "Content-Type not application/msword: "+contentType);
+      return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_INVALID_FORMAT,
+        "Content-Type not application/msword: " + contentType).getEmptyParse();
 
     String text = null;
     String title = null;
@@ -71,8 +72,9 @@
       String contentLength = content.get("Content-Length");
       if (contentLength != null
             && raw.length != Integer.parseInt(contentLength)) {
-          throw new ParseException("Content truncated at "+raw.length
-            +" bytes. Parser can't handle incomplete msword file.");
+          return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_TRUNCATED,
+                  "Content truncated at " + raw.length
+            +" bytes. Parser can't handle incomplete msword file.").getEmptyParse();
       }
 
       WordExtractor extractor = new WordExtractor();
@@ -86,13 +88,14 @@
       extractor = null;
 
     } catch (ParseException e) {
-      throw e;
+      return new ParseStatus(e).getEmptyParse();
     } catch (FastSavedException e) {
-      throw new ParseException(e);
+      return new ParseStatus(e).getEmptyParse();
     } catch (PasswordProtectedException e) {
-      throw new ParseException(e);
+      return new ParseStatus(e).getEmptyParse();
     } catch (Exception e) { // run time exception
-      throw new ParseException("Can't be handled as msword document. "+e);
+      return new ParseStatus(ParseStatus.FAILED,
+              "Can't be handled as msword document. " + e).getEmptyParse();
     } finally {
       // nothing so far
     }
@@ -116,7 +119,7 @@
     // collect outlink
     Outlink[] outlinks = new Outlink[0];
 
-    ParseData parseData = new ParseData(title, outlinks, metadata);
+    ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title, outlinks, metadata);
     return new ParseImpl(text, parseData);
     // any filter?
     //return HtmlParseFilters.filter(content, parse, root);
Index: src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java
===================================================================
--- src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java	(revision 179145)
+++ src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java	(working copy)
@@ -14,9 +14,9 @@
  * limitations under the License.
  */
 package org.apache.nutch.analysis.lang;
+import org.apache.nutch.parse.HTMLMetaTags;
 import org.apache.nutch.parse.Parse;
 import org.apache.nutch.parse.HtmlParseFilter;
-import org.apache.nutch.parse.ParseException;
 import org.apache.nutch.protocol.Content;
 import org.w3c.dom.*;
 
@@ -38,8 +38,7 @@
    * <li>3. meta http-equiv (content-language) (http://www.w3.org/TR/REC-html40/struct/global.html#h-7.4.4.2)
    * <br>Only the first occurence of language is stored.
    */
-  public Parse filter(Content content, Parse parse, DocumentFragment doc)
-    throws ParseException {
+  public Parse filter(Content content, Parse parse, HTMLMetaTags metaTags, DocumentFragment doc) {
     String lang = findLanguage(doc);
 
     if (lang != null) {
Index: src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java
===================================================================
--- src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java	(revision 179145)
+++ src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java	(working copy)
@@ -239,7 +239,7 @@
     Protocol protocol;
     try {
       protocol = ProtocolFactory.getProtocol(url);
-      Content content = protocol.getContent(url);
+      Content content = protocol.getProtocolOutput(url).getContent();
       String contentType = content.getContentType();
       Parser parser = ParserFactory.getParser(contentType, url);
       Parse parse = parser.getParse(content);
Index: src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java
===================================================================
--- src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java	(revision 179145)
+++ src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java	(working copy)
@@ -19,22 +19,24 @@
 
 import org.apache.commons.net.ftp.FTPFileEntryParser;
 
+import org.apache.nutch.db.Page;
 import org.apache.nutch.net.protocols.HttpDateFormat;
 
 import org.apache.nutch.util.LogFormatter;
 import org.apache.nutch.util.NutchConf;
 
+import org.apache.nutch.pagedb.FetchListEntry;
 import org.apache.nutch.protocol.Content;
 import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.ProtocolOutput;
+import org.apache.nutch.protocol.ProtocolStatus;
 
 import java.util.logging.Level;
 import java.util.logging.Logger;
 
+import java.net.MalformedURLException;
 import java.net.URL;
 
-import java.io.InputStream;
-// 20040528, xing, disabled for now
-//import java.io.Reader;
 import java.io.IOException;
 
 /************************************
@@ -91,13 +93,13 @@
   }
 
   /** Set the timeout. */
-  public void setTimeout(int timeout) {
-    this.timeout = timeout;
+  public void setTimeout(int to) {
+    timeout = to;
   }
 
   /** Set the point at which content is truncated. */
   public void setMaxContentLength(int length) {
-    this.maxContentLength = length;
+    maxContentLength = length;
   }
 
   /** Set followTalk */
@@ -110,8 +112,19 @@
     this.keepConnection = keepConnection;
   }
 
-  public Content getContent(String urlString) throws FtpException {
+  public ProtocolOutput getProtocolOutput(String urlString) {
+    // Wrap the bare URL in a fetch-list entry and delegate; a bad URL becomes a status, not a throw.
     try {
+      return getProtocolOutput(new FetchListEntry(true,
+            new Page(urlString, 1.0f), new String[0]));
+    } catch (MalformedURLException mue) {
+      return new ProtocolOutput(null, new ProtocolStatus(mue));
+    }
+  }
+  
+  public ProtocolOutput getProtocolOutput(FetchListEntry fle) {
+    String urlString = fle.getUrl().toString();
+    try {
       URL url = new URL(urlString);
   
       int redirects = 0;
@@ -123,7 +136,7 @@
         int code = response.getCode();
   
         if (code == 200) {                          // got a good response
-          return response.toContent();              // return it
+          return new ProtocolOutput(response.toContent());              // return it
   
         } else if (code >= 300 && code < 400) {     // handle redirect
           if (redirects == MAX_REDIRECTS)
@@ -137,8 +150,8 @@
           throw new FtpError(code);
         }
       } 
-    } catch (IOException e) {
-      throw new FtpException(e);
+    } catch (Exception e) {
+      return new ProtocolOutput(null, new ProtocolStatus(e));
     }
   }
 
@@ -205,7 +218,7 @@
     // set log level
     LOG.setLevel(Level.parse((new String(logLevel)).toUpperCase()));
 
-    Content content = ftp.getContent(urlString);
+    Content content = ftp.getProtocolOutput(urlString).getContent();
 
     System.err.println("Content-Type: " + content.getContentType());
     System.err.println("Content-Length: " + content.get("Content-Length"));
Index: src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java
===================================================================
--- src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java	(revision 179145)
+++ src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java	(working copy)
@@ -23,12 +23,12 @@
 import org.apache.nutch.util.*;
 
 public class TextParser implements Parser {
-  public Parse getParse(Content content) throws ParseException {
+  public Parse getParse(Content content) {
     // copy content meta data through
     Properties metadata = new Properties();
     metadata.putAll(content.getMetadata());
 
-    ParseData parseData = new ParseData("", new Outlink[0], metadata);
+    ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, "", new Outlink[0], metadata);
 
     String encoding =
       StringUtil.parseCharacterEncoding(content.getContentType());
@@ -37,7 +37,7 @@
       try {                                       // try to use named encoding
         text = new String(content.getContent(), encoding);
       } catch (java.io.UnsupportedEncodingException e) {
-        throw new ParseException(e);
+        return new ParseStatus(e).getEmptyParse();
       }
     } else {
       // FIXME: implement charset detector. This code causes problem when 
Index: src/plugin/build.xml
===================================================================
--- src/plugin/build.xml	(revision 179145)
+++ src/plugin/build.xml	(working copy)
@@ -9,12 +9,14 @@
      <ant dir="protocol-file" target="deploy"/>
      <ant dir="protocol-ftp" target="deploy"/>
      <ant dir="protocol-http" target="deploy"/>
+     <ant dir="protocol-httpclient" target="deploy"/>
      <ant dir="parse-html" target="deploy"/>
+     <ant dir="parse-js" target="deploy"/>
      <ant dir="parse-text" target="deploy"/>
      <ant dir="parse-pdf" target="deploy"/>
      <ant dir="parse-msword" target="deploy"/>
-<!-- <ant dir="parse-mp3" target="deploy"/>      license: jid3 is LGPL-->
-<!-- <ant dir="parse-rtf" target="deploy"/>      license: parse-rtf is LGPL-->
+<!-- <ant dir="parse-mp3" target="deploy"/> -->
+<!-- <ant dir="parse-rtf" target="deploy"/> -->
      <ant dir="parse-ext" target="deploy"/>
      <ant dir="index-basic" target="deploy"/>
      <ant dir="index-more" target="deploy"/>
@@ -38,8 +40,8 @@
      <ant dir="parse-html" target="test"/>
      <ant dir="parse-pdf" target="test"/>
      <ant dir="parse-msword" target="test"/>
-<!-- <ant dir="parse-mp3" target="test"/> -->
-<!-- <ant dir="parse-rtf" target="test"/> -->
+ <!-- <ant dir="parse-mp3" target="test"/> -->
+ <!-- <ant dir="parse-rtf" target="test"/> -->
      <ant dir="parse-ext" target="test"/>
      <ant dir="creativecommons" target="test"/>
      <ant dir="languageidentifier" target="test"/>
@@ -53,7 +55,9 @@
     <ant dir="protocol-file" target="clean"/>
     <ant dir="protocol-ftp" target="clean"/>
     <ant dir="protocol-http" target="clean"/>
+    <ant dir="protocol-httpclient" target="clean"/>
     <ant dir="parse-html" target="clean"/>
+    <ant dir="parse-js" target="clean"/>
     <ant dir="parse-text" target="clean"/>
     <ant dir="parse-pdf" target="clean"/>
     <ant dir="parse-msword" target="clean"/>
Index: src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java
===================================================================
--- src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java	(revision 179145)
+++ src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java	(working copy)
@@ -252,19 +252,22 @@
 
   /** Adds metadata or otherwise modifies a parse of an HTML document, given
    * the DOM tree of a page. */
-  public Parse filter(Content content, Parse parse, DocumentFragment doc)
-    throws ParseException {
+  public Parse filter(Content content, Parse parse, HTMLMetaTags metaTags, DocumentFragment doc) {
 
     // construct base url
     URL base;
     try {
       base = new URL(content.getBaseUrl());
     } catch (MalformedURLException e) {
-      throw new ParseException(e);
+      return new ParseStatus(e).getEmptyParse();
     }
 
-    // extract license metadata
-    Walker.walk(doc, base, parse.getData().getMetadata());
+    try {
+      // extract license metadata
+      Walker.walk(doc, base, parse.getData().getMetadata());
+    } catch (ParseException e) {
+      return new ParseStatus(e).getEmptyParse();
+    }
 
     return parse;
   }
Index: src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java
===================================================================
--- src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java	(revision 179145)
+++ src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java	(working copy)
@@ -79,7 +79,7 @@
 
     // get nutch content
     Protocol protocol = ProtocolFactory.getProtocol(urlString);
-    content = protocol.getContent(urlString);
+    content = protocol.getProtocolOutput(urlString).getContent();
     protocol = null;
   }
 
Index: src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java
===================================================================
--- src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java	(revision 179145)
+++ src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java	(working copy)
@@ -17,15 +17,14 @@
 package org.apache.nutch.parse.ext;
 
 import org.apache.nutch.protocol.Content;
+import org.apache.nutch.parse.ParseStatus;
 import org.apache.nutch.parse.Parser;
 import org.apache.nutch.parse.Parse;
 import org.apache.nutch.parse.ParseData;
 import org.apache.nutch.parse.ParseImpl;
 import org.apache.nutch.parse.Outlink;
-import org.apache.nutch.parse.ParseException;
 
 import org.apache.nutch.util.LogFormatter;
-import org.apache.nutch.util.NutchConf;
 import org.apache.nutch.util.CommandRunner;
 
 import org.apache.nutch.plugin.Extension;
@@ -88,14 +87,14 @@
 
   public ExtParser () {}
 
-  public Parse getParse(Content content) throws ParseException {
+  public Parse getParse(Content content) {
 
     String contentType = content.getContentType();
 
     String[] params = (String[]) TYPE_PARAMS_MAP.get(contentType);
     if (params == null)
-      throw new ParseException(
-        "No external command defined for contentType: " + contentType);
+      return new ParseStatus(ParseStatus.FAILED,
+                      "No external command defined for contentType: " + contentType).getEmptyParse();
 
     String command = params[0];
     int timeout = Integer.parseInt(params[1]);
@@ -114,8 +113,10 @@
         (String)content.getMetadata().get("Content-Length");
       if (contentLength != null
             && raw.length != Integer.parseInt(contentLength)) {
-          throw new ParseException("Content truncated at "+raw.length
-            +" bytes. Parser can't handle incomplete "+contentType+" file.");
+          return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_TRUNCATED,
+                "Content truncated at " + raw.length
+            +" bytes. Parser can't handle incomplete "
+            + contentType + " file.").getEmptyParse();
       }
 
       ByteArrayOutputStream os = new ByteArrayOutputStream(BUFFER_SIZE);
@@ -133,15 +134,14 @@
       cr.evaluate();
 
       if (cr.getExitValue() != 0)
-        throw new ParseException("External command "+command
-          +" failed with error: "+es.toString());
+        return new ParseStatus(ParseStatus.FAILED,
+                        "External command " + command
+                        + " failed with error: " + es.toString()).getEmptyParse();
 
       text = os.toString();
 
-    } catch (ParseException e) {
-      throw e;
     } catch (Exception e) { // run time exception
-      throw new ParseException("ExtParser failed. "+e);
+      return new ParseStatus(e).getEmptyParse();
     }
 
     if (text == null)
@@ -157,7 +157,7 @@
     Properties metaData = new Properties();
     metaData.putAll(content.getMetadata()); // copy through
 
-    ParseData parseData = new ParseData(title, outlinks, metaData);
+    ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title, outlinks, metaData);
     return new ParseImpl(text, parseData);
   }
 
Index: src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestRobotsMetaProcessor.java
===================================================================
--- src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestRobotsMetaProcessor.java	(revision 179145)
+++ src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestRobotsMetaProcessor.java	(working copy)
@@ -18,7 +18,8 @@
 
 import junit.framework.TestCase;
 
-import org.apache.nutch.parse.html.RobotsMetaProcessor.*;
+import org.apache.nutch.parse.HTMLMetaTags;
+import org.apache.nutch.parse.html.HTMLMetaProcessor.*;
 
 import java.io.ByteArrayInputStream;
 import java.net.URL;
@@ -28,7 +29,7 @@
 import org.w3c.dom.*;
 import org.apache.html.dom.*;
 
-/** Unit tests for RobotsMetaProcessor. */
+/** Unit tests for HTMLMetaProcessor. */
 public class TestRobotsMetaProcessor extends TestCase {
   public TestRobotsMetaProcessor(String name) { 
     super(name); 
@@ -157,8 +158,8 @@
         e.printStackTrace();
       }
 
-      RobotsMetaIndicator robotsMeta= new RobotsMetaIndicator();
-      RobotsMetaProcessor.getRobotsMetaDirectives(robotsMeta, node, 
+      HTMLMetaTags robotsMeta= new HTMLMetaTags();
+      HTMLMetaProcessor.getMetaTags(robotsMeta, node, 
                                                   currURLsAndAnswers[i][0]);
 
       assertTrue("got index wrong on test " + i,
Index: src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java
===================================================================
--- src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java	(revision 179145)
+++ src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java	(working copy)
@@ -205,6 +205,7 @@
            new Outlink("http://www.nutch.org/frames/right.html", ""),
          },
          {
+           new Outlink("http://www.nutch.org/maps/logo.gif", ""),
            new Outlink("http://www.nutch.org/index.html", ""),
            new Outlink("http://www.nutch.org/maps/#bottom", ""),
            new Outlink("http://www.nutch.org/bot.html", ""),
Index: src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
===================================================================
--- src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java	(revision 179145)
+++ src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java	(working copy)
@@ -28,14 +28,11 @@
 import org.xml.sax.InputSource;
 import org.xml.sax.SAXException;
 import org.w3c.dom.*;
-import org.w3c.dom.html.*;
 import org.apache.html.dom.*;
 
-import org.apache.nutch.fetcher.FetcherOutput;
 import org.apache.nutch.protocol.Content;
 import org.apache.nutch.util.*;
 import org.apache.nutch.parse.*;
-import org.apache.nutch.parse.html.RobotsMetaProcessor.*;
 
 
 public class HtmlParser implements Parser {
@@ -52,6 +49,8 @@
   private static Pattern charsetPattern =
     Pattern.compile("charset=\\s*([a-z][_\\-0-9a-z]*)",
                     Pattern.CASE_INSENSITIVE);
+  
+  private static String parserImpl = NutchConf.get().get("parser.html.impl", "neko");
 
   /**
    * Given a <code>byte[]</code> representing an html file of an 
@@ -94,22 +93,14 @@
   private static String defaultCharEncoding =
     NutchConf.get().get("parser.character.encoding.default", "windows-1252");
 
-  public Parse getParse(Content content) throws ParseException {
-    DOMParser parser = new DOMParser();
-    
-    // some plugins, e.g., creativecommons, need to examine html comments
-    try {
-      parser.setFeature("http://apache.org/xml/features/include-comments", 
-                        true);
-    } catch (SAXException e) {}
+  public Parse getParse(Content content) {
+    HTMLMetaTags metaTags = new HTMLMetaTags();
 
-    RobotsMetaIndicator robotsMeta = new RobotsMetaIndicator();
-
     URL base;
     try {
       base = new URL(content.getBaseUrl());
     } catch (MalformedURLException e) {
-      throw new ParseException(e);
+      return new ParseStatus(e).getEmptyParse();
     }
 
     String text = "";
@@ -120,19 +111,18 @@
     // check that contentType is one we can handle
     String contentType = content.getContentType();
     if (!"".equals(contentType) && !contentType.startsWith("text/html"))
-      throw new ParseException("Content-Type not text/html: " + contentType);
+      return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_INVALID_FORMAT,
+              "Content-Type not text/html: " + contentType).getEmptyParse();
     
     // parse the content
     DocumentFragment root;
     try {
       byte[] contentInOctets = content.getContent();
-      InputSource input =
-        new InputSource(new ByteArrayInputStream(contentInOctets));
+      InputSource input = new InputSource(new ByteArrayInputStream(contentInOctets));
       String encoding = StringUtil.parseCharacterEncoding(contentType);
       if (encoding!=null) {
         metadata.put("OriginalCharEncoding", encoding);
         if ((encoding = StringUtil.resolveEncodingAlias(encoding)) != null) {
-	  input.setEncoding(encoding); 
           metadata.put("CharEncodingForConversion", encoding);
           LOG.fine(base + ": setting encoding to " + encoding);
         }
@@ -144,7 +134,6 @@
         if (encoding!=null) {
           metadata.put("OriginalCharEncoding", encoding);
           if ((encoding = StringUtil.resolveEncodingAlias(encoding)) != null) {
-	    input.setEncoding(encoding); 
             metadata.put("CharEncodingForConversion", encoding);
             LOG.fine(base + ": setting encoding to " + encoding);
           }
@@ -158,33 +147,29 @@
         // (e.g. se: windows-1252, kr: x-windows-949, cn: gb18030, tw: big5
         // doesn't work for jp because euc-jp and shift_jis have about the
         // same share)
-       
+        encoding = defaultCharEncoding;
         metadata.put("CharEncodingForConversion", defaultCharEncoding);
-        input.setEncoding(defaultCharEncoding);
         LOG.fine(base + ": falling back to " + defaultCharEncoding);
       }
-
+      input.setEncoding(encoding);
       LOG.fine("Parsing...");
-      parser.parse(input);
-
-      // convert Document to DocumentFragment
-      HTMLDocumentImpl doc = (HTMLDocumentImpl)parser.getDocument();
-      doc.setErrorChecking(false);
-      root = doc.createDocumentFragment();
-      root.appendChild(doc.getDocumentElement());
+      root = parse(input);
     } catch (IOException e) {
-      throw new ParseException(e);
+      return new ParseStatus(e).getEmptyParse();
     } catch (DOMException e) {
-      throw new ParseException(e);
+      return new ParseStatus(e).getEmptyParse();
     } catch (SAXException e) {
-      throw new ParseException(e);
+      return new ParseStatus(e).getEmptyParse();
+    } catch (Exception e) { // anything else parse(input) may throw
+      LOG.log(java.util.logging.Level.WARNING, "Error parsing " + base, e); // logger keeps the stack trace
+      return new ParseStatus(e).getEmptyParse();
     }
       
     // get meta directives
-    RobotsMetaProcessor.getRobotsMetaDirectives(robotsMeta, root, base);
-      
+    HTMLMetaProcessor.getMetaTags(metaTags, root, base);
+    LOG.fine("Meta tags for " + base + ": " + metaTags); // per-document detail, same level as the other traces here
     // check meta directives
-    if (!robotsMeta.getNoIndex()) {               // okay to index
+    if (!metaTags.getNoIndex()) {               // okay to index
       StringBuffer sb = new StringBuffer();
       LOG.fine("Getting text...");
       DOMContentUtils.getText(sb, root);          // extract text
@@ -195,7 +180,7 @@
       title = sb.toString().trim();
     }
       
-    if (!robotsMeta.getNoFollow()) {              // okay to follow links
+    if (!metaTags.getNoFollow()) {              // okay to follow links
       ArrayList l = new ArrayList();              // extract outlinks
       URL baseTag = DOMContentUtils.getBase(root);
       LOG.fine("Getting links...");
@@ -204,20 +189,78 @@
       LOG.fine("found "+outlinks.length+" outlinks in "+content.getUrl());
     }
     
-    if (!robotsMeta.getNoCache()) {             // okay to cache
+    if (!metaTags.getNoCache()) {             // okay to cache
       // ??? FIXME ???
     }
     
     // copy content metadata through
     metadata.putAll(content.getMetadata());
-
-    ParseData parseData = new ParseData(title, outlinks, metadata);
+    ParseStatus status = new ParseStatus(ParseStatus.SUCCESS);
+    if (metaTags.getRefresh()) {
+      status.setMinorCode(ParseStatus.SUCCESS_REDIRECT);
+      status.setMessage(metaTags.getRefreshHref().toString());
+    }
+    ParseData parseData = new ParseData(status, title, outlinks, metadata);
     Parse parse = new ParseImpl(text, parseData);
 
     // run filters on parse
-    return HtmlParseFilters.filter(content, parse, root);
+    return HtmlParseFilters.filter(content, parse, metaTags, root);
   }
 
+  private DocumentFragment parse(InputSource input) throws Exception {
+    // Constant-first compare: a null parserImpl selects NekoHTML instead of throwing.
+    boolean useTagSoup = "tagsoup".equalsIgnoreCase(parserImpl);
+    return useTagSoup ? parseTagSoup(input) : parseNeko(input);
+  }
+  
+  private DocumentFragment parseTagSoup(InputSource input) throws Exception {
+    HTMLDocumentImpl owner = new HTMLDocumentImpl();
+    DocumentFragment target = owner.createDocumentFragment();
+    DOMBuilder domBuilder = new DOMBuilder(owner, target);
+    org.ccil.cowan.tagsoup.Parser tagSoup = new org.ccil.cowan.tagsoup.Parser();
+    tagSoup.setFeature(tagSoup.ignoreBogonsFeature, true);
+    tagSoup.setFeature(tagSoup.bogonsEmptyFeature, false);
+    tagSoup.setContentHandler(domBuilder);  // same builder is also the lexical handler
+    tagSoup.setProperty("http://xml.org/sax/properties/lexical-handler", domBuilder);
+    tagSoup.parse(input);
+    return target;
+  }
+  
+  /** Parses input with NekoHTML, appending each fragment it yields to one result. */
+  private DocumentFragment parseNeko(InputSource input) throws Exception {
+    DOMFragmentParser parser = new DOMFragmentParser();
+    try {
+      // some plugins, e.g., creativecommons, need to examine html comments
+      parser.setFeature("http://apache.org/xml/features/include-comments", true);
+      parser.setFeature("http://apache.org/xml/features/augmentations", true);
+      parser.setFeature("http://cyberneko.org/html/features/balance-tags/ignore-outside-content",
+              false);
+      parser.setFeature("http://cyberneko.org/html/features/balance-tags/document-fragment",
+              true);
+      parser.setFeature("http://cyberneko.org/html/features/report-errors", true);
+    } catch (SAXException e) {
+      LOG.log(Level.WARNING, "Unable to configure NekoHTML parser features", e);
+    }
+    HTMLDocumentImpl doc = new HTMLDocumentImpl();
+    doc.setErrorChecking(false);
+    DocumentFragment res = doc.createDocumentFragment();
+    DocumentFragment frag = doc.createDocumentFragment();
+    parser.parse(input, frag);
+    res.appendChild(frag);
+    try {
+      while (true) {
+        frag = doc.createDocumentFragment();
+        parser.parse(input, frag);
+        if (!frag.hasChildNodes()) break;
+        LOG.info(" - new frag, " + frag.getChildNodes().getLength() + " nodes.");
+        res.appendChild(frag);
+      }
+    } catch (Exception x) {
+      LOG.log(Level.WARNING, "Error parsing further HTML fragments; returning partial result", x);
+    }
+    return res;
+  }
+  
   public static void main(String[] args) throws Exception {
     LOG.setLevel(Level.FINE);
     String name = args[0];
Index: src/plugin/parse-html/plugin.xml
===================================================================
--- src/plugin/parse-html/plugin.xml	(revision 179145)
+++ src/plugin/parse-html/plugin.xml	(working copy)
@@ -18,6 +18,7 @@
          <export name="*"/>
       </library>
       <library name="nekohtml-0.9.4.jar"/>
+      <library name="tagsoup-1.0rc2.jar"/>
    </runtime>
 
    <extension id="org.apache.nutch.parse.html"
Index: src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/Http.java
===================================================================
--- src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/Http.java	(revision 179145)
+++ src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/Http.java	(working copy)
@@ -16,7 +16,7 @@
 
 package org.apache.nutch.protocol.http;
 
-import java.io.*;
+import java.net.MalformedURLException;
 import java.net.URL;
 import java.net.InetAddress;
 import java.net.UnknownHostException;
@@ -28,6 +28,8 @@
 import org.apache.nutch.util.LogFormatter;
 import org.apache.nutch.util.NutchConf;
 
+import org.apache.nutch.db.Page;
+import org.apache.nutch.pagedb.FetchListEntry;
 import org.apache.nutch.protocol.*;
 
 /** An implementation of the Http protocol. */
@@ -170,8 +172,19 @@
     }
   }
 
-  public Content getContent(String urlString) throws ProtocolException {
+  public ProtocolOutput getProtocolOutput(String urlString) {
+    // Wrap the bare URL in a one-off fetch list entry and delegate to the overload.
     try {
+      Page page = new Page(urlString, 1.0f);
+      return getProtocolOutput(new FetchListEntry(true, page, new String[0]));
+    } catch (MalformedURLException mue) {
+      return new ProtocolOutput(null, new ProtocolStatus(mue));  // null content, exception as status
+    }
+  }
+  
+  public ProtocolOutput getProtocolOutput(FetchListEntry fle) {
+    String urlString = fle.getUrl().toString();
+    try {
       URL url = new URL(urlString);
 
       int redirects = 0;
@@ -191,7 +204,7 @@
         int code = response.getCode();
         
         if (code == 200) {                        // got a good response
-          return response.toContent();            // return it
+          return new ProtocolOutput(response.toContent());            // return it
           
         } else if (code == 410) {                 // page is gone
           throw new ResourceGone(url, "Http: " + code);
@@ -207,8 +220,8 @@
           throw new HttpError(code);
         }
       }
-    } catch (IOException e) {
-      throw new HttpException(e);
+    } catch (Exception e) {
+      return new ProtocolOutput(null, new ProtocolStatus(e));
     } 
   }
 
@@ -285,7 +298,7 @@
       LOG.setLevel(Level.FINE);
     }
 
-    Content content = http.getContent(url);
+    Content content = http.getProtocolOutput(url).getContent();
 
     System.out.println("Content Type: " + content.getContentType());
     System.out.println("Content Length: " + content.get("Content-Length"));
Index: src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java
===================================================================
--- src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java	(revision 179145)
+++ src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java	(working copy)
@@ -17,24 +17,24 @@
 package org.apache.nutch.protocol.file;
 
 
+import org.apache.nutch.db.Page;
 import org.apache.nutch.net.protocols.HttpDateFormat;
 
 import org.apache.nutch.util.LogFormatter;
 import org.apache.nutch.util.NutchConf;
 
+import org.apache.nutch.pagedb.FetchListEntry;
 import org.apache.nutch.protocol.Content;
 import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.ProtocolOutput;
+import org.apache.nutch.protocol.ProtocolStatus;
 
 import java.util.logging.Level;
 import java.util.logging.Logger;
 
+import java.net.MalformedURLException;
 import java.net.URL;
 
-import java.io.InputStream;
-// 20040528, xing, disabled for now
-//import java.io.Reader;
-import java.io.IOException;
-
 /************************************
  * File.java deals with file: scheme.
  *
@@ -65,10 +65,21 @@
   }
 
   /** Set the point at which content is truncated. */
-  public void setMaxContentLength(int length) {this.maxContentLength = length;}
+  public void setMaxContentLength(int length) { maxContentLength = length; }
 
-  public Content getContent(String urlString) throws FileException {
+  public ProtocolOutput getProtocolOutput(String urlString) {
+    // Wrap the bare URL in a one-off fetch list entry and delegate to the overload.
     try {
+      Page page = new Page(urlString, 1.0f);
+      return getProtocolOutput(new FetchListEntry(true, page, new String[0]));
+    } catch (MalformedURLException mue) {
+      return new ProtocolOutput(null, new ProtocolStatus(mue));  // null content, exception as status
+    }
+  }
+  
+  public ProtocolOutput getProtocolOutput(FetchListEntry fle) {
+    String urlString = fle.getUrl().toString();
+    try {
       URL url = new URL(urlString);
   
       int redirects = 0;
@@ -80,7 +91,7 @@
         int code = response.getCode();
   
         if (code == 200) {                          // got a good response
-          return response.toContent();              // return it
+          return new ProtocolOutput(response.toContent());              // return it
   
         } else if (code >= 300 && code < 400) {     // handle redirect
           if (redirects == MAX_REDIRECTS)
@@ -94,8 +105,8 @@
           throw new FileError(code);
         }
       } 
-    } catch (IOException e) {
-      throw new FileException(e);
+    } catch (Exception e) {
+      return new ProtocolOutput(null, new ProtocolStatus(e));
     }
   }
 
@@ -139,7 +150,7 @@
     // set log level
     LOG.setLevel(Level.parse((new String(logLevel)).toUpperCase()));
 
-    Content content = file.getContent(urlString);
+    Content content = file.getProtocolOutput(urlString).getContent();
 
     System.err.println("Content-Type: " + content.getContentType());
     System.err.println("Content-Length: " + content.get("Content-Length"));
Index: build.xml
===================================================================
--- build.xml	(revision 179145)
+++ build.xml	(working copy)
@@ -209,7 +209,9 @@
     	<packageset dir="${plugins.dir}/protocol-file/src/java"/>
     	<packageset dir="${plugins.dir}/protocol-ftp/src/java"/>
        	<packageset dir="${plugins.dir}/protocol-http/src/java"/>
+       	<packageset dir="${plugins.dir}/protocol-httpclient/src/java"/>
     	<packageset dir="${plugins.dir}/parse-html/src/java"/>
+    	<packageset dir="${plugins.dir}/parse-js/src/java"/>
     	<packageset dir="${plugins.dir}/parse-text/src/java"/>
     	<packageset dir="${plugins.dir}/parse-pdf/src/java"/>
 	<packageset dir="${plugins.dir}/parse-rtf/src/java"/>
