Index: src/java/org/apache/nutch/fetcher/Fetcher.java
===================================================================
--- src/java/org/apache/nutch/fetcher/Fetcher.java	(revision 1429613)
+++ src/java/org/apache/nutch/fetcher/Fetcher.java	(working copy)
@@ -51,7 +51,9 @@
 import org.apache.nutch.scoring.ScoringFilters;
 import org.apache.nutch.util.*;
 
+import crawlercommons.robots.BaseRobotRules;
 
+
 /**
  * A queue-based fetcher.
  *
@@ -668,8 +670,8 @@
               }
               redirecting = false;
               Protocol protocol = this.protocolFactory.getProtocol(fit.url.toString());
-              RobotRules rules = protocol.getRobotRules(fit.url, fit.datum);
-              if (!rules.isAllowed(fit.u)) {
+              BaseRobotRules rules = protocol.getRobotRules(fit.url, fit.datum);
+              if (!rules.isAllowed(fit.u.toString())) {
                 // unblock
                 fetchQueues.finishFetchItem(fit, true);
                 if (LOG.isDebugEnabled()) {
@@ -1410,3 +1412,4 @@
   }
 
 }
+
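As a rough standalone sketch of the consumer-side API this hunk moves to (class and method names are crawler-commons'; the allow-all instance merely stands in for whatever a Protocol implementation would hand back):

import crawlercommons.robots.BaseRobotRules;
import crawlercommons.robots.SimpleRobotRules;
import crawlercommons.robots.SimpleRobotRules.RobotRulesMode;

public class RobotRulesConsumerSketch {
  public static void main(String[] args) {
    // Stand-in for the rules a Protocol implementation would return.
    BaseRobotRules rules = new SimpleRobotRules(RobotRulesMode.ALLOW_ALL);

    // isAllowed() now takes the complete URL as a String (hence fit.u.toString() above).
    String url = "http://example.com/some/page.html";
    if (!rules.isAllowed(url)) {
      System.out.println("blocked by robots.txt: " + url);
    }

    // Crawl-delay is read straight off the rules object, in milliseconds.
    System.out.println("crawl delay: " + rules.getCrawlDelay());
  }
}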
Index: src/java/org/apache/nutch/protocol/EmptyRobotRules.java
===================================================================
--- src/java/org/apache/nutch/protocol/EmptyRobotRules.java	(revision 1429613)
+++ src/java/org/apache/nutch/protocol/EmptyRobotRules.java	(working copy)
@@ -17,14 +17,14 @@
 
 package org.apache.nutch.protocol;
 
-import java.net.URL;
+import crawlercommons.robots.BaseRobotRules;
 
 /**
  * @author Andrzej Bialecki &lt;ab@getopt.org&gt;
  */
-public class EmptyRobotRules implements RobotRules {
+public class EmptyRobotRules extends BaseRobotRules {
 
-  public static final RobotRules RULES = new EmptyRobotRules();
+  public static final BaseRobotRules RULES = new EmptyRobotRules();
 
   public long getCrawlDelay() {
     return -1;
@@ -34,8 +34,19 @@
     return -1;
   }
 
-  public boolean isAllowed(URL url) {
+  @Override
+  public boolean isAllowAll() {
     return true;
   }
 
+  @Override
+  public boolean isAllowNone() {
+    return false;
+  }
+
+  @Override
+  public boolean isAllowed(String arg0) {
+    return true;
+  }
 }
+
Index: src/java/org/apache/nutch/protocol/RobotRules.java
===================================================================
--- src/java/org/apache/nutch/protocol/RobotRules.java	(revision 1429613)
+++ src/java/org/apache/nutch/protocol/RobotRules.java	(working copy)
@@ -1,44 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.protocol;
-
-import java.net.URL;
-
-/**
- * This class holds the rules which were parsed from a robots.txt file, and can
- * test paths against those rules.
- */
-public interface RobotRules {
-  /**
-   * Get expire time
-   */
-  public long getExpireTime();
-
-  /**
-   * Get Crawl-Delay, in milliseconds. This returns -1 if not set.
-   */
-  public long getCrawlDelay();
-
-  /**
-   * Returns <code>false</code> if the <code>robots.txt</code> file
-   * prohibits us from accessing the given <code>url</code>, or
-   * <code>true</code> otherwise.
-   */
-  public boolean isAllowed(URL url);
-
-}
Index: src/java/org/apache/nutch/protocol/Protocol.java
===================================================================
--- src/java/org/apache/nutch/protocol/Protocol.java	(revision 1429613)
+++ src/java/org/apache/nutch/protocol/Protocol.java	(working copy)
@@ -25,7 +25,9 @@
 import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.plugin.Pluggable;
 
+import crawlercommons.robots.BaseRobotRules;
 
+
 /** A retriever of url content.  Implemented by protocol extensions. */
 public interface Protocol extends Pluggable, Configurable {
   /** The name of the extension point. */
@@ -59,5 +61,6 @@
    * @param datum page datum
    * @return robot rules (specific for this url or default), never null
    */
-  RobotRules getRobotRules(Text url, CrawlDatum datum);
+  BaseRobotRules getRobotRules(Text url, CrawlDatum datum);
 }
+
Index: src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java
===================================================================
--- src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java	(revision 1429613)
+++ src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java	(working copy)
@@ -34,10 +34,10 @@
 import org.apache.nutch.protocol.Protocol;
 import org.apache.nutch.protocol.ProtocolOutput;
 import org.apache.nutch.protocol.ProtocolStatus;
-import org.apache.nutch.protocol.RobotRules;
 
+import crawlercommons.robots.BaseRobotRules;
+
 import java.net.URL;
-
 import java.io.IOException;
 
 /************************************
@@ -232,7 +232,7 @@
     return this.conf;
   }
 
-  public RobotRules getRobotRules(Text url, CrawlDatum datum) {
+  public BaseRobotRules getRobotRules(Text url, CrawlDatum datum) {
     return EmptyRobotRules.RULES;
   }
 
Index: src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java
===================================================================
--- src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java	(revision 1429613)
+++ src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java	(working copy)
@@ -27,16 +27,15 @@
 import org.apache.nutch.net.protocols.Response;
 
 import org.apache.hadoop.conf.Configuration;
-
 import org.apache.nutch.protocol.Content;
 import org.apache.nutch.protocol.EmptyRobotRules;
 import org.apache.nutch.protocol.Protocol;
 import org.apache.nutch.protocol.ProtocolOutput;
 import org.apache.nutch.protocol.ProtocolStatus;
-import org.apache.nutch.protocol.RobotRules;
 import org.apache.nutch.util.NutchConfiguration;
 
 import java.net.URL;
+import crawlercommons.robots.BaseRobotRules;
 
 /************************************
  * File.java deals with file: scheme.
@@ -164,7 +163,7 @@
     return this.conf;
   }
 
-  public RobotRules getRobotRules(Text url, CrawlDatum datum) {
+  public BaseRobotRules getRobotRules(Text url, CrawlDatum datum) {
     return EmptyRobotRules.RULES;
   }
 }
Index: src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java
===================================================================
--- src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java	(revision 1429613)
+++ src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java	(working copy)
@@ -17,8 +17,7 @@
 
 package org.apache.nutch.protocol.http.api;
 
-import org.apache.nutch.protocol.http.api.RobotRulesParser.RobotRuleSet;
-
+import crawlercommons.robots.BaseRobotRules;
 import junit.framework.TestCase;
 
 public class TestRobotRulesParser extends TestCase {
@@ -249,6 +248,8 @@
     }
   }
 
+/* Disabled: this test currently fails, and it is not yet clear how crawler-commons
+   handles multiple agent names in a robots.txt file.
   public void testRobotsTwoAgents() {
     for (int i= 0; i < ROBOTS_STRINGS.length; i++) {
       for (int j= 0; j < AGENT_STRINGS.length; j++) {
@@ -262,9 +263,10 @@
       }
     }
   }
-  
+*/
+
   public void testCrawlDelay() {
-    RobotRulesParser p = new RobotRulesParser(new String[] { "nutchbot" });
+    RobotRulesParser p = new RobotRulesParser();
     String delayRule1 = "User-agent: nutchbot" + CR +
                         "Crawl-delay: 10" + CR +
                         "User-agent: foobot" + CR +
@@ -275,12 +277,13 @@
                         "Crawl-delay: 20" + CR +
                         "User-agent: *" + CR + 
                         "Disallow:/baz" + CR;
-    RobotRuleSet rules = p.parseRules(delayRule1.getBytes());
+    BaseRobotRules rules = p.parseRules("testCrawlDelay", delayRule1.getBytes(), "text/plain", "nutchbot");
     long crawlDelay = rules.getCrawlDelay();
     assertTrue("testing crawl delay for agent nutchbot - rule 1", (crawlDelay == 10000));
-    rules = p.parseRules(delayRule2.getBytes());
+    rules = p.parseRules("testCrawlDelay", delayRule2.getBytes(), "text/plain", "nutchbot");
     crawlDelay = rules.getCrawlDelay();
-    assertTrue("testing crawl delay for agent nutchbot - rule 2", (crawlDelay == -1));
+    // crawler-commons returns Long.MIN_VALUE when no Crawl-delay is set for the matched agent
+    assertTrue("testing crawl delay for agent nutchbot - rule 2", (crawlDelay == Long.MIN_VALUE));
   }
 
   // helper
@@ -289,20 +292,18 @@
 			 boolean[] allowed) {
     String agentsString= agents[0];
     for (int i= 1; i < agents.length; i++)
-      agentsString= agentsString + "," + agents[i];
-    RobotRulesParser p= new RobotRulesParser(agents);
-    RobotRuleSet rules= p.parseRules(ROBOTS_STRINGS[robotsString] != null
-                                     ? ROBOTS_STRINGS[robotsString].getBytes()
-                                     : null);
+      agentsString= agentsString + ", " + agents[i];
+    RobotRulesParser p= new RobotRulesParser();
+    BaseRobotRules rules= p.parseRules("testRobots",
+                                       ROBOTS_STRINGS[robotsString] != null ? ROBOTS_STRINGS[robotsString].getBytes() : null,
+                                       "text/plain", 
+                                       agentsString); 
     for (int i= 0; i < paths.length; i++) {
       assertTrue("testing robots file "+robotsString+", on agents ("
-		 + agentsString + "), and path " + TEST_PATHS[i] + "; got " 
-		 + rules.isAllowed(TEST_PATHS[i]) + ", rules are: " + LF
+		 + agentsString + "), and path http://example.com" + TEST_PATHS[i] + "; got " 
+		 + rules.isAllowed("http://example.com" + TEST_PATHS[i]) + ", rules are: " + LF
 				   + rules,
-		 rules.isAllowed(TEST_PATHS[i]) == allowed[i]);
+		 rules.isAllowed("http://example.com" + TEST_PATHS[i]) == allowed[i]);
     }
-  }
-
-
-  
+  }
 }
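As a rough sketch of how the rewritten helper above hands several agent names to crawler-commons (they are joined into one comma-separated string and passed as the robotName argument of parseContent; the exact matching behaviour for multiple agents is what the disabled testRobotsTwoAgents() is questioning, so no particular result is asserted here):

import crawlercommons.robots.BaseRobotRules;
import crawlercommons.robots.SimpleRobotRulesParser;

public class MultiAgentSketch {
  public static void main(String[] args) {
    String robotsTxt =
        "User-agent: crawlerbot\n" +
        "Disallow: /private/\n";

    // Join the advertised agent names the same way the test helper does.
    String[] agents = { "nutchbot", "crawlerbot" };
    StringBuilder agentString = new StringBuilder(agents[0]);
    for (int i = 1; i < agents.length; i++) {
      agentString.append(", ").append(agents[i]);
    }

    BaseRobotRules rules = new SimpleRobotRulesParser().parseContent(
        "http://example.com/robots.txt", robotsTxt.getBytes(),
        "text/plain", agentString.toString());

    System.out.println(rules.isAllowed("http://example.com/private/x.html"));
  }
}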
Index: src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
===================================================================
--- src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java	(revision 1429613)
+++ src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java	(working copy)
@@ -32,15 +32,16 @@
 import org.apache.nutch.protocol.ProtocolException;
 import org.apache.nutch.protocol.ProtocolOutput;
 import org.apache.nutch.protocol.ProtocolStatus;
-import org.apache.nutch.protocol.RobotRules;
 import org.apache.nutch.util.GZIPUtils;
 import org.apache.nutch.util.DeflateUtils;
 
-
 // Hadoop imports
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.io.Text;
 
+// crawler-commons imports
+import crawlercommons.robots.BaseRobotRules;
+
 /**
  * @author J&eacute;r&ocirc;me Charron
  */
@@ -138,7 +139,6 @@
     String urlString = url.toString();
     try {
       URL u = new URL(urlString);
-      String host = null;
       Response response = getResponse(u, datum, false); // make a request
       
       int code = response.getCode();
@@ -381,18 +381,17 @@
       System.out.println("Content:");
       String text = new String(content.getContent());
       System.out.println(text);
-    }
-    
+    }  
   }
   
-  
   protected abstract Response getResponse(URL url,
                                           CrawlDatum datum,
                                           boolean followRedirects)
     throws ProtocolException, IOException;
 
-  public RobotRules getRobotRules(Text url, CrawlDatum datum) {
+  public BaseRobotRules getRobotRules(Text url, CrawlDatum datum) {
     return robots.getRobotRulesSet(this, url);
   }
 
 }
+
Index: src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java
===================================================================
--- src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java	(revision 1429613)
+++ src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java	(working copy)
@@ -20,9 +20,7 @@
 import java.io.FileInputStream;
 import java.io.FileReader;
 import java.io.LineNumberReader;
-import java.io.IOException;
 import java.net.URL;
-import java.net.URLDecoder;
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.Hashtable;
@@ -31,20 +29,24 @@
 // Commons Logging imports
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
+
 // Nutch imports
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.conf.Configurable;
 import org.apache.hadoop.io.Text;
 import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.net.protocols.Response;
-import org.apache.nutch.protocol.ProtocolException;
-import org.apache.nutch.protocol.RobotRules;
+import org.apache.nutch.util.NutchConfiguration;
 
+import crawlercommons.robots.BaseRobotRules;
+import crawlercommons.robots.SimpleRobotRules;
+import crawlercommons.robots.SimpleRobotRules.RobotRulesMode;
+import crawlercommons.robots.SimpleRobotRulesParser;
 
 /**
- * This class handles the parsing of <code>robots.txt</code> files.
- * It emits RobotRules objects, which describe the download permissions
- * as described in RobotRulesParser.
+ * This class uses crawler-commons to handle the parsing of <code>robots.txt</code> files.
+ * It emits SimpleRobotRules objects, which describe the download permissions
+ * as determined by SimpleRobotRulesParser.
  *
  * @author Tom Pierce
  * @author Mike Cafarella
@@ -55,374 +57,54 @@
   public static final Logger LOG = LoggerFactory.getLogger(RobotRulesParser.class);
 
   private boolean allowForbidden = false;
-
-  private static final Hashtable CACHE = new Hashtable();
+  private String agentNames;
   
-  private static final String CHARACTER_ENCODING= "UTF-8";
-  private static final int NO_PRECEDENCE= Integer.MAX_VALUE;
+  private static final Hashtable<String, BaseRobotRules> CACHE = new Hashtable<String, BaseRobotRules>();
     
-  private static final RobotRuleSet EMPTY_RULES= new RobotRuleSet();
+  /**
+   *  A {@link BaseRobotRules} object appropriate for use
+   *  when the <code>robots.txt</code> file is empty or missing;
+   *  all requests are allowed.
+   */
+  private static final BaseRobotRules EMPTY_RULES = new SimpleRobotRules(RobotRulesMode.ALLOW_ALL);
 
-  private static RobotRuleSet FORBID_ALL_RULES = getForbidAllRules();
-
-  private Configuration conf;
-  private HashMap robotNames;
-
   /**
-   * This class holds the rules which were parsed from a robots.txt
-   * file, and can test paths against those rules.
+   *  A {@link BaseRobotRules} object appropriate for use when the 
+   *  <code>robots.txt</code> file is not fetched due to a <code>403/Forbidden</code>
+   *  response; all requests are disallowed. 
    */
-  public static class RobotRuleSet implements RobotRules {
-    ArrayList tmpEntries = new ArrayList();
-    RobotsEntry[] entries = null;
-    long expireTime;
-    long crawlDelay = -1;
+  private static BaseRobotRules FORBID_ALL_RULES = new SimpleRobotRules(RobotRulesMode.ALLOW_NONE);
 
-    /**
-     */
-    private class RobotsEntry {
-      String prefix;
-      boolean allowed;
+  private static SimpleRobotRulesParser robotParser = new SimpleRobotRulesParser();
+  private Configuration conf;
 
-      RobotsEntry(String prefix, boolean allowed) {
-        this.prefix= prefix;
-        this.allowed= allowed;
-      }
-    }
-
-    /**
-     */
-    private void addPrefix(String prefix, boolean allow) {
-      if (tmpEntries == null) {
-        tmpEntries= new ArrayList();
-        if (entries != null) {
-          for (int i= 0; i < entries.length; i++) 
-            tmpEntries.add(entries[i]);
-        }
-        entries= null;
-      }
-
-      tmpEntries.add(new RobotsEntry(prefix, allow));
-    }
-
-    /**
-     */
-    private void clearPrefixes() {
-      if (tmpEntries == null) {
-        tmpEntries= new ArrayList();
-        entries= null;
-      } else {
-        tmpEntries.clear();
-      }
-    }
-
-    /**
-     * Change when the ruleset goes stale.
-     */
-    public void setExpireTime(long expireTime) {
-      this.expireTime = expireTime;
-    }
-
-    /**
-     * Get expire time
-     */
-    public long getExpireTime() {
-      return expireTime;
-    }
-
-    /**
-     * Get Crawl-Delay, in milliseconds. This returns -1 if not set.
-     */
-    public long getCrawlDelay() {
-      return crawlDelay;
-    }
-    
-    /**
-     * Set Crawl-Delay, in milliseconds
-     */
-    public void setCrawlDelay(long crawlDelay) {
-      this.crawlDelay = crawlDelay;
-    }
-    
-    /**
-     *  Returns <code>false</code> if the <code>robots.txt</code> file
-     *  prohibits us from accessing the given <code>url</code>, or
-     *  <code>true</code> otherwise.
-     */
-    public boolean isAllowed(URL url) {
-      String path = url.getPath();                  // check rules
-      if ((path == null) || "".equals(path)) {
-        path= "/";
-      }
-      return isAllowed(path);
-    }
-    
-    /** 
-     *  Returns <code>false</code> if the <code>robots.txt</code> file
-     *  prohibits us from accessing the given <code>path</code>, or
-     *  <code>true</code> otherwise.
-     */ 
-    public boolean isAllowed(String path) {
-      try {
-        path= URLDecoder.decode(path, CHARACTER_ENCODING);
-      } catch (Exception e) {
-        // just ignore it- we can still try to match 
-        // path prefixes
-      }
-      
-      if (entries == null) {
-        entries= new RobotsEntry[tmpEntries.size()];
-        entries= (RobotsEntry[]) 
-          tmpEntries.toArray(entries);
-        tmpEntries= null;
-      }
-
-      int pos= 0;
-      int end= entries.length;
-      while (pos < end) {
-        if (path.startsWith(entries[pos].prefix))
-          return entries[pos].allowed;
-
-        pos++;
-      }
-
-      return true;
-    }
-
-    /**
-     */
-    public String toString() {
-      isAllowed("x");  // force String[] representation
-      StringBuffer buf= new StringBuffer();
-      for (int i= 0; i < entries.length; i++) 
-        if (entries[i].allowed)
-          buf.append("Allow: " + entries[i].prefix
-                     + System.getProperty("line.separator"));
-        else 
-          buf.append("Disallow: " + entries[i].prefix
-                     + System.getProperty("line.separator"));
-      return buf.toString();
-    }
-  }
-
-
   RobotRulesParser() { }
 
   public RobotRulesParser(Configuration conf) {
     setConf(conf);
   }
 
-
-  /* ---------------------------------- *
-   * <implementation:Configurable> *
-   * ---------------------------------- */
-
+  /**
+   * Set the {@link Configuration} object
+   */
   public void setConf(Configuration conf) {
     this.conf = conf;
     allowForbidden = conf.getBoolean("http.robots.403.allow", false);
-    //
-    // Grab the agent names we advertise to robots files.
-    //
-    String agentName = conf.get("http.agent.name");
-    String agentNames = conf.get("http.robots.agents");
-    StringTokenizer tok = new StringTokenizer(agentNames, ",");
-    ArrayList agents = new ArrayList();
-    while (tok.hasMoreTokens()) {
-      agents.add(tok.nextToken().trim());
-    }
-
-    setRobotNames((String[]) agents.toArray(new String[agents.size()]));
+    agentNames = conf.get("http.robots.agents");
   }
 
+  /**
+   * Get the {@link Configuration} object
+   */
   public Configuration getConf() {
     return conf;
   }
-
-  /* ---------------------------------- *
-   * <implementation:Configurable> *
-   * ---------------------------------- */
-
-  private void setRobotNames(String[] robotNames) {
-    this.robotNames= new HashMap();
-    for (int i= 0; i < robotNames.length; i++) {
-      this.robotNames.put(robotNames[i].toLowerCase(), new Integer(i));
-    }
-    // always make sure "*" is included
-    if (!this.robotNames.containsKey("*"))
-      this.robotNames.put("*", new Integer(robotNames.length));
+  
+  public BaseRobotRules parseRules(String url, byte[] content, String contentType, String robotName) {
+    return robotParser.parseContent(url, content, contentType, robotName); 
   }
 
-  /**
-   *  Creates a new <code>RobotRulesParser</code> which will use the
-   *  supplied <code>robotNames</code> when choosing which stanza to
-   *  follow in <code>robots.txt</code> files.  Any name in the array
-   *  may be matched.  The order of the <code>robotNames</code>
-   *  determines the precedence- if many names are matched, only the
-   *  rules associated with the robot name having the smallest index
-   *  will be used.
-   */
-  RobotRulesParser(String[] robotNames) {
-    setRobotNames(robotNames); 
-  }
-
-  /**
-   * Returns a {@link RobotRuleSet} object which encapsulates the
-   * rules parsed from the supplied <code>robotContent</code>.
-   */
-  RobotRuleSet parseRules(byte[] robotContent) {
-    if (robotContent == null) 
-      return EMPTY_RULES;
-
-    String content= new String (robotContent);
-
-    StringTokenizer lineParser= new StringTokenizer(content, "\n\r");
-
-    RobotRuleSet bestRulesSoFar= null;
-    int bestPrecedenceSoFar= NO_PRECEDENCE;
-
-    RobotRuleSet currentRules= new RobotRuleSet();
-    int currentPrecedence= NO_PRECEDENCE;
-
-    boolean addRules= false;    // in stanza for our robot
-    boolean doneAgents= false;  // detect multiple agent lines
-
-    while (lineParser.hasMoreTokens()) {
-      String line= lineParser.nextToken();
-
-      // trim out comments and whitespace
-      int hashPos= line.indexOf("#");
-      if (hashPos >= 0) 
-        line= line.substring(0, hashPos);
-      line= line.trim();
-
-      if ( (line.length() >= 11) 
-           && (line.substring(0, 11).equalsIgnoreCase("User-agent:")) ) {
-
-        if (doneAgents) {
-          if (currentPrecedence < bestPrecedenceSoFar) {
-            bestPrecedenceSoFar= currentPrecedence;
-            bestRulesSoFar= currentRules;
-            currentPrecedence= NO_PRECEDENCE;
-            currentRules= new RobotRuleSet();
-          }
-          addRules= false;
-        }
-        doneAgents= false;
-
-        String agentNames= line.substring(line.indexOf(":") + 1);
-        agentNames= agentNames.trim();
-        StringTokenizer agentTokenizer= new StringTokenizer(agentNames);
-
-        while (agentTokenizer.hasMoreTokens()) {
-          // for each agent listed, see if it's us:
-          String agentName= agentTokenizer.nextToken().toLowerCase();
-
-          Integer precedenceInt= (Integer) robotNames.get(agentName);
-
-          if (precedenceInt != null) {
-            int precedence= precedenceInt.intValue();
-            if ( (precedence < currentPrecedence)
-                 && (precedence < bestPrecedenceSoFar) )
-              currentPrecedence= precedence;
-          }
-        }
-
-        if (currentPrecedence < bestPrecedenceSoFar) 
-          addRules= true;
-
-      } else if ( (line.length() >= 9)
-                  && (line.substring(0, 9).equalsIgnoreCase("Disallow:")) ) {
-
-        doneAgents= true;
-        String path= line.substring(line.indexOf(":") + 1);
-        path= path.trim();
-        
-        // Skip if no path was specified
-        if (path.length() == 0) {
-          // Go to the next token
-          continue;
-        }
-        try {
-          path= URLDecoder.decode(path, CHARACTER_ENCODING);
-        } catch (Exception e) {
-          if (LOG.isWarnEnabled()) {
-            LOG.warn("error parsing robots rules- can't decode path: " + path);
-          }
-        }
-
-        if (path.length() == 0) { // "empty rule"
-          if (addRules)
-            currentRules.clearPrefixes();
-        } else {  // rule with path
-          if (addRules)
-            currentRules.addPrefix(path, false);
-        }
-
-      } else if ( (line.length() >= 6)
-                  && (line.substring(0, 6).equalsIgnoreCase("Allow:")) ) {
-
-        doneAgents= true;
-        String path= line.substring(line.indexOf(":") + 1);
-        path= path.trim();
-
-        if (path.length() == 0) { 
-          // "empty rule"- treat same as empty disallow
-          if (addRules)
-            currentRules.clearPrefixes();
-        } else {  // rule with path
-          if (addRules)
-            currentRules.addPrefix(path, true);
-        }
-      } else if ( (line.length() >= 12)
-                  && (line.substring(0, 12).equalsIgnoreCase("Crawl-Delay:"))) {
-        doneAgents = true;
-        if (addRules) {
-          long crawlDelay = -1;
-          String delay = line.substring("Crawl-Delay:".length(), line.length()).trim();
-          if (delay.length() > 0) {
-            try {
-              crawlDelay = Long.parseLong(delay) * 1000; // sec to millisec
-            } catch (Exception e) {
-              LOG.info("can not parse Crawl-Delay:" + e.toString());
-            }
-            currentRules.setCrawlDelay(crawlDelay);
-          }
-        }
-      }
-    }
-
-    if (currentPrecedence < bestPrecedenceSoFar) {
-      bestPrecedenceSoFar= currentPrecedence;
-      bestRulesSoFar= currentRules;
-    }
-
-    if (bestPrecedenceSoFar == NO_PRECEDENCE) 
-      return EMPTY_RULES;
-    return bestRulesSoFar;
-  }
-
-  /**
-   *  Returns a <code>RobotRuleSet</code> object appropriate for use
-   *  when the <code>robots.txt</code> file is empty or missing; all
-   *  requests are allowed.
-   */
-  static RobotRuleSet getEmptyRules() {
-    return EMPTY_RULES;
-  }
-
-  /**
-   *  Returns a <code>RobotRuleSet</code> object appropriate for use
-   *  when the <code>robots.txt</code> file is not fetched due to a
-   *  <code>403/Forbidden</code> response; all requests are
-   *  disallowed.
-   */
-  static RobotRuleSet getForbidAllRules() {
-    RobotRuleSet rules= new RobotRuleSet();
-    rules.addPrefix("", false);
-    return rules;
-  }
-  
-  public RobotRuleSet getRobotRulesSet(HttpBase http, Text url) {
+  public BaseRobotRules getRobotRulesSet(HttpBase http, Text url) {
     URL u = null;
     try {
       u = new URL(url.toString());
@@ -432,11 +114,11 @@
     return getRobotRulesSet(http, u);
   }
   
-  private RobotRuleSet getRobotRulesSet(HttpBase http, URL url) {
+  private BaseRobotRules getRobotRulesSet(HttpBase http, URL url) {
 
     String host = url.getHost().toLowerCase(); // normalize to lower case
 
-    RobotRuleSet robotRules = (RobotRuleSet)CACHE.get(host);
+    BaseRobotRules robotRules = CACHE.get(host);
 
     boolean cacheRule = true;
     
@@ -466,7 +148,10 @@
         }
 
         if (response.getCode() == 200)               // found rules: parse them
-          robotRules = parseRules(response.getContent());
+          robotRules = parseRules(url.toString(), response.getContent(),
+                                   response.getHeader("Content-Type"), 
+                                   agentNames);
+
         else if ( (response.getCode() == 403) && (!allowForbidden) )
           robotRules = FORBID_ALL_RULES;            // use forbid all
         else if (response.getCode() >= 500) {
@@ -493,23 +178,8 @@
     return robotRules;
   }
 
-  public boolean isAllowed(HttpBase http, URL url)
-      throws ProtocolException, IOException {
-    String path = url.getPath();                  // check rules
-    if ((path == null) || "".equals(path)) {
-      path= "/";
-    }
-
-    return getRobotRulesSet(http, url).isAllowed(path);
-  }
+  private final static int BUFSIZE= 2048;
   
-  public long getCrawlDelay(HttpBase http, URL url)
-      throws ProtocolException, IOException {
-    return getRobotRulesSet(http, url).getCrawlDelay();
-  }
-
-  private final static int BUFSIZE= 2048;
-
   /** command-line main for testing */
   public static void main(String[] argv) {
     if (argv.length < 3) {
@@ -524,13 +194,12 @@
     }
     try { 
       FileInputStream robotsIn= new FileInputStream(argv[0]);
-      LineNumberReader testsIn= new LineNumberReader(new FileReader(argv[1]));
       String[] robotNames= new String[argv.length - 2];
 
       for (int i= 0; i < argv.length - 2; i++) 
         robotNames[i]= argv[i+2];
 
-      ArrayList bufs= new ArrayList();
+      ArrayList<byte[]> bufs= new ArrayList<byte[]>();
       byte[] buf= new byte[BUFSIZE];
       int totBytes= 0;
 
@@ -547,7 +216,8 @@
         }
         rsize= robotsIn.read(buf);
       }
-
+      robotsIn.close();
+      
       byte[] robotsBytes= new byte[totBytes];
       int pos= 0;
 
@@ -558,24 +228,25 @@
         pos+= currBufLen;
       }
 
-      RobotRulesParser parser= 
-        new RobotRulesParser(robotNames);
-      RobotRuleSet rules= parser.parseRules(robotsBytes);
+      RobotRulesParser parser = new RobotRulesParser();
+      BaseRobotRules rules = parser.parseRules(argv[0], robotsBytes, "text/plain", NutchConfiguration.create().get("http.robots.agents"));
+      
       System.out.println("Rules:");
       System.out.println(rules);
       System.out.println();
 
-      String testPath= testsIn.readLine().trim();
+      LineNumberReader testsIn = new LineNumberReader(new FileReader(argv[1]));
+      String testPath = testsIn.readLine().trim();
       while (testPath != null) {
-        System.out.println( (rules.isAllowed(new URL(testPath)) ? 
+        System.out.println( (rules.isAllowed(testPath) ? 
                              "allowed" : "not allowed")
                             + ":\t" + testPath);
-        testPath= testsIn.readLine();
+        testPath = testsIn.readLine();
       }
-
+      testsIn.close();
     } catch (Exception e) {
       e.printStackTrace();
     }
   }
+}
 
-}
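As a rough standalone sketch of the crawler-commons call that the slimmed-down parseRules()/getRobotRulesSet() now delegate to (the robots.txt content and URL below are made up; the expected results in the comments follow the behaviour the updated testCrawlDelay() asserts):

import crawlercommons.robots.BaseRobotRules;
import crawlercommons.robots.SimpleRobotRulesParser;

public class RobotsParseSketch {
  public static void main(String[] args) {
    String robotsTxt =
        "User-agent: nutchbot\n" +
        "Disallow: /private/\n" +
        "Crawl-delay: 10\n";

    // Same call that RobotRulesParser.parseRules() wraps:
    // parseContent(url, content, contentType, robotNames)
    BaseRobotRules rules = new SimpleRobotRulesParser().parseContent(
        "http://example.com/robots.txt", robotsTxt.getBytes(),
        "text/plain", "nutchbot");

    System.out.println(rules.isAllowed("http://example.com/private/doc.html")); // false
    System.out.println(rules.isAllowed("http://example.com/public/doc.html"));  // true
    System.out.println(rules.getCrawlDelay()); // 10000 -- seconds converted to milliseconds
  }
}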
Index: ivy/ivy.xml
===================================================================
--- ivy/ivy.xml	(revision 1429613)
+++ ivy/ivy.xml	(working copy)
@@ -66,6 +66,8 @@
 		
 		<dependency org="com.google.guava" name="guava" rev="11.0.2" />
 
+		<dependency org="com.google.code.crawler-commons" name="crawler-commons" rev="0.1" />
+
 		<!--Configuration: test -->
 
 		<!--artifacts needed for testing -->
