Index: conf/nutch-default.xml
===================================================================
--- conf/nutch-default.xml (revision 165178)
+++ conf/nutch-default.xml (working copy)
@@ -101,6 +101,15 @@
trying to fetch a page.
+
+<property>
+  <name>robot.rules.allow.forbidden</name>
+  <value>false</value>
+  <description>Sites whose robots.txt returns a 403 (Forbidden) error are
+  still crawled if this property is set to true; by default they are skipped.
+  </description>
+</property>
+
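For reference, a sketch of how the new switch would typically be turned on: nutch-default.xml values are normally overridden in conf/nutch-site.xml rather than edited in place, so a deployment that wants to crawl such sites would add an entry along the following lines (the property name is taken from the patch above; the surrounding nutch-site.xml file is assumed to already exist).

<property>
  <name>robot.rules.allow.forbidden</name>
  <value>true</value>
</property>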
Index: src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/RobotRulesParser.java
===================================================================
--- src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/RobotRulesParser.java (revision 165178)
+++ src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/RobotRulesParser.java (working copy)
@@ -50,6 +50,8 @@
public static final Logger LOG=
LogFormatter.getLogger("org.apache.nutch.fetcher.RobotRulesParser");
+ private static final boolean ALLOW_FORBIDDEN = NutchConf.get().getBoolean("robot.rules.allow.forbidden", false);  // if true, a 403 for robots.txt does not forbid the site
+
private static final String[] AGENTS = getAgents();
private static final Hashtable CACHE = new Hashtable();
@@ -378,7 +380,7 @@
if (response.getCode() == 200) // found rules: parse them
robotRules = new RobotRulesParser().parseRules(response.getContent());
- else if (response.getCode() == 403)
+ else if ((response.getCode() == 403) && !ALLOW_FORBIDDEN)
robotRules = FORBID_ALL_RULES; // use forbid all
else
robotRules = EMPTY_RULES; // use default rules
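To make the effect of the flag easier to review, here is a small standalone sketch (not part of the patch) of the rule-selection logic after the change. RobotsResponseSketch, the Rules enum and selectRules are hypothetical stand-ins for the parser's internals: FORBID_ALL stands for FORBID_ALL_RULES and EMPTY for EMPTY_RULES.

public class RobotsResponseSketch {

  enum Rules { PARSED, FORBID_ALL, EMPTY }

  /** Mirrors the branch in RobotRulesParser after this patch. */
  static Rules selectRules(int httpCode, boolean allowForbidden) {
    if (httpCode == 200)
      return Rules.PARSED;      // found robots.txt: parse and obey it
    else if (httpCode == 403 && !allowForbidden)
      return Rules.FORBID_ALL;  // default: a 403 forbids the whole site
    else
      return Rules.EMPTY;       // 404, or 403 with the override: no restrictions
  }

  public static void main(String[] args) {
    System.out.println(selectRules(403, false)); // FORBID_ALL (current behaviour)
    System.out.println(selectRules(403, true));  // EMPTY      (with robot.rules.allow.forbidden=true)
    System.out.println(selectRules(404, true));  // EMPTY      (unchanged)
  }
}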