Index: conf/nutch-default.xml
===================================================================
--- conf/nutch-default.xml (revision 1595160)
+++ conf/nutch-default.xml (working copy)
@@ -90,11 +90,18 @@
http.robots.agents
- *
- The agent strings we'll look for in robots.txt files,
- comma-separated, in decreasing order of precedence. You should
- put the value of http.agent.name as the first agent name, and keep the
- default * at the end of the list. E.g.: BlurflDev,Blurfl,*
+
+ Any other agents, apart from 'http.agent.name', that the robots
+ parser looks for in robots.txt. Multiple agents can be provided using
+ a comma as the delimiter, e.g. mybot,foo-spider,bar-crawler
+
+ The ordering of agents does NOT matter: the robots parser makes its
+ decision based on whichever agent first matches the robots rules.
+ Also, there is NO need to add a wildcard (i.e. "*") to this string, as the
+ robots parser itself takes care of a no-match situation.
+
+ If no value is specified, the HTTP agent (i.e. 'http.agent.name') alone
+ is used by default for user-agent matching by the robots parser.
Index: src/java/org/apache/nutch/fetcher/FetcherJob.java
===================================================================
--- src/java/org/apache/nutch/fetcher/FetcherJob.java (revision 1595160)
+++ src/java/org/apache/nutch/fetcher/FetcherJob.java (working copy)
@@ -255,10 +255,7 @@
}
void checkConfiguration() {
-
- // ensure that a value has been set for the agent name and that that
- // agent name is the first value in the agents we advertise for robot
- // rules parsing
+ // ensure that a value has been set for the agent name
String agentName = getConf().get("http.agent.name");
if (agentName == null || agentName.trim().length() == 0) {
String message = "Fetcher: No agents listed in 'http.agent.name'"
@@ -267,23 +264,6 @@
LOG.error(message);
}
throw new IllegalArgumentException(message);
- } else {
-
- // get all of the agents that we advertise
- String agentNames = getConf().get("http.robots.agents");
- StringTokenizer tok = new StringTokenizer(agentNames, ",");
- ArrayList agents = new ArrayList();
- while (tok.hasMoreTokens()) {
- agents.add(tok.nextToken().trim());
- }
-
- // if the first one is not equal to our agent name, log fatal and throw
- // an exception
- if (!(agents.get(0)).equalsIgnoreCase(agentName)) {
- String message = "Fetcher: Your 'http.agent.name' value should be "
- + "listed first in 'http.robots.agents' property.";
- LOG.warn(message);
- }
}
}
Index: src/java/org/apache/nutch/protocol/RobotRulesParser.java
===================================================================
--- src/java/org/apache/nutch/protocol/RobotRulesParser.java (revision 1595160)
+++ src/java/org/apache/nutch/protocol/RobotRulesParser.java (working copy)
@@ -85,43 +85,27 @@
// Grab the agent names we advertise to robots files.
String agentName = conf.get("http.agent.name");
- if (null == agentName) {
+ if (agentName == null || (agentName = agentName.trim()).isEmpty()) {
throw new RuntimeException("Agent name not configured!");
}
+ agentNames = agentName;
- String agentNames = conf.get("http.robots.agents");
- StringTokenizer tok = new StringTokenizer(agentNames, ",");
- ArrayList agents = new ArrayList();
- while (tok.hasMoreTokens()) {
- agents.add(tok.nextToken().trim());
- }
-
- /**
- * If there are no agents for robots-parsing, use the
- * default agent-string. If both are present, our agent-string
- * should be the first one we advertise to robots-parsing.
- */
- if (agents.size() == 0) {
- if (LOG.isErrorEnabled()) {
- LOG.error("No agents listed in 'http.robots.agents' property!");
+ // If there are any other agents specified, append those to the list of agents
+ String otherAgents = conf.get("http.robots.agents");
+ if(otherAgents != null && !otherAgents.trim().isEmpty()) {
+ StringTokenizer tok = new StringTokenizer(otherAgents, ",");
+ StringBuilder sb = new StringBuilder(agentNames);
+ while (tok.hasMoreTokens()) {
+ String str = tok.nextToken().trim();
+ if (str.equals("*") || str.equals(agentName)) {
+ // skip wildcard "*" or agent name itself
+ // (required for backward compatibility, cf. NUTCH-1715 and NUTCH-1718)
+ } else {
+ sb.append(",").append(str);
+ }
}
- } else {
- StringBuffer combinedAgentsString = new StringBuffer(agentName);
- int index = 0;
- if ((agents.get(0)).equalsIgnoreCase(agentName))
- index++;
- else if (LOG.isErrorEnabled()) {
- LOG.error("Agent we advertise (" + agentName
- + ") not listed first in 'http.robots.agents' property!");
- }
-
- // append all the agents from the http.robots.agents property
- for(; index < agents.size(); index++) {
- combinedAgentsString.append(", " + agents.get(index));
- }
-
- this.agentNames = combinedAgentsString.toString();
+ agentNames = sb.toString();
}
}
@@ -137,8 +121,8 @@
*
* @param url A string containing url
* @param content Contents of the robots file in a byte array
- * @param contentType The
- * @param robotName A string containing value of
+ * @param contentType The content type of the robots file
+ * @param robotName A string containing all the robot agent names used by the parser for matching
* @return BaseRobotRules object
*/
public BaseRobotRules parseRules (String url, byte[] content, String contentType, String robotName) {
@@ -160,23 +144,18 @@
/** command-line main for testing */
public static void main(String[] argv) {
- if (argv.length < 3) {
+ if (argv.length != 3) {
System.err.println("Usage: RobotRulesParser \n");
System.err.println(" - Input robots.txt file which will be parsed.");
System.err.println(" - Contains input URLs (1 per line) which are tested against the rules.");
- System.err.println(" - Input agent name. Multiple agent names can be specified using spaces.");
+ System.err.println(" - Input agent names. Multiple agent names can be provided using");
+ System.err.println(" comma as a delimiter without any spaces.");
System.exit(-1);
}
try {
- StringBuilder agentNames = new StringBuilder();
- for(int counter = 2; counter < argv.length; counter++)
- agentNames.append(argv[counter]).append(",");
-
- agentNames.deleteCharAt(agentNames.length()-1);
-
byte[] robotsBytes = Files.toByteArray(new File(argv[0]));
- BaseRobotRules rules = robotParser.parseContent(argv[0], robotsBytes, "text/plain", agentNames.toString());
+ BaseRobotRules rules = robotParser.parseContent(argv[0], robotsBytes, "text/plain", argv[2]);
LineNumberReader testsIn = new LineNumberReader(new FileReader(argv[1]));
String testPath = testsIn.readLine().trim();