Index: src/java/org/apache/nutch/protocol/RobotRulesParser.java
===================================================================
--- src/java/org/apache/nutch/protocol/RobotRulesParser.java (revision 0)
+++ src/java/org/apache/nutch/protocol/RobotRulesParser.java (working copy)
@@ -0,0 +1,191 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.protocol;
+
+// JDK imports
+import java.io.FileInputStream;
+import java.io.FileReader;
+import java.io.LineNumberReader;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.Hashtable;
+
+// Commons Logging imports
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+// Nutch imports
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configurable;
+import org.apache.hadoop.io.Text;
+
+import crawlercommons.robots.BaseRobotRules;
+import crawlercommons.robots.SimpleRobotRules;
+import crawlercommons.robots.SimpleRobotRules.RobotRulesMode;
+import crawlercommons.robots.SimpleRobotRulesParser;
+
+/**
+ * This class uses crawler-commons for handling the parsing of robots.txt files.
+ * It emits SimpleRobotRules objects, which describe the download permissions
+ * as described in SimpleRobotRulesParser.
+ *
+ * @author tejasp
+ */
+public abstract class RobotRulesParser implements Configurable {
+
+ public static final Logger LOG = LoggerFactory.getLogger(RobotRulesParser.class);
+
+ protected static final Hashtable CACHE = new Hashtable();
+
+ /**
+ * A {@link BaseRobotRules} object appropriate for use
+ * when the robots.txt file is empty or missing;
+ * all requests are allowed.
+ */
+ public static final BaseRobotRules EMPTY_RULES = new SimpleRobotRules(RobotRulesMode.ALLOW_ALL);
+
+ /**
+ * A {@link BaseRobotRules} object appropriate for use when the
+ * robots.txt file is not fetched due to a 403/Forbidden
+ * response; all requests are disallowed.
+ */
+ public static BaseRobotRules FORBID_ALL_RULES = new SimpleRobotRules(RobotRulesMode.ALLOW_NONE);
+
+ private static SimpleRobotRulesParser robotParser = new SimpleRobotRulesParser();
+ private Configuration conf;
+ protected String agentNames;
+
+ public RobotRulesParser() { }
+
+ public RobotRulesParser(Configuration conf) {
+ setConf(conf);
+ }
+
+ /**
+ * Set the {@link Configuration} object
+ */
+ public void setConf(Configuration conf) {
+ this.conf = conf;
+ agentNames = conf.get("http.robots.agents");
+ }
+
+ /**
+ * Get the {@link Configuration} object
+ */
+ public Configuration getConf() {
+ return conf;
+ }
+
+ /**
+ * Parses the robots content using the {@link SimpleRobotRulesParser} from crawler-commons
+ *
+ * @param url A string containing the URL of the robots.txt file
+ * @param content Contents of the robots file in a byte array
+ * @param contentType The content type of the robots file
+ * @param robotName A string containing the robot (agent) names used for matching
+ * @return A {@link BaseRobotRules} object holding the parsed rules
+ */
+ public BaseRobotRules parseRules (String url, byte[] content, String contentType, String robotName) {
+ return robotParser.parseContent(url, content, contentType, robotName);
+ }
+
+ public BaseRobotRules getRobotRulesSet(Protocol protocol, Text url) {
+ URL u = null;
+ try {
+ u = new URL(url.toString());
+ } catch (Exception e) {
+ return EMPTY_RULES;
+ }
+ return getRobotRulesSet(protocol, u);
+ }
+
+ public abstract BaseRobotRules getRobotRulesSet(Protocol protocol, URL url);
+
+ private final static int BUFSIZE= 2048;
+
+ /** command-line main for testing */
+ public static void main(String[] argv) {
+ if (argv.length < 3) {
+ System.out.println("Usage:");
+ System.out.println(" java +");
+ System.out.println("");
+ System.out.println("The will be parsed as a robots.txt file,");
+ System.out.println("using the given to select rules. URLs ");
+ System.out.println("will be read (one per line) from , and tested");
+ System.out.println("against the rules.");
+ System.exit(-1);
+ }
+ try {
+ FileInputStream robotsIn= new FileInputStream(argv[0]);
+ StringBuilder robotNames= new StringBuilder();
+
+ for(int i = 0; i < argv.length - 3; i++)
+ robotNames.append(argv[i+2]).append(", ");
+
+ robotNames.append(argv[argv.length - 1]);
+
+ ArrayList bufs= new ArrayList();
+ byte[] buf= new byte[BUFSIZE];
+ int totBytes= 0;
+
+ int rsize= robotsIn.read(buf);
+ while (rsize >= 0) {
+ totBytes+= rsize;
+ if (rsize != BUFSIZE) {
+ byte[] tmp= new byte[rsize];
+ System.arraycopy(buf, 0, tmp, 0, rsize);
+ bufs.add(tmp);
+ } else {
+ bufs.add(buf);
+ buf= new byte[BUFSIZE];
+ }
+ rsize= robotsIn.read(buf);
+ }
+ robotsIn.close();
+
+ byte[] robotsBytes= new byte[totBytes];
+ int pos= 0;
+
+ for (int i= 0; i < bufs.size(); i++) {
+ byte[] currBuf= (byte[]) bufs.get(i);
+ int currBufLen= currBuf.length;
+ System.arraycopy(currBuf, 0, robotsBytes, pos, currBufLen);
+ pos+= currBufLen;
+ }
+
+ BaseRobotRules rules = robotParser.parseContent(argv[0], robotsBytes, "text/plain", robotNames.toString());
+
+ System.out.println("Rules:");
+ System.out.println(rules);
+ System.out.println();
+
+ LineNumberReader testsIn = new LineNumberReader(new FileReader(argv[1]));
+ String testPath = testsIn.readLine();
+ while (testPath != null) {
+ testPath = testPath.trim();
+ System.out.println( (rules.isAllowed(testPath) ?
+ "allowed" : "not allowed")
+ + ":\t" + testPath);
+ testPath = testsIn.readLine();
+ }
+ testsIn.close();
+ } catch (Exception e) {
+ e.printStackTrace();
+ }
+ }
+}
+
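Note on the new parseRules() above: it delegates directly to crawler-commons' SimpleRobotRulesParser. A minimal standalone sketch of that call, assuming the crawler-commons jar is on the classpath (the agent name and robots.txt bytes below are purely illustrative):

```java
import crawlercommons.robots.BaseRobotRules;
import crawlercommons.robots.SimpleRobotRulesParser;

public class RobotRulesSketch {
  public static void main(String[] args) {
    // Made-up robots.txt content for a hypothetical agent "nutchbot"
    byte[] robotsTxt = "User-agent: nutchbot\nDisallow: /private/\n".getBytes();
    SimpleRobotRulesParser parser = new SimpleRobotRulesParser();
    // Same call that RobotRulesParser.parseRules() forwards to
    BaseRobotRules rules = parser.parseContent("http://example.com/robots.txt",
        robotsTxt, "text/plain", "nutchbot");
    System.out.println(rules.isAllowed("http://example.com/private/page.html")); // false
    System.out.println(rules.isAllowed("http://example.com/index.html"));        // true
  }
}
```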
Index: src/java/org/apache/nutch/protocol/RobotRules.java
===================================================================
--- src/java/org/apache/nutch/protocol/RobotRules.java (revision 1435785)
+++ src/java/org/apache/nutch/protocol/RobotRules.java (working copy)
@@ -1,44 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.protocol;
-
-import java.net.URL;
-
-/**
- * This class holds the rules which were parsed from a robots.txt file, and can
- * test paths against those rules.
- */
-public interface RobotRules {
- /**
- * Get expire time
- */
- public long getExpireTime();
-
- /**
- * Get Crawl-Delay, in milliseconds. This returns -1 if not set.
- */
- public long getCrawlDelay();
-
- /**
- * Returns false if the robots.txt file
- * prohibits us from accessing the given url, or
- * true otherwise.
- */
- public boolean isAllowed(URL url);
-
-}
Index: src/java/org/apache/nutch/protocol/Protocol.java
===================================================================
--- src/java/org/apache/nutch/protocol/Protocol.java (revision 1435785)
+++ src/java/org/apache/nutch/protocol/Protocol.java (working copy)
@@ -25,7 +25,9 @@
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.plugin.Pluggable;
+import crawlercommons.robots.BaseRobotRules;
+
/** A retriever of url content. Implemented by protocol extensions. */
public interface Protocol extends Pluggable, Configurable {
/** The name of the extension point. */
@@ -59,5 +61,6 @@
* @param datum page datum
* @return robot rules (specific for this url or default), never null
*/
- RobotRules getRobotRules(Text url, CrawlDatum datum);
+ BaseRobotRules getRobotRules(Text url, CrawlDatum datum);
}
+
Index: src/java/org/apache/nutch/fetcher/Fetcher.java
===================================================================
--- src/java/org/apache/nutch/fetcher/Fetcher.java (revision 1435785)
+++ src/java/org/apache/nutch/fetcher/Fetcher.java (working copy)
@@ -51,7 +51,9 @@
import org.apache.nutch.scoring.ScoringFilters;
import org.apache.nutch.util.*;
+import crawlercommons.robots.BaseRobotRules;
+
/**
* A queue-based fetcher.
*
@@ -668,8 +670,8 @@
}
redirecting = false;
Protocol protocol = this.protocolFactory.getProtocol(fit.url.toString());
- RobotRules rules = protocol.getRobotRules(fit.url, fit.datum);
- if (!rules.isAllowed(fit.u)) {
+ BaseRobotRules rules = protocol.getRobotRules(fit.url, fit.datum);
+ if (!rules.isAllowed(fit.u.toString())) {
// unblock
fetchQueues.finishFetchItem(fit, true);
if (LOG.isDebugEnabled()) {
@@ -1410,3 +1412,4 @@
}
}
+
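The key change in the Fetcher hunk above is the argument type: crawler-commons' BaseRobotRules.isAllowed() expects the complete URL as a String, where the old Nutch RobotRules.isAllowed() took a java.net.URL. A small sketch of the new call shape, using an ALLOW_ALL rules object as a stand-in for whatever the protocol plugin returns:

```java
import java.net.URL;

import crawlercommons.robots.BaseRobotRules;
import crawlercommons.robots.SimpleRobotRules;
import crawlercommons.robots.SimpleRobotRules.RobotRulesMode;

public class FetcherRobotsCheckSketch {
  public static void main(String[] args) throws Exception {
    URL u = new URL("http://example.com/some/page");
    // Stand-in for protocol.getRobotRules(url, datum)
    BaseRobotRules rules = new SimpleRobotRules(RobotRulesMode.ALLOW_ALL);
    // Old API: rules.isAllowed(u); new API: pass the full URL as a String
    if (!rules.isAllowed(u.toString())) {
      System.out.println("Denied by robots.txt rules: " + u);
    } else {
      System.out.println("Allowed: " + u);
    }
  }
}
```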
Index: src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java
===================================================================
--- src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java (revision 1435785)
+++ src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java (working copy)
@@ -24,30 +24,33 @@
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.hadoop.io.Text;
-import org.apache.nutch.net.protocols.HttpDateFormat;
import org.apache.nutch.net.protocols.Response;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.protocol.Content;
-import org.apache.nutch.protocol.EmptyRobotRules;
+import org.apache.nutch.protocol.RobotRulesParser;
import org.apache.nutch.protocol.Protocol;
import org.apache.nutch.protocol.ProtocolOutput;
import org.apache.nutch.protocol.ProtocolStatus;
-import org.apache.nutch.protocol.RobotRules;
+import crawlercommons.robots.BaseRobotRules;
+
import java.net.URL;
import java.io.IOException;
-/************************************
- * Ftp.java deals with ftp: scheme.
+/**
+ * This class is a protocol plugin used for ftp: scheme.
+ * It creates a {@link FtpResponse} object and gets the content of the url from it.
+ * Configurable parameters are {@code ftp.username}, {@code ftp.password},
+ * {@code ftp.content.limit}, {@code ftp.timeout},
+ * {@code ftp.server.timeout},
+ * {@code ftp.keep.connection} and {@code ftp.follow.talk}.
+ * For details see "FTP properties" section in {@code nutch-default.xml}.
*
- * Configurable parameters are defined under "FTP properties" section
- * in ./conf/nutch-default.xml or similar.
- *
* @author John Xing
- ***********************************/
+ */
public class Ftp implements Protocol {
public static final Logger LOG = LoggerFactory.getLogger(Ftp.class);
@@ -106,6 +109,15 @@
this.keepConnection = keepConnection;
}
+ /**
+ * Creates a {@link FtpResponse} object corresponding to the url and
+ * returns a {@link ProtocolOutput} object as per the content received
+ *
+ * @param url Text containing the ftp url
+ * @param datum The CrawlDatum object corresponding to the url
+ *
+ * @return {@link ProtocolOutput} object for the url
+ */
public ProtocolOutput getProtocolOutput(Text url, CrawlDatum datum) {
String urlString = url.toString();
try {
@@ -154,11 +166,11 @@
public static void main(String[] args) throws Exception {
int timeout = Integer.MIN_VALUE;
int maxContentLength = Integer.MIN_VALUE;
- String logLevel = "info";
boolean followTalk = false;
boolean keepConnection = false;
boolean dumpContent = false;
String urlString = null;
+ String logLevel;
String usage = "Usage: Ftp [-logLevel level] [-followTalk] [-keepConnection] [-timeout N] [-maxContentLength L] [-dumpContent] url";
@@ -216,7 +228,9 @@
ftp = null;
}
-
+ /**
+ * Set the {@link Configuration} object
+ */
public void setConf(Configuration conf) {
this.conf = conf;
this.maxContentLength = conf.getInt("ftp.content.limit", 64 * 1024);
@@ -228,12 +242,20 @@
this.followTalk = conf.getBoolean("ftp.follow.talk", false);
}
+ /**
+ * Get the {@link Configuration} object
+ */
public Configuration getConf() {
return this.conf;
}
- public RobotRules getRobotRules(Text url, CrawlDatum datum) {
- return EmptyRobotRules.RULES;
+ /**
+ * Currently, no robots parsing is done for the ftp protocol
+ * and this returns a set of empty rules which will allow every url.
+ * There is a JIRA issue logged for this: NUTCH-1513.
+ */
+ public BaseRobotRules getRobotRules(Text url, CrawlDatum datum) {
+ return RobotRulesParser.EMPTY_RULES;
}
+}
-}
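Both the ftp plugin above and the file plugin later in this patch satisfy the changed Protocol interface by handing back the shared RobotRulesParser.EMPTY_RULES constant. A sketch of that no-op shape against the new signature (not a full Protocol implementation; it assumes the Nutch, Hadoop and crawler-commons jars from this patch are available):

```java
import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.protocol.RobotRulesParser;

import crawlercommons.robots.BaseRobotRules;

public class NoRobotsProtocolSketch {
  // A plugin with no robots.txt support simply returns the shared ALLOW_ALL rules
  public BaseRobotRules getRobotRules(Text url, CrawlDatum datum) {
    return RobotRulesParser.EMPTY_RULES; // allows every url, nothing is fetched or cached
  }
}
```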
Index: src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java
===================================================================
--- src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java (revision 1435796)
+++ src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java (working copy)
@@ -17,8 +17,7 @@
package org.apache.nutch.protocol.http.api;
-import org.apache.nutch.protocol.http.api.RobotRulesParser.RobotRuleSet;
-
+import crawlercommons.robots.BaseRobotRules;
import junit.framework.TestCase;
public class TestRobotRulesParser extends TestCase {
@@ -264,7 +263,7 @@
}
public void testCrawlDelay() {
- RobotRulesParser p = new RobotRulesParser(new String[] { "nutchbot" });
+ HttpRobotRulesParser p = new HttpRobotRulesParser();
String delayRule1 = "User-agent: nutchbot" + CR +
"Crawl-delay: 10" + CR +
"User-agent: foobot" + CR +
@@ -275,12 +274,12 @@
"Crawl-delay: 20" + CR +
"User-agent: *" + CR +
"Disallow:/baz" + CR;
- RobotRuleSet rules = p.parseRules(delayRule1.getBytes());
+ BaseRobotRules rules = p.parseRules("testCrawlDelay", delayRule1.getBytes(), "text/plain", "nutchbot");
long crawlDelay = rules.getCrawlDelay();
assertTrue("testing crawl delay for agent nutchbot - rule 1", (crawlDelay == 10000));
- rules = p.parseRules(delayRule2.getBytes());
+ rules = p.parseRules("testCrawlDelay", delayRule2.getBytes(), "text/plain", "nutchbot");
crawlDelay = rules.getCrawlDelay();
- assertTrue("testing crawl delay for agent nutchbot - rule 2", (crawlDelay == -1));
+ assertTrue("testing crawl delay for agent nutchbot - rule 2", (crawlDelay == Long.MIN_VALUE));
}
// helper
@@ -290,19 +289,18 @@
String agentsString= agents[0];
for (int i= 1; i < agents.length; i++)
agentsString= agentsString + "," + agents[i];
- RobotRulesParser p= new RobotRulesParser(agents);
- RobotRuleSet rules= p.parseRules(ROBOTS_STRINGS[robotsString] != null
- ? ROBOTS_STRINGS[robotsString].getBytes()
- : null);
+
+ HttpRobotRulesParser p= new HttpRobotRulesParser();
+ BaseRobotRules rules= p.parseRules("FAKE_URL",
+ ROBOTS_STRINGS[robotsString] != null ? ROBOTS_STRINGS[robotsString].getBytes() : null,
+ "text/plain",
+ agentsString);
+
for (int i= 0; i < paths.length; i++) {
assertTrue("testing robots file "+robotsString+", on agents ("
+ agentsString + "), and path " + TEST_PATHS[i] + "; got "
- + rules.isAllowed(TEST_PATHS[i]) + ", rules are: " + LF
- + rules,
- rules.isAllowed(TEST_PATHS[i]) == allowed[i]);
+ + rules.isAllowed("http://example.com" + TEST_PATHS[i]) ,
+ rules.isAllowed("http://example.com" + TEST_PATHS[i]) == allowed[i]);
}
- }
-
-
-
+ }
}
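The updated crawl-delay assertion above relies on a crawler-commons convention: getCrawlDelay() reports the delay in milliseconds and returns Long.MIN_VALUE when no Crawl-delay applies to the matched agent, where the old parser returned -1. A small sketch of both cases, with made-up robots.txt content:

```java
import crawlercommons.robots.BaseRobotRules;
import crawlercommons.robots.SimpleRobotRulesParser;

public class CrawlDelaySketch {
  public static void main(String[] args) {
    SimpleRobotRulesParser parser = new SimpleRobotRulesParser();

    // Crawl-delay of 10 seconds for nutchbot -> reported as 10000 ms
    byte[] withDelay = "User-agent: nutchbot\nCrawl-delay: 10\nDisallow: /foo\n".getBytes();
    BaseRobotRules rules = parser.parseContent("http://example.com/robots.txt",
        withDelay, "text/plain", "nutchbot");
    System.out.println(rules.getCrawlDelay()); // 10000

    // No Crawl-delay for nutchbot -> reported as Long.MIN_VALUE (unset)
    byte[] noDelay = "User-agent: nutchbot\nDisallow: /baz\n".getBytes();
    rules = parser.parseContent("http://example.com/robots.txt",
        noDelay, "text/plain", "nutchbot");
    System.out.println(rules.getCrawlDelay() == Long.MIN_VALUE); // true
  }
}
```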
Index: src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java
===================================================================
--- src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java (revision 1435785)
+++ src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java (working copy)
@@ -1,581 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.protocol.http.api;
-
-// JDK imports
-import java.io.FileInputStream;
-import java.io.FileReader;
-import java.io.LineNumberReader;
-import java.io.IOException;
-import java.net.URL;
-import java.net.URLDecoder;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.Hashtable;
-import java.util.StringTokenizer;
-
-// Commons Logging imports
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-// Nutch imports
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.conf.Configurable;
-import org.apache.hadoop.io.Text;
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.net.protocols.Response;
-import org.apache.nutch.protocol.ProtocolException;
-import org.apache.nutch.protocol.RobotRules;
-
-
-/**
- * This class handles the parsing of robots.txt files.
- * It emits RobotRules objects, which describe the download permissions
- * as described in RobotRulesParser.
- *
- * @author Tom Pierce
- * @author Mike Cafarella
- * @author Doug Cutting
- */
-public class RobotRulesParser implements Configurable {
-
- public static final Logger LOG = LoggerFactory.getLogger(RobotRulesParser.class);
-
- private boolean allowForbidden = false;
-
- private static final Hashtable CACHE = new Hashtable();
-
- private static final String CHARACTER_ENCODING= "UTF-8";
- private static final int NO_PRECEDENCE= Integer.MAX_VALUE;
-
- private static final RobotRuleSet EMPTY_RULES= new RobotRuleSet();
-
- private static RobotRuleSet FORBID_ALL_RULES = getForbidAllRules();
-
- private Configuration conf;
- private HashMap robotNames;
-
- /**
- * This class holds the rules which were parsed from a robots.txt
- * file, and can test paths against those rules.
- */
- public static class RobotRuleSet implements RobotRules {
- ArrayList tmpEntries = new ArrayList();
- RobotsEntry[] entries = null;
- long expireTime;
- long crawlDelay = -1;
-
- /**
- */
- private class RobotsEntry {
- String prefix;
- boolean allowed;
-
- RobotsEntry(String prefix, boolean allowed) {
- this.prefix= prefix;
- this.allowed= allowed;
- }
- }
-
- /**
- */
- private void addPrefix(String prefix, boolean allow) {
- if (tmpEntries == null) {
- tmpEntries= new ArrayList();
- if (entries != null) {
- for (int i= 0; i < entries.length; i++)
- tmpEntries.add(entries[i]);
- }
- entries= null;
- }
-
- tmpEntries.add(new RobotsEntry(prefix, allow));
- }
-
- /**
- */
- private void clearPrefixes() {
- if (tmpEntries == null) {
- tmpEntries= new ArrayList();
- entries= null;
- } else {
- tmpEntries.clear();
- }
- }
-
- /**
- * Change when the ruleset goes stale.
- */
- public void setExpireTime(long expireTime) {
- this.expireTime = expireTime;
- }
-
- /**
- * Get expire time
- */
- public long getExpireTime() {
- return expireTime;
- }
-
- /**
- * Get Crawl-Delay, in milliseconds. This returns -1 if not set.
- */
- public long getCrawlDelay() {
- return crawlDelay;
- }
-
- /**
- * Set Crawl-Delay, in milliseconds
- */
- public void setCrawlDelay(long crawlDelay) {
- this.crawlDelay = crawlDelay;
- }
-
- /**
- * Returns false if the robots.txt file
- * prohibits us from accessing the given url, or
- * true otherwise.
- */
- public boolean isAllowed(URL url) {
- String path = url.getPath(); // check rules
- if ((path == null) || "".equals(path)) {
- path= "/";
- }
- return isAllowed(path);
- }
-
- /**
- * Returns false if the robots.txt file
- * prohibits us from accessing the given path, or
- * true otherwise.
- */
- public boolean isAllowed(String path) {
- try {
- path= URLDecoder.decode(path, CHARACTER_ENCODING);
- } catch (Exception e) {
- // just ignore it- we can still try to match
- // path prefixes
- }
-
- if (entries == null) {
- entries= new RobotsEntry[tmpEntries.size()];
- entries= (RobotsEntry[])
- tmpEntries.toArray(entries);
- tmpEntries= null;
- }
-
- int pos= 0;
- int end= entries.length;
- while (pos < end) {
- if (path.startsWith(entries[pos].prefix))
- return entries[pos].allowed;
-
- pos++;
- }
-
- return true;
- }
-
- /**
- */
- public String toString() {
- isAllowed("x"); // force String[] representation
- StringBuffer buf= new StringBuffer();
- for (int i= 0; i < entries.length; i++)
- if (entries[i].allowed)
- buf.append("Allow: " + entries[i].prefix
- + System.getProperty("line.separator"));
- else
- buf.append("Disallow: " + entries[i].prefix
- + System.getProperty("line.separator"));
- return buf.toString();
- }
- }
-
-
- RobotRulesParser() { }
-
- public RobotRulesParser(Configuration conf) {
- setConf(conf);
- }
-
-
- /* ---------------------------------- *
- * *
- * ---------------------------------- */
-
- public void setConf(Configuration conf) {
- this.conf = conf;
- allowForbidden = conf.getBoolean("http.robots.403.allow", false);
- //
- // Grab the agent names we advertise to robots files.
- //
- String agentName = conf.get("http.agent.name");
- String agentNames = conf.get("http.robots.agents");
- StringTokenizer tok = new StringTokenizer(agentNames, ",");
- ArrayList agents = new ArrayList();
- while (tok.hasMoreTokens()) {
- agents.add(tok.nextToken().trim());
- }
-
- setRobotNames((String[]) agents.toArray(new String[agents.size()]));
- }
-
- public Configuration getConf() {
- return conf;
- }
-
- /* ---------------------------------- *
- * *
- * ---------------------------------- */
-
- private void setRobotNames(String[] robotNames) {
- this.robotNames= new HashMap();
- for (int i= 0; i < robotNames.length; i++) {
- this.robotNames.put(robotNames[i].toLowerCase(), new Integer(i));
- }
- // always make sure "*" is included
- if (!this.robotNames.containsKey("*"))
- this.robotNames.put("*", new Integer(robotNames.length));
- }
-
- /**
- * Creates a new RobotRulesParser which will use the
- * supplied robotNames when choosing which stanza to
- * follow in robots.txt files. Any name in the array
- * may be matched. The order of the robotNames
- * determines the precedence- if many names are matched, only the
- * rules associated with the robot name having the smallest index
- * will be used.
- */
- RobotRulesParser(String[] robotNames) {
- setRobotNames(robotNames);
- }
-
- /**
- * Returns a {@link RobotRuleSet} object which encapsulates the
- * rules parsed from the supplied robotContent.
- */
- RobotRuleSet parseRules(byte[] robotContent) {
- if (robotContent == null)
- return EMPTY_RULES;
-
- String content= new String (robotContent);
-
- StringTokenizer lineParser= new StringTokenizer(content, "\n\r");
-
- RobotRuleSet bestRulesSoFar= null;
- int bestPrecedenceSoFar= NO_PRECEDENCE;
-
- RobotRuleSet currentRules= new RobotRuleSet();
- int currentPrecedence= NO_PRECEDENCE;
-
- boolean addRules= false; // in stanza for our robot
- boolean doneAgents= false; // detect multiple agent lines
-
- while (lineParser.hasMoreTokens()) {
- String line= lineParser.nextToken();
-
- // trim out comments and whitespace
- int hashPos= line.indexOf("#");
- if (hashPos >= 0)
- line= line.substring(0, hashPos);
- line= line.trim();
-
- if ( (line.length() >= 11)
- && (line.substring(0, 11).equalsIgnoreCase("User-agent:")) ) {
-
- if (doneAgents) {
- if (currentPrecedence < bestPrecedenceSoFar) {
- bestPrecedenceSoFar= currentPrecedence;
- bestRulesSoFar= currentRules;
- currentPrecedence= NO_PRECEDENCE;
- currentRules= new RobotRuleSet();
- }
- addRules= false;
- }
- doneAgents= false;
-
- String agentNames= line.substring(line.indexOf(":") + 1);
- agentNames= agentNames.trim();
- StringTokenizer agentTokenizer= new StringTokenizer(agentNames);
-
- while (agentTokenizer.hasMoreTokens()) {
- // for each agent listed, see if it's us:
- String agentName= agentTokenizer.nextToken().toLowerCase();
-
- Integer precedenceInt= (Integer) robotNames.get(agentName);
-
- if (precedenceInt != null) {
- int precedence= precedenceInt.intValue();
- if ( (precedence < currentPrecedence)
- && (precedence < bestPrecedenceSoFar) )
- currentPrecedence= precedence;
- }
- }
-
- if (currentPrecedence < bestPrecedenceSoFar)
- addRules= true;
-
- } else if ( (line.length() >= 9)
- && (line.substring(0, 9).equalsIgnoreCase("Disallow:")) ) {
-
- doneAgents= true;
- String path= line.substring(line.indexOf(":") + 1);
- path= path.trim();
-
- // Skip if no path was specified
- if (path.length() == 0) {
- // Go to the next token
- continue;
- }
- try {
- path= URLDecoder.decode(path, CHARACTER_ENCODING);
- } catch (Exception e) {
- if (LOG.isWarnEnabled()) {
- LOG.warn("error parsing robots rules- can't decode path: " + path);
- }
- }
-
- if (path.length() == 0) { // "empty rule"
- if (addRules)
- currentRules.clearPrefixes();
- } else { // rule with path
- if (addRules)
- currentRules.addPrefix(path, false);
- }
-
- } else if ( (line.length() >= 6)
- && (line.substring(0, 6).equalsIgnoreCase("Allow:")) ) {
-
- doneAgents= true;
- String path= line.substring(line.indexOf(":") + 1);
- path= path.trim();
-
- if (path.length() == 0) {
- // "empty rule"- treat same as empty disallow
- if (addRules)
- currentRules.clearPrefixes();
- } else { // rule with path
- if (addRules)
- currentRules.addPrefix(path, true);
- }
- } else if ( (line.length() >= 12)
- && (line.substring(0, 12).equalsIgnoreCase("Crawl-Delay:"))) {
- doneAgents = true;
- if (addRules) {
- long crawlDelay = -1;
- String delay = line.substring("Crawl-Delay:".length(), line.length()).trim();
- if (delay.length() > 0) {
- try {
- crawlDelay = Long.parseLong(delay) * 1000; // sec to millisec
- } catch (Exception e) {
- LOG.info("can not parse Crawl-Delay:" + e.toString());
- }
- currentRules.setCrawlDelay(crawlDelay);
- }
- }
- }
- }
-
- if (currentPrecedence < bestPrecedenceSoFar) {
- bestPrecedenceSoFar= currentPrecedence;
- bestRulesSoFar= currentRules;
- }
-
- if (bestPrecedenceSoFar == NO_PRECEDENCE)
- return EMPTY_RULES;
- return bestRulesSoFar;
- }
-
- /**
- * Returns a RobotRuleSet object appropriate for use
- * when the robots.txt file is empty or missing; all
- * requests are allowed.
- */
- static RobotRuleSet getEmptyRules() {
- return EMPTY_RULES;
- }
-
- /**
- * Returns a RobotRuleSet object appropriate for use
- * when the robots.txt file is not fetched due to a
- * 403/Forbidden response; all requests are
- * disallowed.
- */
- static RobotRuleSet getForbidAllRules() {
- RobotRuleSet rules= new RobotRuleSet();
- rules.addPrefix("", false);
- return rules;
- }
-
- public RobotRuleSet getRobotRulesSet(HttpBase http, Text url) {
- URL u = null;
- try {
- u = new URL(url.toString());
- } catch (Exception e) {
- return EMPTY_RULES;
- }
- return getRobotRulesSet(http, u);
- }
-
- private RobotRuleSet getRobotRulesSet(HttpBase http, URL url) {
-
- String host = url.getHost().toLowerCase(); // normalize to lower case
-
- RobotRuleSet robotRules = (RobotRuleSet)CACHE.get(host);
-
- boolean cacheRule = true;
-
- if (robotRules == null) { // cache miss
- URL redir = null;
- if (LOG.isTraceEnabled()) { LOG.trace("cache miss " + url); }
- try {
- Response response = http.getResponse(new URL(url, "/robots.txt"),
- new CrawlDatum(), true);
- // try one level of redirection ?
- if (response.getCode() == 301 || response.getCode() == 302) {
- String redirection = response.getHeader("Location");
- if (redirection == null) {
- // some versions of MS IIS are known to mangle this header
- redirection = response.getHeader("location");
- }
- if (redirection != null) {
- if (!redirection.startsWith("http")) {
- // RFC says it should be absolute, but apparently it isn't
- redir = new URL(url, redirection);
- } else {
- redir = new URL(redirection);
- }
-
- response = http.getResponse(redir, new CrawlDatum(), true);
- }
- }
-
- if (response.getCode() == 200) // found rules: parse them
- robotRules = parseRules(response.getContent());
- else if ( (response.getCode() == 403) && (!allowForbidden) )
- robotRules = FORBID_ALL_RULES; // use forbid all
- else if (response.getCode() >= 500) {
- cacheRule = false;
- robotRules = EMPTY_RULES;
- }else
- robotRules = EMPTY_RULES; // use default rules
- } catch (Throwable t) {
- if (LOG.isInfoEnabled()) {
- LOG.info("Couldn't get robots.txt for " + url + ": " + t.toString());
- }
- cacheRule = false;
- robotRules = EMPTY_RULES;
- }
-
- if (cacheRule) {
- CACHE.put(host, robotRules); // cache rules for host
- if (redir != null && !redir.getHost().equals(host)) {
- // cache also for the redirected host
- CACHE.put(redir.getHost(), robotRules);
- }
- }
- }
- return robotRules;
- }
-
- public boolean isAllowed(HttpBase http, URL url)
- throws ProtocolException, IOException {
- String path = url.getPath(); // check rules
- if ((path == null) || "".equals(path)) {
- path= "/";
- }
-
- return getRobotRulesSet(http, url).isAllowed(path);
- }
-
- public long getCrawlDelay(HttpBase http, URL url)
- throws ProtocolException, IOException {
- return getRobotRulesSet(http, url).getCrawlDelay();
- }
-
- private final static int BUFSIZE= 2048;
-
- /** command-line main for testing */
- public static void main(String[] argv) {
- if (argv.length < 3) {
- System.out.println("Usage:");
- System.out.println(" java +");
- System.out.println("");
- System.out.println("The will be parsed as a robots.txt file,");
- System.out.println("using the given to select rules. URLs ");
- System.out.println("will be read (one per line) from , and tested");
- System.out.println("against the rules.");
- System.exit(-1);
- }
- try {
- FileInputStream robotsIn= new FileInputStream(argv[0]);
- LineNumberReader testsIn= new LineNumberReader(new FileReader(argv[1]));
- String[] robotNames= new String[argv.length - 2];
-
- for (int i= 0; i < argv.length - 2; i++)
- robotNames[i]= argv[i+2];
-
- ArrayList bufs= new ArrayList();
- byte[] buf= new byte[BUFSIZE];
- int totBytes= 0;
-
- int rsize= robotsIn.read(buf);
- while (rsize >= 0) {
- totBytes+= rsize;
- if (rsize != BUFSIZE) {
- byte[] tmp= new byte[rsize];
- System.arraycopy(buf, 0, tmp, 0, rsize);
- bufs.add(tmp);
- } else {
- bufs.add(buf);
- buf= new byte[BUFSIZE];
- }
- rsize= robotsIn.read(buf);
- }
-
- byte[] robotsBytes= new byte[totBytes];
- int pos= 0;
-
- for (int i= 0; i < bufs.size(); i++) {
- byte[] currBuf= (byte[]) bufs.get(i);
- int currBufLen= currBuf.length;
- System.arraycopy(currBuf, 0, robotsBytes, pos, currBufLen);
- pos+= currBufLen;
- }
-
- RobotRulesParser parser=
- new RobotRulesParser(robotNames);
- RobotRuleSet rules= parser.parseRules(robotsBytes);
- System.out.println("Rules:");
- System.out.println(rules);
- System.out.println();
-
- String testPath= testsIn.readLine().trim();
- while (testPath != null) {
- System.out.println( (rules.isAllowed(new URL(testPath)) ?
- "allowed" : "not allowed")
- + ":\t" + testPath);
- testPath= testsIn.readLine();
- }
-
- } catch (Exception e) {
- e.printStackTrace();
- }
- }
-
-}
Index: src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
===================================================================
--- src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java (revision 0)
+++ src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java (working copy)
@@ -0,0 +1,128 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.protocol.http.api;
+
+import java.net.URL;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.RobotRulesParser;
+
+import crawlercommons.robots.BaseRobotRules;
+import crawlercommons.robots.SimpleRobotRules;
+
+/**
+ * This class is used for parsing robots.txt files for urls belonging to the HTTP protocol.
+ * It extends the generic {@link RobotRulesParser} class and contains
+ * Http protocol specific implementation for obtaining the robots file.
+ *
+ * @author tejasp
+ */
+public class HttpRobotRulesParser extends RobotRulesParser {
+
+ public static final Logger LOG = LoggerFactory.getLogger(HttpRobotRulesParser.class);
+ protected boolean allowForbidden = false;
+
+ HttpRobotRulesParser() { }
+
+ public HttpRobotRulesParser(Configuration conf) {
+ super(conf);
+ allowForbidden = conf.getBoolean("http.robots.403.allow", false);
+ }
+
+ /**
+ * Get the robot rules for the given url. Rules are cached per host:
+ * for a host whose rules are not yet cached, an Http request is sent for
+ * the host's robots.txt file, the rules are parsed and the resulting
+ * rules object is cached to avoid re-work in future.
+ *
+ * @param http The {@link Protocol} object
+ * @param url URL
+ *
+ * @return A {@link BaseRobotRules} object for the rules
+ */
+ public BaseRobotRules getRobotRulesSet(Protocol http, URL url) {
+
+ String protocol = url.getProtocol().toLowerCase(); // normalize to lower case
+ String host = url.getHost().toLowerCase(); // normalize to lower case
+
+ BaseRobotRules robotRules = (SimpleRobotRules)CACHE.get(protocol + ":" + host);
+
+ boolean cacheRule = true;
+
+ if (robotRules == null) { // cache miss
+ URL redir = null;
+ if (LOG.isTraceEnabled()) { LOG.trace("cache miss " + url); }
+ try {
+ Response response = ((HttpBase)http).getResponse(new URL(url, "/robots.txt"),
+ new CrawlDatum(), true);
+ // try one level of redirection ?
+ if (response.getCode() == 301 || response.getCode() == 302) {
+ String redirection = response.getHeader("Location");
+ if (redirection == null) {
+ // some versions of MS IIS are known to mangle this header
+ redirection = response.getHeader("location");
+ }
+ if (redirection != null) {
+ if (!redirection.startsWith("http")) {
+ // RFC says it should be absolute, but apparently it isn't
+ redir = new URL(url, redirection);
+ } else {
+ redir = new URL(redirection);
+ }
+
+ response = ((HttpBase)http).getResponse(redir, new CrawlDatum(), true);
+ }
+ }
+
+ if (response.getCode() == 200) // found rules: parse them
+ robotRules = parseRules(url.toString(), response.getContent(),
+ response.getHeader("Content-Type"),
+ agentNames);
+
+ else if ( (response.getCode() == 403) && (!allowForbidden) )
+ robotRules = FORBID_ALL_RULES; // use forbid all
+ else if (response.getCode() >= 500) {
+ cacheRule = false;
+ robotRules = EMPTY_RULES;
+ } else
+ robotRules = EMPTY_RULES; // use default rules
+ } catch (Throwable t) {
+ if (LOG.isInfoEnabled()) {
+ LOG.info("Couldn't get robots.txt for " + url + ": " + t.toString());
+ }
+ cacheRule = false;
+ robotRules = EMPTY_RULES;
+ }
+
+ if (cacheRule) {
+ CACHE.put(protocol + ":" + host, robotRules); // cache rules for host
+ if (redir != null && !redir.getHost().equals(host)) {
+ // cache also for the redirected host
+ CACHE.put(protocol + ":" + redir.getHost(), robotRules);
+ }
+ }
+ }
+ return robotRules;
+ }
+}
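getRobotRulesSet() above caches parsed rules under a "protocol:host" key so robots.txt is fetched at most once per host (plus once for a redirected host). A simplified sketch of that lookup pattern, with the fetch-and-parse step stubbed out by ALLOW_ALL rules; the raw Hashtable mirrors the CACHE field in RobotRulesParser:

```java
import java.net.URL;
import java.util.Hashtable;

import crawlercommons.robots.BaseRobotRules;
import crawlercommons.robots.SimpleRobotRules;
import crawlercommons.robots.SimpleRobotRules.RobotRulesMode;

public class RobotsCacheSketch {
  private static final Hashtable<String, BaseRobotRules> CACHE =
      new Hashtable<String, BaseRobotRules>();

  static BaseRobotRules rulesFor(URL url) {
    String key = url.getProtocol().toLowerCase() + ":" + url.getHost().toLowerCase();
    BaseRobotRules rules = CACHE.get(key);
    if (rules == null) {
      // In HttpRobotRulesParser this is where /robots.txt is fetched and parsed
      rules = new SimpleRobotRules(RobotRulesMode.ALLOW_ALL);
      CACHE.put(key, rules);
    }
    return rules;
  }

  public static void main(String[] args) throws Exception {
    BaseRobotRules first = rulesFor(new URL("http://example.com/a"));
    BaseRobotRules second = rulesFor(new URL("http://example.com/b"));
    System.out.println(first == second); // true: same host, cached rules reused
  }
}
```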
Index: src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
===================================================================
--- src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java (revision 1435785)
+++ src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java (working copy)
@@ -32,15 +32,16 @@
import org.apache.nutch.protocol.ProtocolException;
import org.apache.nutch.protocol.ProtocolOutput;
import org.apache.nutch.protocol.ProtocolStatus;
-import org.apache.nutch.protocol.RobotRules;
import org.apache.nutch.util.GZIPUtils;
import org.apache.nutch.util.DeflateUtils;
-
// Hadoop imports
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
+// crawler-commons imports
+import crawlercommons.robots.BaseRobotRules;
+
/**
* @author Jérôme Charron
*/
@@ -51,7 +52,7 @@
private static final byte[] EMPTY_CONTENT = new byte[0];
- private RobotRulesParser robots = null;
+ private HttpRobotRulesParser robots = null;
/** The proxy hostname. */
protected String proxyHost = null;
@@ -105,7 +106,7 @@
if (logger != null) {
this.logger = logger;
}
- robots = new RobotRulesParser();
+ robots = new HttpRobotRulesParser();
}
// Inherited Javadoc
@@ -138,7 +139,6 @@
String urlString = url.toString();
try {
URL u = new URL(urlString);
- String host = null;
Response response = getResponse(u, datum, false); // make a request
int code = response.getCode();
@@ -381,18 +381,17 @@
System.out.println("Content:");
String text = new String(content.getContent());
System.out.println(text);
- }
-
+ }
}
-
protected abstract Response getResponse(URL url,
CrawlDatum datum,
boolean followRedirects)
throws ProtocolException, IOException;
- public RobotRules getRobotRules(Text url, CrawlDatum datum) {
+ public BaseRobotRules getRobotRules(Text url, CrawlDatum datum) {
return robots.getRobotRulesSet(this, url);
}
}
+
Index: src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java
===================================================================
--- src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java (revision 1435785)
+++ src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java (working copy)
@@ -17,35 +17,33 @@
package org.apache.nutch.protocol.file;
+import java.net.URL;
+
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+
import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.hadoop.io.Text;
-import org.apache.nutch.metadata.Metadata;
-import org.apache.nutch.net.protocols.HttpDateFormat;
import org.apache.nutch.net.protocols.Response;
-
-import org.apache.hadoop.conf.Configuration;
-
import org.apache.nutch.protocol.Content;
-import org.apache.nutch.protocol.EmptyRobotRules;
import org.apache.nutch.protocol.Protocol;
import org.apache.nutch.protocol.ProtocolOutput;
import org.apache.nutch.protocol.ProtocolStatus;
-import org.apache.nutch.protocol.RobotRules;
+import org.apache.nutch.protocol.RobotRulesParser;
import org.apache.nutch.util.NutchConfiguration;
-import java.net.URL;
+import crawlercommons.robots.BaseRobotRules;
-/************************************
- * File.java deals with file: scheme.
+/**
+ * This class is a protocol plugin used for file: scheme.
+ * It creates a {@link FileResponse} object and gets the content of the url from it.
+ * Configurable parameters are {@code file.content.limit} and {@code file.crawl.parent},
+ * defined under the "file properties" section in {@code nutch-default.xml}.
*
- * Configurable parameters are defined under "FILE properties" section
- * in ./conf/nutch-default.xml or similar.
- *
* @author John Xing
- ***********************************/
+ */
public class File implements Protocol {
public static final Logger LOG = LoggerFactory.getLogger(File.class);
@@ -57,13 +55,40 @@
private Configuration conf;
- // constructor
- public File() {
+ public File() {}
+
+ /**
+ * Set the {@link Configuration} object
+ */
+ public void setConf(Configuration conf) {
+ this.conf = conf;
+ this.maxContentLength = conf.getInt("file.content.limit", 64 * 1024);
+ this.crawlParents = conf.getBoolean("file.crawl.parent", true);
}
- /** Set the point at which content is truncated. */
- public void setMaxContentLength(int length) {maxContentLength = length;}
+ /**
+ * Get the {@link Configuration} object
+ */
+ public Configuration getConf() {
+ return this.conf;
+ }
+
+ /**
+ * Set the length after which content is truncated.
+ */
+ public void setMaxContentLength(int maxContentLength) {
+ this.maxContentLength = maxContentLength;
+ }
+ /**
+ * Creates a {@link FileResponse} object corresponding to the url and
+ * returns a {@link ProtocolOutput} object as per the content received
+ *
+ * @param url Text containing the url
+ * @param datum The CrawlDatum object corresponding to the url
+ *
+ * @return {@link ProtocolOutput} object for the content of the file indicated by url
+ */
public ProtocolOutput getProtocolOutput(Text url, CrawlDatum datum) {
String urlString = url.toString();
try {
@@ -99,11 +124,9 @@
}
}
-// protected void finalize () {
-// // nothing here
-// }
-
- /** For debugging. */
+ /**
+ * Quick way for running this class. Useful for debugging.
+ */
public static void main(String[] args) throws Exception {
int maxContentLength = Integer.MIN_VALUE;
String logLevel = "info";
@@ -154,17 +177,12 @@
file = null;
}
- public void setConf(Configuration conf) {
- this.conf = conf;
- this.maxContentLength = conf.getInt("file.content.limit", 64 * 1024);
- this.crawlParents = conf.getBoolean("file.crawl.parent", true);
+ /**
+ * No robots parsing is done for the file protocol,
+ * so this returns a set of empty rules which will allow every url.
+ */
+ public BaseRobotRules getRobotRules(Text url, CrawlDatum datum) {
+ return RobotRulesParser.EMPTY_RULES;
}
+}
- public Configuration getConf() {
- return this.conf;
- }
-
- public RobotRules getRobotRules(Text url, CrawlDatum datum) {
- return EmptyRobotRules.RULES;
- }
-}