Index: conf/nutch-default.xml
===================================================================
--- conf/nutch-default.xml (revision 1292175)
+++ conf/nutch-default.xml (working copy)
@@ -909,6 +909,14 @@
+<property>
+ <name>parser.skip.truncated</name>
+ <value>true</value>
+ <description>Boolean value for whether we should skip parsing for truncated documents. By default this
+ property is enabled because parsing such documents can sometimes consume extremely high levels of CPU.
+ </description>
+</property>
+
Index: src/java/org/apache/nutch/fetcher/FetcherReducer.java
===================================================================
--- src/java/org/apache/nutch/fetcher/FetcherReducer.java (revision 1292175)
+++ src/java/org/apache/nutch/fetcher/FetcherReducer.java (working copy)
@@ -43,6 +43,7 @@
import org.apache.nutch.net.URLFilters;
import org.apache.nutch.net.URLNormalizers;
import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.parse.ParserJob;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.protocol.Protocol;
import org.apache.nutch.protocol.ProtocolFactory;
@@ -81,6 +82,7 @@
private boolean parse;
private ParseUtil parseUtil;
+ private boolean skipTruncated;
/**
* This class described the item to be fetched.
@@ -604,10 +606,14 @@
String key = TableUtil.reverseUrl(fit.url);
if (parse) {
- URLWebPage redirectedPage = parseUtil.process(key, fit.page);
- if (redirectedPage != null) {
- context.write(TableUtil.reverseUrl(redirectedPage.getUrl()),
- redirectedPage.getDatum());
+ // Parse unless truncation skipping is enabled and this page is truncated;
+ // with parser.skip.truncated=false every fetched page must still be parsed.
+ if (!skipTruncated || !ParserJob.isTruncated(fit.url, fit.page)) {
+ URLWebPage redirectedPage = parseUtil.process(key, fit.page);
+ if (redirectedPage != null) {
+ context.write(TableUtil.reverseUrl(redirectedPage.getUrl()),
+ redirectedPage.getDatum());
+ }
}
}
context.write(key, fit.page);
@@ -723,6 +729,7 @@
int threadCount = conf.getInt("fetcher.threads.fetch", 10);
parse = conf.getBoolean(FetcherJob.PARSE_KEY, false);
if (parse) {
+ skipTruncated=conf.getBoolean(ParserJob.SKIP_TRUNCATED, true);
parseUtil = new ParseUtil(conf);
}
LOG.info("Fetcher: threads: " + threadCount);
Index: src/java/org/apache/nutch/parse/ParserJob.java
===================================================================
--- src/java/org/apache/nutch/parse/ParserJob.java (revision 1292175)
+++ src/java/org/apache/nutch/parse/ParserJob.java (working copy)
@@ -17,6 +17,7 @@
package org.apache.nutch.parse;
import java.io.IOException;
+import java.nio.ByteBuffer;
import java.util.Collection;
import java.util.HashSet;
import java.util.Map;
@@ -31,6 +32,7 @@
import org.apache.nutch.crawl.GeneratorJob;
import org.apache.nutch.crawl.SignatureFactory;
import org.apache.nutch.crawl.URLWebPage;
+import org.apache.nutch.metadata.HttpHeaders;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.storage.Mark;
import org.apache.nutch.storage.ParseStatus;
@@ -40,6 +42,7 @@
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.NutchTool;
+import org.apache.nutch.util.StringUtil;
import org.apache.nutch.util.TableUtil;
import org.apache.nutch.util.ToolUtil;
import org.apache.gora.mapreduce.GoraMapper;
@@ -50,6 +53,8 @@
private static final String RESUME_KEY = "parse.job.resume";
private static final String FORCE_KEY = "parse.job.force";
+ /** Configuration key; when true, parsing is skipped for truncated documents. */
+ public static final String SKIP_TRUNCATED = "parser.skip.truncated";
private static final Collection FIELDS = new HashSet();
@@ -64,6 +69,7 @@
FIELDS.add(WebPage.Field.PARSE_STATUS);
FIELDS.add(WebPage.Field.OUTLINKS);
FIELDS.add(WebPage.Field.METADATA);
+ FIELDS.add(WebPage.Field.HEADERS);
}
@@ -76,6 +82,8 @@
private boolean force;
private Utf8 batchId;
+
+ private boolean skipTruncated;
@Override
public void setup(Context context) throws IOException {
@@ -84,6 +92,7 @@
shouldResume = conf.getBoolean(RESUME_KEY, false);
force = conf.getBoolean(FORCE_KEY, false);
batchId = new Utf8(conf.get(GeneratorJob.BATCH_ID, Nutch.ALL_BATCH_ID_STR));
+ skipTruncated=conf.getBoolean(SKIP_TRUNCATED, true);
}
@Override
@@ -106,6 +115,13 @@
LOG.info("Parsing " + unreverseKey);
}
+ // Parsing truncated content can be very CPU-intensive, so bail out early
+ // when skipping is enabled and this page's content is truncated.
+ if (skipTruncated && isTruncated(unreverseKey, page)) {
+ return;
+ }
+
+
URLWebPage redirectedPage = parseUtil.process(key, page);
ParseStatus pstatus = page.getParseStatus();
if (pstatus != null) {
@@ -128,6 +144,42 @@
public ParserJob(Configuration conf) {
setConf(conf);
}
+
+ /**
+ * Checks whether the stored content is shorter than the Content-Length header announces.
+ * @param url URL of the page; only used in log messages
+ * @param page page whose content buffer and headers are inspected
+ * @return {@code true} if the page is truncated; {@code false} when it is not, or when
+ * that could not be determined (no content, or a missing, empty or malformed header).
+ */
+ public static boolean isTruncated(String url, WebPage page) {
+ ByteBuffer content = page.getContent();
+ if (content == null) {
+ return false;
+ }
+ Utf8 lengthUtf8 = page.getFromHeaders(new Utf8(HttpHeaders.CONTENT_LENGTH));
+ if (lengthUtf8 == null) {
+ return false;
+ }
+ String lengthStr = lengthUtf8.toString().trim();
+ if (StringUtil.isEmpty(lengthStr)) {
+ return false;
+ }
+ long contentLength; // long: a Content-Length above Integer.MAX_VALUE is still a valid header
+ try {
+ contentLength = Long.parseLong(lengthStr);
+ } catch (NumberFormatException e) {
+ LOG.warn("Wrong contentlength format for " + url, e);
+ return false;
+ }
+ if (contentLength > content.limit()) { // NOTE(review): assumes limit() is the stored byte count
+ LOG.info(url + " skipped. Content of size " + contentLength
+ + " was truncated to " + content.limit());
+ return true;
+ }
+ LOG.info(url + " actual=" + content.limit() + " inHeader=" + contentLength);
+ return false;
+ }
public Collection getFields(Job job) {
Configuration conf = job.getConfiguration();