Index: conf/nutch-default.xml
===================================================================
--- conf/nutch-default.xml	(revision 1292175)
+++ conf/nutch-default.xml	(working copy)
@@ -909,6 +909,14 @@
   </description>
 </property>
 
+<property>
+  <name>parser.skip.truncated</name>
+  <value>true</value>
+  <description>Boolean value for whether we should skip parsing for truncated documents. By default this 
+  property is activated because parsing truncated documents can sometimes consume extremely high levels of CPU.
+  </description>
+</property>
+
 <!-- urlfilter plugin properties -->
 
 <property>
Index: src/java/org/apache/nutch/fetcher/FetcherReducer.java
===================================================================
--- src/java/org/apache/nutch/fetcher/FetcherReducer.java	(revision 1292175)
+++ src/java/org/apache/nutch/fetcher/FetcherReducer.java	(working copy)
@@ -43,6 +43,7 @@
 import org.apache.nutch.net.URLFilters;
 import org.apache.nutch.net.URLNormalizers;
 import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.parse.ParserJob;
 import org.apache.nutch.protocol.Content;
 import org.apache.nutch.protocol.Protocol;
 import org.apache.nutch.protocol.ProtocolFactory;
@@ -81,6 +82,7 @@
   private boolean parse;
 
   private ParseUtil parseUtil;
+  private boolean skipTruncated;
 
   /**
    * This class described the item to be fetched.
@@ -604,10 +606,14 @@
       String key = TableUtil.reverseUrl(fit.url);
 
       if (parse) {
-        URLWebPage redirectedPage = parseUtil.process(key, fit.page);
-        if (redirectedPage != null) {
-          context.write(TableUtil.reverseUrl(redirectedPage.getUrl()),
-                        redirectedPage.getDatum());
+        // Parse unless skipping of truncated content is enabled and this
+        // page's content is actually truncated.
+        if (!skipTruncated || !ParserJob.isTruncated(fit.url, fit.page)) {
+          URLWebPage redirectedPage = parseUtil.process(key, fit.page);
+          if (redirectedPage != null) {
+            context.write(TableUtil.reverseUrl(redirectedPage.getUrl()),
+                          redirectedPage.getDatum());
+          }
         }
       }
       context.write(key, fit.page);
@@ -723,6 +729,7 @@
     int threadCount = conf.getInt("fetcher.threads.fetch", 10);
     parse = conf.getBoolean(FetcherJob.PARSE_KEY, false);
     if (parse) {
+      skipTruncated=conf.getBoolean(ParserJob.SKIP_TRUNCATED, true);
       parseUtil = new ParseUtil(conf);
     }
     LOG.info("Fetcher: threads: " + threadCount);
Index: src/java/org/apache/nutch/parse/ParserJob.java
===================================================================
--- src/java/org/apache/nutch/parse/ParserJob.java	(revision 1292175)
+++ src/java/org/apache/nutch/parse/ParserJob.java	(working copy)
@@ -17,6 +17,7 @@
 package org.apache.nutch.parse;
 
 import java.io.IOException;
+import java.nio.ByteBuffer;
 import java.util.Collection;
 import java.util.HashSet;
 import java.util.Map;
@@ -31,6 +32,7 @@
 import org.apache.nutch.crawl.GeneratorJob;
 import org.apache.nutch.crawl.SignatureFactory;
 import org.apache.nutch.crawl.URLWebPage;
+import org.apache.nutch.metadata.HttpHeaders;
 import org.apache.nutch.metadata.Nutch;
 import org.apache.nutch.storage.Mark;
 import org.apache.nutch.storage.ParseStatus;
@@ -40,6 +42,7 @@
 import org.apache.nutch.util.NutchConfiguration;
 import org.apache.nutch.util.NutchJob;
 import org.apache.nutch.util.NutchTool;
+import org.apache.nutch.util.StringUtil;
 import org.apache.nutch.util.TableUtil;
 import org.apache.nutch.util.ToolUtil;
 import org.apache.gora.mapreduce.GoraMapper;
@@ -50,6 +53,8 @@
 
   private static final String RESUME_KEY = "parse.job.resume";
   private static final String FORCE_KEY = "parse.job.force";
+  
+  public static final String SKIP_TRUNCATED = "parser.skip.truncated";
 
   private static final Collection<WebPage.Field> FIELDS = new HashSet<WebPage.Field>();
 
@@ -64,6 +69,7 @@
     FIELDS.add(WebPage.Field.PARSE_STATUS);
     FIELDS.add(WebPage.Field.OUTLINKS);
     FIELDS.add(WebPage.Field.METADATA);
+    FIELDS.add(WebPage.Field.HEADERS);
   }
 
 
@@ -76,6 +82,8 @@
     private boolean force;
 
     private Utf8 batchId;
+
+    private boolean skipTruncated;
     
     @Override
     public void setup(Context context) throws IOException {
@@ -84,6 +92,7 @@
       shouldResume = conf.getBoolean(RESUME_KEY, false);
       force = conf.getBoolean(FORCE_KEY, false);
       batchId = new Utf8(conf.get(GeneratorJob.BATCH_ID, Nutch.ALL_BATCH_ID_STR));
+      skipTruncated=conf.getBoolean(SKIP_TRUNCATED, true);
     }
 
     @Override
@@ -106,6 +115,13 @@
         LOG.info("Parsing " + unreverseKey);
       }
 
+      if (skipTruncated) {
+        if (isTruncated(unreverseKey, page)) {
+          return;
+        }
+      }
+      
+
       URLWebPage redirectedPage = parseUtil.process(key, page);
       ParseStatus pstatus = page.getParseStatus();
       if (pstatus != null) {
@@ -128,6 +144,42 @@
   public ParserJob(Configuration conf) {
     setConf(conf);
   }
+  
+  /**
+   * Checks if the stored content is shorter than the Content-Length header.
+   * @param url the page URL; only used in log messages
+   * @param page the page whose content and headers are inspected
+   * @return <code>true</code> if the page is truncated; <code>false</code> if
+   * it is not, or if that could not be determined.
+   */
+  public static boolean isTruncated(String url, WebPage page) {
+    ByteBuffer content = page.getContent();
+    if (content == null) {
+      return false;
+    }
+    Utf8 lengthUtf8 = page.getFromHeaders(new Utf8(HttpHeaders.CONTENT_LENGTH));
+    if (lengthUtf8 == null) {
+      return false;
+    }
+    String lengthStr = lengthUtf8.toString().trim();
+    if (StringUtil.isEmpty(lengthStr)) {
+      return false;
+    }
+    long contentLength; // long: announced sizes can exceed Integer.MAX_VALUE
+    try {
+      contentLength = Long.parseLong(lengthStr);
+    } catch (NumberFormatException e) {
+      LOG.warn("Wrong contentlength format for " + url, e);
+      return false;
+    }
+    if (contentLength > content.limit()) {
+      LOG.info(url + " skipped. Content of size " + contentLength
+          + " was truncated to " + content.limit());
+      return true;
+    }
+    LOG.debug(url + " actual=" + content.limit() + " inHeader=" + contentLength);
+    return false;
+  }
 
   public Collection<WebPage.Field> getFields(Job job) {
     Configuration conf = job.getConfiguration();
