From 3dd213f7de9ee259c49fa7761ad9064a48e25a5c Mon Sep 17 00:00:00 2001
From: Asitang Mishra <asitang@gmail.com>
Date: Mon, 6 Apr 2015 18:29:22 -0700
Subject: [PATCH] NUTCH-1854 Skip the parse job when the segment has already been parsed

---
 src/java/org/apache/nutch/parse/ParseSegment.java | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/src/java/org/apache/nutch/parse/ParseSegment.java b/src/java/org/apache/nutch/parse/ParseSegment.java
index e1360df..67fa915 100644
--- a/src/java/org/apache/nutch/parse/ParseSegment.java
+++ b/src/java/org/apache/nutch/parse/ParseSegment.java
@@ -33,6 +33,7 @@ import org.apache.nutch.protocol.*;
 import org.apache.nutch.scoring.ScoringFilterException;
 import org.apache.nutch.scoring.ScoringFilters;
 import org.apache.nutch.util.*;
+import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 
 import java.io.*;
@@ -201,11 +202,12 @@ public class ParseSegment extends Configured implements Tool,
 
     SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
     long start = System.currentTimeMillis();
+    FileSystem fs = FileSystem.get(getConf());
     if (LOG.isInfoEnabled()) {
       LOG.info("ParseSegment: starting at " + sdf.format(start));
       LOG.info("ParseSegment: segment: " + segment);
     }
-
+    if (!fs.exists(new Path(segment, CrawlDatum.PARSE_DIR_NAME))){
     JobConf job = new NutchJob(getConf());
     job.setJobName("parse " + segment);
 
@@ -221,6 +223,10 @@ public class ParseSegment extends Configured implements Tool,
     job.setOutputValueClass(ParseImpl.class);
 
     JobClient.runJob(job);
+  }
+  else{
+    LOG.info("Segment already parsed!! Skipping parsing this segment!!"); //NUTCH-1854
+  }
     long end = System.currentTimeMillis();
     LOG.info("ParseSegment: finished at " + sdf.format(end) + ", elapsed: "
         + TimingUtil.elapsedTime(start, end));
-- 
1.9.5 (Apple Git-50.3)

