Index: conf/nutch-default.xml
===================================================================
--- conf/nutch-default.xml (revision 1327031)
+++ conf/nutch-default.xml (working copy)
@@ -867,6 +867,13 @@
+<property>
+  <name>indexer.skip.notmodified</name>
+  <value>false</value>
+  <description>Whether the indexer will skip records with a db_notmodified status.
+  </description>
+</property>
+
Index: src/java/org/apache/nutch/indexer/IndexerMapReduce.java
===================================================================
--- src/java/org/apache/nutch/indexer/IndexerMapReduce.java (revision 1327031)
+++ src/java/org/apache/nutch/indexer/IndexerMapReduce.java (working copy)
@@ -55,7 +55,9 @@
public static final Logger LOG = LoggerFactory.getLogger(IndexerMapReduce.class);
public static final String INDEXER_DELETE = "indexer.delete";
+ public static final String INDEXER_SKIP_NOTMODIFIED = "indexer.skip.notmodified";
+ private boolean skip = false;
private boolean delete = false;
private IndexingFilters filters;
private ScoringFilters scfilters;
@@ -87,8 +89,15 @@
inlinks = (Inlinks)value;
} else if (value instanceof CrawlDatum) {
final CrawlDatum datum = (CrawlDatum)value;
- if (CrawlDatum.hasDbStatus(datum))
+ if (CrawlDatum.hasDbStatus(datum)) {
dbDatum = datum;
+
+ // Whether to skip DB_NOTMODIFIED pages
+ if (skip && dbDatum.getStatus() == CrawlDatum.STATUS_DB_NOTMODIFIED) {
+ reporter.incrCounter("IndexerStatus", "Skipped", 1);
+ return;
+ }
+ }
else if (CrawlDatum.hasFetchStatus(datum)) {
// don't index unmodified (empty) pages
@@ -104,14 +113,14 @@
NutchIndexAction action = new NutchIndexAction(null, NutchIndexAction.DELETE);
output.collect(key, action);
- continue;
+ return;
}
if (fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_REDIR_PERM) {
reporter.incrCounter("IndexerStatus", "Perm redirects deleted", 1);
NutchIndexAction action = new NutchIndexAction(null, NutchIndexAction.DELETE);
output.collect(key, action);
- continue;
+ return;
}
}
}