Index: conf/nutch-default.xml
===================================================================
--- conf/nutch-default.xml (revision 1352784)
+++ conf/nutch-default.xml (working copy)
@@ -469,6 +469,14 @@
+ db.injector.overwrite
+ false
+ Whether existing records in the CrawlDB will be overwritten
+ by injected records.
+
+
+
+
db.score.injected
1.0
The score of new pages added by the injector.
@@ -900,6 +908,12 @@
+ indexer.add.domain
+ false
+ Whether to add the domain field to a NutchDocument.
+
+
+
indexer.skip.notmodified
false
Whether the indexer will skip records with a db_notmodified status.
Index: src/java/org/apache/nutch/crawl/Injector.java
===================================================================
--- src/java/org/apache/nutch/crawl/Injector.java (revision 1352784)
+++ src/java/org/apache/nutch/crawl/Injector.java (working copy)
@@ -152,7 +152,11 @@
/** Combine multiple new entries for a url. */
public static class InjectReducer implements Reducer {
- public void configure(JobConf job) {}
+ private boolean overwrite = false;
+ /** Reads the boolean "db.injector.overwrite" setting from the job; false when it is not set. */
+ public void configure(JobConf job) {
+ this.overwrite = job.getBoolean("db.injector.overwrite", false);
+ }
+
public void close() {}
private CrawlDatum old = new CrawlDatum();
@@ -162,19 +166,25 @@
OutputCollector output, Reporter reporter)
throws IOException {
boolean oldSet = false;
+ boolean injectedSet = false;
while (values.hasNext()) {
CrawlDatum val = values.next();
if (val.getStatus() == CrawlDatum.STATUS_INJECTED) {
injected.set(val);
injected.setStatus(CrawlDatum.STATUS_DB_UNFETCHED);
+ injectedSet = true;
} else {
old.set(val);
oldSet = true;
}
}
CrawlDatum res = null;
- if (oldSet) res = old; // don't overwrite existing value
- else res = injected;
+
+ if ((injectedSet && overwrite) || (injectedSet && !oldSet)) {
+ res = injected;
+ } else {
+ res = old;
+ }
output.collect(key, res);
}