Index: conf/nutch-default.xml
===================================================================
--- conf/nutch-default.xml (revision 1354052)
+++ conf/nutch-default.xml (working copy)
@@ -469,6 +469,22 @@
+ db.injector.overwrite
+ false
+ Whether existing records in the CrawlDB will be overwritten
+ by injected records.
+
+
+
+
+ db.injector.preserve.metadata
+ true
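+ Only used when db.injector.overwrite is true. If true, the existing CrawlDatum is kept and the injected metadata (plus any non-default injected score or fetch interval) is merged into it.
+ If false, the existing record is replaced by a clean new injected CrawlDatum.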
+
+
+
+
db.score.injected
1.0
The score of new pages added by the injector.
Index: src/java/org/apache/nutch/crawl/Injector.java
===================================================================
--- src/java/org/apache/nutch/crawl/Injector.java (revision 1354052)
+++ src/java/org/apache/nutch/crawl/Injector.java (working copy)
@@ -152,7 +152,18 @@
/** Combine multiple new entries for a url. */
public static class InjectReducer implements Reducer {
- public void configure(JobConf job) {}
+ private int interval;
+ private float scoreInjected;
+ private boolean overwrite = false;
+ private boolean preserveMetadata = false;
+
+ public void configure(JobConf job) { // cache the job settings this reducer consults when merging old and injected entries
+ interval = job.getInt("db.fetch.interval.default", 2592000); // fallback 2592000 is presumably seconds (30 days) — TODO confirm
+ scoreInjected = job.getFloat("db.score.injected", 1.0f); // score of new pages added by the injector
+ overwrite = job.getBoolean("db.injector.overwrite", false); // whether injected records overwrite existing CrawlDB records
+ preserveMetadata = job.getBoolean("db.injector.preserve.metadata", true); // only consulted when overwrite is true
+ }
+
public void close() {}
private CrawlDatum old = new CrawlDatum();
@@ -162,19 +173,34 @@
OutputCollector output, Reporter reporter)
throws IOException {
boolean oldSet = false;
+ boolean injectedSet = false;
while (values.hasNext()) {
CrawlDatum val = values.next();
if (val.getStatus() == CrawlDatum.STATUS_INJECTED) {
injected.set(val);
injected.setStatus(CrawlDatum.STATUS_DB_UNFETCHED);
+ injectedSet = true;
} else {
old.set(val);
oldSet = true;
}
}
CrawlDatum res = null;
- if (oldSet) res = old; // don't overwrite existing value
- else res = injected;
+
+ if (injectedSet && oldSet && overwrite) {
+ if (preserveMetadata) {
+ res = old;
+ old.putAllMetaData(injected);
+ old.setScore(injected.getScore() != scoreInjected ? injected.getScore() : old.getScore());
+ old.setFetchInterval(injected.getFetchInterval() != interval ? injected.getFetchInterval() : old.getFetchInterval());
+ } else {
+ res = injected;
+ }
+ } else if (injectedSet && !oldSet) {
+ res = injected;
+ } else {
+ res = old;
+ }
output.collect(key, res);
}