Index: conf/nutch-default.xml
===================================================================
--- conf/nutch-default.xml (revision 1703111)
+++ conf/nutch-default.xml (working copy)
@@ -499,6 +499,14 @@
+ db.update.purge.orphans
+ false
+ If true, updatedb will permanently delete URLs marked
+ as orphans from the CrawlDb.
+
+
+
+
db.preserve.backup
true
If true, updatedb will keep a backup of the previous CrawlDB
Index: src/java/org/apache/nutch/crawl/CrawlDatum.java
===================================================================
--- src/java/org/apache/nutch/crawl/CrawlDatum.java (revision 1703111)
+++ src/java/org/apache/nutch/crawl/CrawlDatum.java (working copy)
@@ -61,8 +61,11 @@
public static final byte STATUS_DB_REDIR_PERM = 0x05;
/** Page was successfully fetched and found not modified. */
public static final byte STATUS_DB_NOTMODIFIED = 0x06;
+ /** Page was marked as being a duplicate of another page. */
public static final byte STATUS_DB_DUPLICATE = 0x07;
-
+ /** Page was marked as an orphan, e.g. it no longer has any inlinks. */
+ public static final byte STATUS_DB_ORPHAN = 0x08;
+
/** Maximum value of DB-related status. */
public static final byte STATUS_DB_MAX = 0x1f;
@@ -100,6 +103,7 @@
statNames.put(STATUS_DB_REDIR_PERM, "db_redir_perm");
statNames.put(STATUS_DB_NOTMODIFIED, "db_notmodified");
statNames.put(STATUS_DB_DUPLICATE, "db_duplicate");
+ statNames.put(STATUS_DB_ORPHAN, "db_orphan");
statNames.put(STATUS_SIGNATURE, "signature");
statNames.put(STATUS_INJECTED, "injected");
statNames.put(STATUS_LINKED, "linked");
Index: src/java/org/apache/nutch/crawl/CrawlDbReducer.java
===================================================================
--- src/java/org/apache/nutch/crawl/CrawlDbReducer.java (revision 1703111)
+++ src/java/org/apache/nutch/crawl/CrawlDbReducer.java (working copy)
@@ -46,6 +46,7 @@
private ScoringFilters scfilters = null;
private boolean additionsAllowed;
private int maxInterval;
+ private boolean purgeOrphans = false;
private FetchSchedule schedule;
public void configure(JobConf job) {
@@ -55,6 +56,7 @@
maxInterval = job.getInt("db.fetch.interval.max", 0);
schedule = FetchScheduleFactory.getFetchSchedule(job);
int maxLinks = job.getInt("db.update.max.inlinks", 10000);
+ purgeOrphans = job.getBoolean("db.update.purge.orphans", false);
linked = new InlinkPriorityQueue(maxLinks);
}
@@ -314,6 +316,14 @@
LOG.warn("Couldn't update score, key=" + key + ": " + e);
}
}
+
+ // If purging is enabled, drop orphaned pages instead of writing them out
+ if (purgeOrphans && result.getStatus() == CrawlDatum.STATUS_DB_ORPHAN) {
+ reporter.getCounter("CrawlDB status",
+ "Orphans removed").increment(1);
+ return;
+ }
+
// remove generation time, if any
result.getMetaData().remove(Nutch.WRITABLE_GENERATE_TIME_KEY);
output.collect(key, result);