Index: build.xml
===================================================================
--- build.xml (revision 4981)
+++ build.xml (working copy)
@@ -213,6 +213,7 @@
+
@@ -656,6 +657,7 @@
+
@@ -1076,6 +1078,8 @@
+
+
Index: conf/nutch-default.xml
===================================================================
--- conf/nutch-default.xml (revision 4981)
+++ conf/nutch-default.xml (working copy)
@@ -518,6 +518,16 @@
+ db.update.purge.orphans
+ false
+ If true, updatedb will permanently delete URLs marked
+ as orphan from the CrawlDb. The plugin scoring-orphan needs to be
+ activated to get records marked as orphan. See the plugin's options
+ elsewhere in this document.
+
+
+
+db.url.normalizersfalseNormalize urls when updating crawldb
@@ -1466,6 +1476,24 @@
+
+
+
+ scoring.orphan.mark.gone.after
+ 2592000
+ Time in seconds after which orphaned
+ pages are marked as gone. Default is 30 days.
+
+
+
+
+ scoring.orphan.mark.orphan.after
+ 3456000
+ Time in seconds after which orphaned
+ pages are marked as orphan. Default is 40 days.
+
+
+
Index: default.properties
===================================================================
--- default.properties (revision 4981)
+++ default.properties (working copy)
@@ -125,6 +125,7 @@
org.apache.nutch.scoring.depth*:\
org.apache.nutch.scoring.link*:\
org.apache.nutch.scoring.opic*:\
+ org.apache.nutch.scoring.orphan*:\
org.apache.nutch.scoring.similarity*:\
org.apache.nutch.scoring.tld*:\
org.apache.nutch.scoring.urlmeta*
Index: src/java/org/apache/nutch/crawl/CrawlDatum.java
===================================================================
--- src/java/org/apache/nutch/crawl/CrawlDatum.java (revision 4981)
+++ src/java/org/apache/nutch/crawl/CrawlDatum.java (working copy)
@@ -61,8 +61,11 @@
public static final byte STATUS_DB_REDIR_PERM = 0x05;
/** Page was successfully fetched and found not modified. */
public static final byte STATUS_DB_NOTMODIFIED = 0x06;
+ /** Page was marked as being a duplicate of another page */
public static final byte STATUS_DB_DUPLICATE = 0x07;
-
+ /** Page was marked as orphan, i.e. has no inlinks anymore */
+ public static final byte STATUS_DB_ORPHAN = 0x08;
+
/** Maximum value of DB-related status. */
public static final byte STATUS_DB_MAX = 0x1f;
@@ -100,6 +103,7 @@
statNames.put(STATUS_DB_REDIR_PERM, "db_redir_perm");
statNames.put(STATUS_DB_NOTMODIFIED, "db_notmodified");
statNames.put(STATUS_DB_DUPLICATE, "db_duplicate");
+ statNames.put(STATUS_DB_ORPHAN, "db_orphan");
statNames.put(STATUS_SIGNATURE, "signature");
statNames.put(STATUS_INJECTED, "injected");
statNames.put(STATUS_LINKED, "linked");
Index: src/java/org/apache/nutch/crawl/CrawlDb.java
===================================================================
--- src/java/org/apache/nutch/crawl/CrawlDb.java (revision 4981)
+++ src/java/org/apache/nutch/crawl/CrawlDb.java (working copy)
@@ -49,7 +49,8 @@
public static final String CRAWLDB_ADDITIONS_ALLOWED = "db.update.additions.allowed";
public static final String CRAWLDB_PURGE_404 = "db.update.purge.404";
-
+ public static final String CRAWLDB_PURGE_ORPHANS = "db.update.purge.orphans";
+
public static final String CURRENT_NAME = "current";
public static final String LOCK_NAME = ".locked";
Index: src/java/org/apache/nutch/crawl/CrawlDbFilter.java
===================================================================
--- src/java/org/apache/nutch/crawl/CrawlDbFilter.java (revision 4981)
+++ src/java/org/apache/nutch/crawl/CrawlDbFilter.java (working copy)
@@ -38,19 +38,15 @@
public class CrawlDbFilter implements
Mapper {
public static final String URL_FILTERING = "crawldb.url.filters";
-
public static final String URL_NORMALIZING = "crawldb.url.normalizers";
-
public static final String URL_NORMALIZING_SCOPE = "crawldb.url.normalizers.scope";
private boolean urlFiltering;
-
private boolean urlNormalizers;
private boolean url404Purging;
-
+ private boolean purgeOrphans;
private URLFilters filters;
-
private URLNormalizers normalizers;
private String scope;
@@ -61,7 +57,8 @@
urlFiltering = job.getBoolean(URL_FILTERING, false);
urlNormalizers = job.getBoolean(URL_NORMALIZING, false);
url404Purging = job.getBoolean(CrawlDb.CRAWLDB_PURGE_404, false);
-
+ purgeOrphans = job.getBoolean(CrawlDb.CRAWLDB_PURGE_ORPHANS, false);
+
if (urlFiltering) {
filters = new URLFilters(job);
}
@@ -85,8 +82,17 @@
// https://issues.apache.org/jira/browse/NUTCH-1101 check status first,
// cheaper than normalizing or filtering
if (url404Purging && CrawlDatum.STATUS_DB_GONE == value.getStatus()) {
- url = null;
+ reporter.getCounter("CrawlDB filter",
+ "Gone records removed").increment(1);
+ return;
}
+ // Whether to remove orphaned pages
+ // https://issues.apache.org/jira/browse/NUTCH-1932
+ if (purgeOrphans && CrawlDatum.STATUS_DB_ORPHAN == value.getStatus()) {
+ reporter.getCounter("CrawlDB filter",
+ "Orphan records removed").increment(1);
+ return;
+ }
if (url != null && urlNormalizers) {
try {
url = normalizers.normalize(url, scope); // normalize the url
Index: src/java/org/apache/nutch/crawl/CrawlDbReducer.java
===================================================================
--- src/java/org/apache/nutch/crawl/CrawlDbReducer.java (revision 4981)
+++ src/java/org/apache/nutch/crawl/CrawlDbReducer.java (working copy)
@@ -155,6 +155,15 @@
// still no new data - record only unchanged old data, if exists, and return
if (!fetchSet) {
if (oldSet) {// at this point at least "old" should be present
+ // Need to run updateDbScore here for scoring-orphan or any
+ // scoring plugin that needs to operate on ALL records
+ try {
+ scfilters.updateDbScore(key, null, old, linkList);
+ } catch (Exception e) {
+ if (LOG.isWarnEnabled()) {
+ LOG.warn("Couldn't update score, key=" + key + ": " + e);
+ }
+ }
output.collect(key, old);
reporter.getCounter("CrawlDB status",
CrawlDatum.getStatusName(old.getStatus())).increment(1);
Index: src/plugin/build.xml
===================================================================
--- src/plugin/build.xml (revision 4981)
+++ src/plugin/build.xml (working copy)
@@ -67,6 +67,7 @@
+
@@ -119,6 +120,7 @@
+
@@ -188,6 +190,7 @@
+
Index: src/plugin/scoring-orphan/build.xml
===================================================================
--- src/plugin/scoring-orphan/build.xml (revision 0)
+++ src/plugin/scoring-orphan/build.xml (working copy)
@@ -0,0 +1,27 @@
+
+
+
+
+
+
+
+
+
+
+
+
Index: src/plugin/scoring-orphan/ivy.xml
===================================================================
--- src/plugin/scoring-orphan/ivy.xml (revision 0)
+++ src/plugin/scoring-orphan/ivy.xml (working copy)
@@ -0,0 +1,41 @@
+
+
+
+
+
+
+
+
+
+ Apache Nutch
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index: src/plugin/scoring-orphan/plugin.xml
===================================================================
--- src/plugin/scoring-orphan/plugin.xml (revision 0)
+++ src/plugin/scoring-orphan/plugin.xml (working copy)
@@ -0,0 +1,38 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
Index: src/plugin/scoring-orphan/src/java/org/apache/nutch/scoring/orphan/OrphanScoringFilter.java
===================================================================
--- src/plugin/scoring-orphan/src/java/org/apache/nutch/scoring/orphan/OrphanScoringFilter.java (revision 0)
+++ src/plugin/scoring-orphan/src/java/org/apache/nutch/scoring/orphan/OrphanScoringFilter.java (working copy)
@@ -0,0 +1,155 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.scoring.orphan;
+
+import java.util.Collection;
+import java.util.List;
+import java.util.Map.Entry;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.scoring.ScoringFilter;
+import org.apache.nutch.scoring.ScoringFilterException;
+
+/**
+ * Orphan scoring filter that determines whether a page has become orphaned,
+ * i.e. no other pages link to it anymore. The time inlinks were last seen is
+ * stored in the datum's metadata. If a page hasn't been linked to after
+ * markGoneAfter seconds, it is marked as gone so an indexer can remove it. After
+ * markOrphanAfter seconds it is marked as orphan so it can be purged from the CrawlDB.
+ */
+public class OrphanScoringFilter extends Configured
+    implements ScoringFilter {
+  private static final Logger LOG = LoggerFactory.getLogger(OrphanScoringFilter.class);
+
+  /** Metadata key holding the time, in epoch seconds, at which inlinks were last seen. */
+  public static final Text ORPHAN_KEY_WRITABLE = new Text("_orphan_");
+
+  /** Default for scoring.orphan.mark.gone.after: 30 days in seconds. */
+  private static final int DEFAULT_GONE_TIME = 30 * 24 * 60 * 60;
+  /** Default for scoring.orphan.mark.orphan.after: 40 days in seconds. */
+  private static final int DEFAULT_ORPHAN_TIME = 40 * 24 * 60 * 60;
+
+  private long markGoneAfter = DEFAULT_GONE_TIME;
+  private long markOrphanAfter = DEFAULT_ORPHAN_TIME;
+
+  /** Stores the configuration and, if it is non-null, reads both time limits from it. */
+  @Override
+  public void setConf(Configuration conf) {
+    super.setConf(conf);
+    if (conf == null) {
+      return;
+    }
+    markGoneAfter = conf.getInt("scoring.orphan.mark.gone.after", DEFAULT_GONE_TIME);
+    markOrphanAfter = conf.getInt("scoring.orphan.mark.orphan.after", DEFAULT_ORPHAN_TIME);
+  }
+
+  /**
+   * Used for orphan control. Refreshes the last-seen time while the record has
+   * inlinks; otherwise updates the status once the configured time has elapsed.
+   *
+   * @param url url of the record
+   * @param old old CrawlDatum, not used by this filter
+   * @param datum new CrawlDatum, updated in place
+   * @param inlinks list of inlinked CrawlDatums, null is treated as empty
+   */
+  @Override
+  public void updateDbScore(Text url, CrawlDatum old, CrawlDatum datum,
+      List<CrawlDatum> inlinks) throws ScoringFilterException {
+    int now = (int) (System.currentTimeMillis() / 1000);
+    // Are there inlinks for this record?
+    if (inlinks != null && !inlinks.isEmpty()) {
+      // Set the last time we have seen this link to NOW
+      datum.getMetaData().put(ORPHAN_KEY_WRITABLE, new IntWritable(now));
+      return;
+    }
+
+    // Without a recorded last-seen time there is nothing to compare against
+    IntWritable lastSeen = (IntWritable) datum.getMetaData().get(ORPHAN_KEY_WRITABLE);
+    if (lastSeen == null) {
+      return;
+    }
+    int lastInlinkTime = lastSeen.get();
+    if (now > lastInlinkTime + markOrphanAfter) {
+      // Mark as orphan so we can permanently delete it
+      datum.setStatus(CrawlDatum.STATUS_DB_ORPHAN);
+    } else if (now > lastInlinkTime + markGoneAfter) {
+      // Mark as gone so the indexer can remove it
+      datum.setStatus(CrawlDatum.STATUS_DB_GONE);
+    }
+  }
+
+  /** Returns the sort value unchanged. */
+  @Override
+  public float generatorSortValue(Text url, CrawlDatum datum, float initSort) {
+    return initSort;
+  }
+
+  /** Passes no score on to outlinks and returns the adjust datum as given. */
+  @Override
+  public CrawlDatum distributeScoreToOutlinks(Text fromUrl, ParseData parseData,
+      Collection<Entry<Text, CrawlDatum>> targets, CrawlDatum adjust, int allCount)
+      throws ScoringFilterException {
+    return adjust;
+  }
+
+  /** Returns the indexer score unchanged. */
+  @Override
+  public float indexerScore(Text url, NutchDocument doc, CrawlDatum dbDatum,
+      CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore)
+      throws ScoringFilterException {
+    return initScore;
+  }
+
+  /** Sets the score of the datum to zero. */
+  @Override
+  public void initialScore(Text url, CrawlDatum datum) throws ScoringFilterException {
+    datum.setScore(0.0f);
+  }
+
+  /** Does nothing; the datum is left as it is. */
+  @Override
+  public void injectedScore(Text url, CrawlDatum datum) throws ScoringFilterException {
+  }
+
+  /** Copies the score from the content metadata into the parse's content metadata. */
+  @Override
+  public void passScoreAfterParsing(Text url, Content content, Parse parse)
+      throws ScoringFilterException {
+    parse.getData().getContentMeta().set(Nutch.SCORE_KEY,
+        content.getMetadata().get(Nutch.SCORE_KEY));
+  }
+
+  /** Writes the datum's score into the content metadata. */
+  @Override
+  public void passScoreBeforeParsing(Text url, CrawlDatum datum, Content content)
+      throws ScoringFilterException {
+    content.getMetadata().set(Nutch.SCORE_KEY, "" + datum.getScore());
+  }
+}
\ No newline at end of file
Index: src/plugin/scoring-orphan/src/test/org/apache/nutch/scoring/orphan/TestOrphanScoringFilter.java
===================================================================
--- src/plugin/scoring-orphan/src/test/org/apache/nutch/scoring/orphan/TestOrphanScoringFilter.java (revision 0)
+++ src/plugin/scoring-orphan/src/test/org/apache/nutch/scoring/orphan/TestOrphanScoringFilter.java (working copy)
@@ -0,0 +1,105 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.scoring.orphan;
+
+import java.util.List;
+import java.util.ArrayList;
+import junit.framework.TestCase;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.scoring.ScoringFilter;
+import org.apache.nutch.scoring.ScoringFilterException;
+import org.apache.nutch.util.NutchConfiguration;
+
+/**
+ * Unit test for {@link OrphanScoringFilter}.
+ */
+public class TestOrphanScoringFilter extends TestCase {
+
+  /**
+   * Walks one record through its life cycle: inlinked, no longer inlinked,
+   * gone and finally orphan. Instead of sleeping, the stored last-seen time is
+   * moved into the past, which keeps the test fast.
+   */
+  public void testOrphanScoringFilter() throws Exception {
+    Configuration conf = NutchConfiguration.create();
+
+    conf.setInt("scoring.orphan.mark.gone.after", 5);
+    conf.setInt("scoring.orphan.mark.orphan.after", 10);
+
+    ScoringFilter filter = new OrphanScoringFilter();
+    filter.setConf(conf);
+
+    Text url = new Text("http://nutch.apache.org/");
+    CrawlDatum datum = new CrawlDatum();
+    datum.setStatus(CrawlDatum.STATUS_DB_NOTMODIFIED);
+
+    List<CrawlDatum> emptyListOfInlinks = new ArrayList<CrawlDatum>();
+    List<CrawlDatum> populatedListOfInlinks = new ArrayList<CrawlDatum>();
+    populatedListOfInlinks.add(datum);
+
+    // Act as if record has inlinks
+    filter.updateDbScore(url, null, datum, populatedListOfInlinks);
+    assertTrue(datum.getMetaData().containsKey(OrphanScoringFilter.ORPHAN_KEY_WRITABLE));
+    int firstOrphanTime = getTime(datum);
+
+    // Pretend the inlinks were last seen one second earlier; updating again
+    // with inlinks must move the stored time forward
+    setTime(datum, firstOrphanTime - 1);
+    filter.updateDbScore(url, null, datum, populatedListOfInlinks);
+    int secondOrphanTime = getTime(datum);
+    assertTrue(secondOrphanTime > firstOrphanTime - 1);
+
+    // Act as if no more inlinks, time will not increase, status is still the same
+    filter.updateDbScore(url, null, datum, emptyListOfInlinks);
+    assertEquals(secondOrphanTime, getTime(datum));
+    assertEquals(CrawlDatum.STATUS_DB_NOTMODIFIED, datum.getStatus());
+
+    // Two seconds without inlinks is below both limits, status is still the same
+    setTime(datum, secondOrphanTime - 2);
+    filter.updateDbScore(url, null, datum, emptyListOfInlinks);
+    assertEquals(CrawlDatum.STATUS_DB_NOTMODIFIED, datum.getStatus());
+
+    // Six seconds without inlinks: mark.gone.after has expired and the record
+    // should be DB_GONE, the stored time itself must not change
+    setTime(datum, secondOrphanTime - 6);
+    filter.updateDbScore(url, null, datum, emptyListOfInlinks);
+    assertEquals(secondOrphanTime - 6, getTime(datum));
+    assertEquals(CrawlDatum.STATUS_DB_GONE, datum.getStatus());
+
+    // Eleven seconds without inlinks: mark.orphan.after has expired and the
+    // record should be DB_ORPHAN
+    setTime(datum, secondOrphanTime - 11);
+    filter.updateDbScore(url, null, datum, emptyListOfInlinks);
+    assertEquals(CrawlDatum.STATUS_DB_ORPHAN, datum.getStatus());
+  }
+
+  /** Reads the stored last-seen time of the datum. */
+  protected int getTime(CrawlDatum datum) {
+    IntWritable writable = (IntWritable)datum.getMetaData().get(OrphanScoringFilter.ORPHAN_KEY_WRITABLE);
+    return writable.get();
+  }
+
+  /** Overwrites the stored last-seen time of the datum. */
+  private void setTime(CrawlDatum datum, int seconds) {
+    datum.getMetaData().put(OrphanScoringFilter.ORPHAN_KEY_WRITABLE,
+        new IntWritable(seconds));
+  }
+}
\ No newline at end of file