Index: conf/nutch-default.xml
===================================================================
--- conf/nutch-default.xml	(revision 1703139)
+++ conf/nutch-default.xml	(working copy)
@@ -499,6 +499,14 @@
 </property>
 
 <property>
+  <name>db.update.purge.orphans</name>
+  <value>false</value>
+  <description>If true, updatedb will permanently delete URLs marked
+  as orphan from the CrawlDb.
+  </description>
+</property>
+
+<property>
   <name>db.preserve.backup</name>
   <value>true</value>
   <description>If true, updatedb will keep a backup of the previous CrawlDB
@@ -1361,6 +1369,24 @@
   </description>
 </property>
 
+<!-- scoring filter orphan properties -->
+
+<property>
+  <name>scoring.orphan.mark.gone.after</name>
+  <value>2592000</value>
+  <description>Time in seconds after which orphaned
+  pages are marked as gone. Default is 30 days.
+  </description>
+</property>
+
+<property>
+  <name>scoring.orphan.mark.orphan.after</name>
+  <value>3456000</value>
+  <description>Time in seconds after which orphaned pages are marked
+  as orphan (see db.update.purge.orphans). Default is 40 days.
+  </description>
+</property>
+
 <!-- language-identifier plugin properties -->
 
 <property>
Index: src/java/org/apache/nutch/crawl/CrawlDatum.java
===================================================================
--- src/java/org/apache/nutch/crawl/CrawlDatum.java	(revision 1703139)
+++ src/java/org/apache/nutch/crawl/CrawlDatum.java	(working copy)
@@ -61,8 +61,11 @@
   public static final byte STATUS_DB_REDIR_PERM = 0x05;
   /** Page was successfully fetched and found not modified. */
   public static final byte STATUS_DB_NOTMODIFIED = 0x06;
+  /** Page was marked as being a duplicate of another page */
   public static final byte STATUS_DB_DUPLICATE = 0x07;
-
+  /** Page was marked as orphan, i.e. it no longer has any inlinks */
+  public static final byte STATUS_DB_ORPHAN = 0x08;
+  
   /** Maximum value of DB-related status. */
   public static final byte STATUS_DB_MAX = 0x1f;
 
@@ -100,6 +103,7 @@
     statNames.put(STATUS_DB_REDIR_PERM, "db_redir_perm");
     statNames.put(STATUS_DB_NOTMODIFIED, "db_notmodified");
     statNames.put(STATUS_DB_DUPLICATE, "db_duplicate");
+    statNames.put(STATUS_DB_ORPHAN, "db_orphan");
     statNames.put(STATUS_SIGNATURE, "signature");
     statNames.put(STATUS_INJECTED, "injected");
     statNames.put(STATUS_LINKED, "linked");
Index: src/java/org/apache/nutch/crawl/CrawlDbReducer.java
===================================================================
--- src/java/org/apache/nutch/crawl/CrawlDbReducer.java	(revision 1703139)
+++ src/java/org/apache/nutch/crawl/CrawlDbReducer.java	(working copy)
@@ -46,6 +46,7 @@
   private ScoringFilters scfilters = null;
   private boolean additionsAllowed;
   private int maxInterval;
+  private boolean purgeOrphans = false;
   private FetchSchedule schedule;
 
   public void configure(JobConf job) {
@@ -55,6 +56,7 @@
     maxInterval = job.getInt("db.fetch.interval.max", 0);
     schedule = FetchScheduleFactory.getFetchSchedule(job);
     int maxLinks = job.getInt("db.update.max.inlinks", 10000);
+    purgeOrphans = job.getBoolean("db.update.purge.orphans", false);
     linked = new InlinkPriorityQueue(maxLinks);
   }
 
@@ -314,6 +316,14 @@
         LOG.warn("Couldn't update score, key=" + key + ": " + e);
       }
     }
+    
+    // Whether to remove orphaned pages
+    if (purgeOrphans && result.getStatus() == CrawlDatum.STATUS_DB_ORPHAN) {
+      reporter.getCounter("CrawlDB status",
+        "Orphans removed").increment(1);
+      return;
+    }
+    
     // remove generation time, if any
     result.getMetaData().remove(Nutch.WRITABLE_GENERATE_TIME_KEY);
     output.collect(key, result);
Index: src/plugin/build.xml
===================================================================
--- src/plugin/build.xml	(revision 1703139)
+++ src/plugin/build.xml	(working copy)
@@ -65,6 +65,7 @@
      <ant dir="scoring-depth" target="deploy"/>
      <ant dir="scoring-opic" target="deploy"/>
      <ant dir="scoring-link" target="deploy"/>
+     <ant dir="scoring-orphan" target="deploy"/>
      <ant dir="scoring-similarity" target="deploy"/>
      <ant dir="subcollection" target="deploy"/>
      <ant dir="tld" target="deploy"/>
@@ -111,6 +112,7 @@
      <ant dir="parse-swf" target="test"/>
      <ant dir="parse-tika" target="test"/>
      <ant dir="parse-zip" target="test"/>
+     <ant dir="scoring-orphan" target="test"/>
      <ant dir="subcollection" target="test"/>
      <ant dir="urlfilter-automaton" target="test"/>
      <ant dir="urlfilter-domain" target="test"/>
@@ -174,6 +176,7 @@
     <ant dir="scoring-depth" target="clean"/>
     <ant dir="scoring-opic" target="clean"/>
     <ant dir="scoring-link" target="clean"/>
+    <ant dir="scoring-orphan" target="clean"/>
     <ant dir="scoring-similarity" target="clean"/>
     <ant dir="subcollection" target="clean"/>
     <ant dir="tld" target="clean"/>
Index: src/plugin/scoring-orphan/build.xml
===================================================================
--- src/plugin/scoring-orphan/build.xml	(revision 0)
+++ src/plugin/scoring-orphan/build.xml	(working copy)
@@ -0,0 +1,27 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="scoring-orphan" default="jar-core">
+  <!-- Name matches the plugin id; ivy.xml uses ${ant.project.name} as its module name. -->
+  <import file="../build-plugin.xml"/>
+
+  <!-- Deploy the extension points the unit tests depend on -->
+  <target name="deps-test">
+    <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/>
+  </target>
+
+</project>
Index: src/plugin/scoring-orphan/ivy.xml
===================================================================
--- src/plugin/scoring-orphan/ivy.xml	(revision 0)
+++ src/plugin/scoring-orphan/ivy.xml	(working copy)
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+  <!-- Ivy configurations are included from the ivy directory three levels up -->
+  <configurations>
+    <include file="../../..//ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!-- The published artifact takes its name from the module name in <info> -->
+    <artifact conf="master"/>
+  </publications>
+  <!-- No extra library dependencies are declared for this plugin -->
+  <dependencies>
+  </dependencies>
+  
+</ivy-module>
Index: src/plugin/scoring-orphan/plugin.xml
===================================================================
--- src/plugin/scoring-orphan/plugin.xml	(revision 0)
+++ src/plugin/scoring-orphan/plugin.xml	(working copy)
@@ -0,0 +1,38 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="scoring-orphan"
+   name="Orphan Scoring Plug-in"
+   version="1.0.0"
+   provider-name="nutch.cc.org">
+
+   <runtime>
+      <library name="scoring-orphan.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+   <!-- Registers OrphanScoringFilter at the ScoringFilter extension point -->
+   <extension id="org.apache.nutch.scoring.orphan"
+              name="OrphanScoring"
+              point="org.apache.nutch.scoring.ScoringFilter">
+
+      <implementation id="org.apache.nutch.scoring.orphan.OrphanScoringFilter"
+        class="org.apache.nutch.scoring.orphan.OrphanScoringFilter" />
+   </extension>
+
+</plugin>
\ No newline at end of file
Index: src/plugin/scoring-orphan/src/java/org/apache/nutch/scoring/orphan/OrphanScoringFilter.java
===================================================================
--- src/plugin/scoring-orphan/src/java/org/apache/nutch/scoring/orphan/OrphanScoringFilter.java	(revision 0)
+++ src/plugin/scoring-orphan/src/java/org/apache/nutch/scoring/orphan/OrphanScoringFilter.java	(working copy)
@@ -0,0 +1,150 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.scoring.orphan;
+
+import java.util.Collection;
+import java.util.List;
+import java.util.Map.Entry;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.scoring.ScoringFilter;
+import org.apache.nutch.scoring.ScoringFilterException;
+
+/**
+ * Orphan scoring filter that determines whether a page has become
+ * orphaned, i.e. no other pages link to it anymore.
+ */
+public class OrphanScoringFilter extends Configured
+    implements ScoringFilter {
+  private static final Logger LOG = LoggerFactory.getLogger(OrphanScoringFilter.class);
+
+  /** CrawlDatum metadata key: epoch millis when inlinks were last seen. */
+  public static final Text ORPHAN_KEY_WRITABLE = new Text("_orphan_");
+
+  private static final int DEFAULT_GONE_TIME = 30 * 24 * 60 * 60;
+  private static final int DEFAULT_ORPHAN_TIME = 40 * 24 * 60 * 60;
+
+  // Thresholds in milliseconds; 1000L forces long math (int would overflow).
+  private long markGoneAfter = DEFAULT_GONE_TIME * 1000L;
+  private long markOrphanAfter = DEFAULT_ORPHAN_TIME * 1000L;
+
+  /** Reads both thresholds (configured in seconds); a null conf keeps defaults. */
+  @Override
+  public void setConf(Configuration conf) {
+    super.setConf(conf);
+    if (conf == null)
+      return;
+
+    markGoneAfter = conf.getInt("scoring.orphan.mark.gone.after",
+      DEFAULT_GONE_TIME) * 1000L;
+    markOrphanAfter = conf.getInt("scoring.orphan.mark.orphan.after",
+      DEFAULT_ORPHAN_TIME) * 1000L;
+  }
+
+  /**
+   * Records when a page was last inlinked; a page without inlinks is marked
+   * gone, then orphan, once the configured time since then has elapsed.
+   *
+   * @param url url of the record
+   * @param old old CrawlDatum (not used)
+   * @param datum new CrawlDatum; its metadata and status may be updated
+   * @param inlinks list of inlinked CrawlDatums
+   */
+  @Override
+  public void updateDbScore(Text url, CrawlDatum old, CrawlDatum datum,
+      List<CrawlDatum> inlinks) throws ScoringFilterException {
+    // Sample the clock per call; a value cached at class load goes stale
+    long now = System.currentTimeMillis();
+
+    if (inlinks != null && !inlinks.isEmpty()) {
+      datum.getMetaData().put(ORPHAN_KEY_WRITABLE, new LongWritable(now));
+      return;
+    }
+
+    // Never seen with inlinks: no reference time to measure from
+    LongWritable lastInlinked = (LongWritable) datum.getMetaData()
+        .get(ORPHAN_KEY_WRITABLE);
+    if (lastInlinked == null) {
+      return;
+    }
+
+    long withoutInlinks = now - lastInlinked.get();
+    if (withoutInlinks > markOrphanAfter) {
+      // Mark as orphan so we can permanently delete it
+      datum.setStatus(CrawlDatum.STATUS_DB_ORPHAN);
+    } else if (withoutInlinks > markGoneAfter) {
+      // Mark as gone so the indexer can remove it
+      datum.setStatus(CrawlDatum.STATUS_DB_GONE);
+    }
+  }
+
+  @Override
+  public float generatorSortValue(Text url, CrawlDatum datum, float initSort) {
+    return initSort;
+  }
+
+  @Override
+  public CrawlDatum distributeScoreToOutlinks(Text fromUrl, ParseData parseData,
+      Collection<Entry<Text, CrawlDatum>> targets, CrawlDatum adjust, int allCount)
+      throws ScoringFilterException {
+    return adjust;
+  }
+
+  @Override
+  public float indexerScore(Text url, NutchDocument doc, CrawlDatum dbDatum,
+      CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore)
+      throws ScoringFilterException {
+    return initScore;
+  }
+
+  @Override
+  public void initialScore(Text url, CrawlDatum datum)
+    throws ScoringFilterException {
+    datum.setScore(0.0f);
+  }
+
+  @Override
+  public void injectedScore(Text url, CrawlDatum datum)
+    throws ScoringFilterException {
+  }
+
+  @Override
+  public void passScoreAfterParsing(Text url, Content content, Parse parse)
+    throws ScoringFilterException {
+    parse.getData().getContentMeta().set(Nutch.SCORE_KEY,
+      content.getMetadata().get(Nutch.SCORE_KEY));
+  }
+
+  @Override
+  public void passScoreBeforeParsing(Text url, CrawlDatum datum, Content content)
+    throws ScoringFilterException {
+    content.getMetadata().set(Nutch.SCORE_KEY, "" + datum.getScore());
+  }
+}
\ No newline at end of file
Index: src/plugin/scoring-orphan/src/test/org/apache/nutch/scoring/orphan/TestOrphanScoringFilter.java
===================================================================
--- src/plugin/scoring-orphan/src/test/org/apache/nutch/scoring/orphan/TestOrphanScoringFilter.java	(revision 0)
+++ src/plugin/scoring-orphan/src/test/org/apache/nutch/scoring/orphan/TestOrphanScoringFilter.java	(working copy)
@@ -0,0 +1,65 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.scoring.orphan;
+
+import java.util.List;
+import java.util.ArrayList;
+import junit.framework.TestCase;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.scoring.ScoringFilter;
+import org.apache.nutch.scoring.ScoringFilterException;
+import org.apache.nutch.util.NutchConfiguration;
+
+public class TestOrphanScoringFilter extends TestCase {
+  /** The last-inlinked timestamp must be set, and refreshed on every update. */
+  public void testOrphanTimeIsRefreshedWhileInlinked() throws Exception {
+    Configuration conf = NutchConfiguration.create();
+    ScoringFilter filter = new OrphanScoringFilter();
+    filter.setConf(conf);
+
+    Text url = new Text("http://nutch.apache.org/");
+    CrawlDatum datum = new CrawlDatum();
+    List<CrawlDatum> emptyListOfInlinks = new ArrayList<CrawlDatum>();
+    List<CrawlDatum> populatedListOfInlinks = new ArrayList<CrawlDatum>();
+    populatedListOfInlinks.add(datum);
+
+    // A record never seen with inlinks gets no timestamp
+    filter.updateDbScore(url, null, datum, emptyListOfInlinks);
+    assertFalse(datum.getMetaData().containsKey(OrphanScoringFilter.ORPHAN_KEY_WRITABLE));
+
+    // Act as if record has inlinks; check the key exists before reading it
+    filter.updateDbScore(url, null, datum, populatedListOfInlinks);
+    assertTrue(datum.getMetaData().containsKey(OrphanScoringFilter.ORPHAN_KEY_WRITABLE));
+    long firstOrphanTime = getTime(datum);
+
+    // Wait a little bit; an interrupt fails the test instead of being swallowed
+    Thread.sleep(100);
+
+    // Again, this time orphan time must be increased
+    filter.updateDbScore(url, null, datum, populatedListOfInlinks);
+    assertTrue(getTime(datum) > firstOrphanTime);
+  }
+
+  protected Long getTime(CrawlDatum datum) {
+    LongWritable writable = (LongWritable)datum.getMetaData().get(OrphanScoringFilter.ORPHAN_KEY_WRITABLE);
+    return writable.get();
+  }
+}
\ No newline at end of file
