Index: conf/nutch-default.xml
===================================================================
--- conf/nutch-default.xml	(revision 417285)
+++ conf/nutch-default.xml	(working copy)
@@ -212,11 +212,82 @@
 <property>
   <name>db.default.fetch.interval</name>
   <value>30</value>
-  <description>The default number of days between re-fetches of a page.
+  <description>(DEPRECATED) The default number of days between re-fetches of a page.
   </description>
 </property>
 
 <property>
+  <name>db.fetch.interval.default</name>
+  <value>2592000</value>
+  <description>The default number of seconds between re-fetches of a page
+  (30 days).
+  </description>
+</property>
+
+<property>
+  <name>db.fetch.interval.max</name>
+  <value>7776000</value>
+  <description>The maximum number of seconds between re-fetches of a page
+  (90 days). After this period every page in the db will be re-tried,
+  regardless of its status.
+  </description>
+</property>
+
+<property>
+  <name>db.fetch.schedule.class</name>
+  <value>org.apache.nutch.crawl.DefaultFetchSchedule</value>
+  <description>The default implementation of fetch schedule. It simply
+  adds the original fetchInterval to the last fetch time, regardless of
+  page changes.</description>
+</property>
+
+<property>
+  <name>db.fetch.schedule.adaptive.inc_rate</name>
+  <value>0.4</value>
+  <description>If a page is unmodified, its fetchInterval will be
+  increased by this rate. This value should not
+  exceed 0.5, otherwise the algorithm becomes unstable.</description>
+</property>
+
+<property>
+  <name>db.fetch.schedule.adaptive.dec_rate</name>
+  <value>0.2</value>
+  <description>If a page is modified, its fetchInterval will be
+  decreased by this rate. This value should not
+  exceed 0.5, otherwise the algorithm becomes unstable.</description>
+</property>
+
+<property>
+  <name>db.fetch.schedule.adaptive.min_interval</name>
+  <value>60.0</value>
+  <description>Minimum fetchInterval, in seconds.</description>
+</property>
+
+<property>
+  <name>db.fetch.schedule.adaptive.max_interval</name>
+  <value>31536000.0</value>
+  <description>Maximum fetchInterval, in seconds (365 days).
+  NOTE: this is limited by db.fetch.interval.max. Pages with
+  fetchInterval larger than db.fetch.interval.max
+  will be fetched anyway.</description>
+</property>
+
+<property>
+  <name>db.fetch.schedule.adaptive.sync_delta</name>
+  <value>true</value>
+  <description>If true, try to synchronize with the time of page change
+  by shifting the next fetchTime by a fraction (sync_delta_rate) of the
+  difference between the last modification time and the last fetch time.</description>
+</property>
+
+<property>
+  <name>db.fetch.schedule.adaptive.sync_delta_rate</name>
+  <value>0.3</value>
+  <description>See sync_delta for description. This value should not
+  exceed 0.5, otherwise the algorithm becomes unstable.</description>
+</property>
+
+<property>
   <name>db.ignore.internal.links</name>
   <value>true</value>
   <description>If true, when adding new links to a page, links from
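
As a worked example of the adaptive settings above, a standalone sketch of the
interval arithmetic they drive (the class name is illustrative; the literals are
the values from this file):

    public class AdaptiveIntervalDemo {
      public static void main(String[] args) {
        float incRate = 0.4f, decRate = 0.2f;     // db.fetch.schedule.adaptive.{inc,dec}_rate
        float min = 60.0f, max = 31536000.0f;     // {min,max}_interval, in seconds
        float interval = 2592000.0f;              // db.fetch.interval.default (30 days)
        interval *= (1.0f + incRate);             // page unmodified -> 3628800 s (42 days)
        interval *= (1.0f - decRate);             // page modified   -> 2903040 s (33.6 days)
        interval = Math.max(min, Math.min(max, interval));  // always clamped to [min, max]
        System.out.println((interval / 86400.0f) + " days");
      }
    }
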
Index: src/java/org/apache/nutch/fetcher/Fetcher.java
===================================================================
--- src/java/org/apache/nutch/fetcher/Fetcher.java	(revision 417282)
+++ src/java/org/apache/nutch/fetcher/Fetcher.java	(working copy)
@@ -191,7 +191,6 @@
               case ProtocolStatus.EXCEPTION:
                 logError(url, status.getMessage());
               case ProtocolStatus.RETRY:          // retry
-                datum.setRetriesSinceFetch(datum.getRetriesSinceFetch()+1);
                 output(url, datum, null, CrawlDatum.STATUS_FETCH_RETRY);
                 break;
                 
@@ -199,10 +198,13 @@
               case ProtocolStatus.NOTFOUND:
               case ProtocolStatus.ACCESS_DENIED:
               case ProtocolStatus.ROBOTS_DENIED:
-              case ProtocolStatus.NOTMODIFIED:
                 output(url, datum, null, CrawlDatum.STATUS_FETCH_GONE);
                 break;
 
+              case ProtocolStatus.NOTMODIFIED:
+                output(url, datum, null, CrawlDatum.STATUS_FETCH_UNMODIFIED);
+                break;
+
               default:
                 if (LOG.isWarnEnabled()) {
                   LOG.warn("Unknown ProtocolStatus: " + status.getCode());
Index: src/java/org/apache/nutch/crawl/CrawlDatum.java
===================================================================
--- src/java/org/apache/nutch/crawl/CrawlDatum.java	(revision 417282)
+++ src/java/org/apache/nutch/crawl/CrawlDatum.java	(working copy)
@@ -32,6 +32,7 @@
 
   private final static byte CUR_VERSION = 4;
 
+  public static final byte STATUS_INJECTED = -1;
   public static final byte STATUS_SIGNATURE = 0;
   public static final byte STATUS_DB_UNFETCHED = 1;
   public static final byte STATUS_DB_FETCHED = 2;
@@ -40,6 +41,7 @@
   public static final byte STATUS_FETCH_SUCCESS = 5;
   public static final byte STATUS_FETCH_RETRY = 6;
   public static final byte STATUS_FETCH_GONE = 7;
+  public static final byte STATUS_FETCH_UNMODIFIED = 8;
   
   public static final String[] statNames = {
     "signature",
@@ -49,18 +51,17 @@
     "linked",
     "fetch_success",
     "fetch_retry",
-    "fetch_gone"
+    "fetch_gone",
+    "fetch_unmodified"
   };
 
-  private static final float MILLISECONDS_PER_DAY = 24 * 60 * 60 * 1000;
-
   private byte status;
   private long fetchTime = System.currentTimeMillis();
   private byte retries;
   private float fetchInterval;
   private float score = 1.0f;
   private byte[] signature = null;
-  private long modifiedTime;
+  private long modifiedTime = 0L;
   private MapWritable metaData;
 
   public CrawlDatum() {}
@@ -85,10 +86,6 @@
   public long getFetchTime() { return fetchTime; }
   public void setFetchTime(long fetchTime) { this.fetchTime = fetchTime; }
 
-  public void setNextFetchTime() {
-    fetchTime += (long)(MILLISECONDS_PER_DAY*fetchInterval);
-  }
-
   public long getModifiedTime() {
     return modifiedTime;
   }
@@ -288,7 +285,8 @@
     buf.append("Fetch time: " + new Date(getFetchTime()) + "\n");
     buf.append("Modified time: " + new Date(getModifiedTime()) + "\n");
     buf.append("Retries since fetch: " + getRetriesSinceFetch() + "\n");
-    buf.append("Retry interval: " + getFetchInterval() + " days\n");
+    buf.append("Retry interval: " + getFetchInterval() + " seconds (" +
+            (getFetchInterval() / 24.0f / 3600.0f) + " days)\n");
     buf.append("Score: " + getScore() + "\n");
     buf.append("Signature: " + StringUtil.toHexString(getSignature()) + "\n");
     buf.append("Metadata: " + (metaData != null ? metaData.toString() : "null") + "\n");
Index: src/java/org/apache/nutch/crawl/FetchSchedule.java
===================================================================
--- src/java/org/apache/nutch/crawl/FetchSchedule.java	(revision 0)
+++ src/java/org/apache/nutch/crawl/FetchSchedule.java	(revision 0)
@@ -0,0 +1,198 @@
+/**
+ * Copyright 2006 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.crawl;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.io.UTF8;
+import org.apache.nutch.crawl.CrawlDatum;
+
+/**
+ * This abstract class defines the contract for implementations that manipulate
+ * fetch times and re-fetch intervals.
+ * 
+ * @author Andrzej Bialecki
+ */
+public abstract class FetchSchedule extends Configured {
+  private static final Log LOG = LogFactory.getLog(FetchSchedule.class);
+  
+  /** It is unknown whether page was changed since our last visit. */
+  public static final int STATUS_UNKNOWN       = 0;
+  /** Page is known to have been modified since our last visit. */
+  public static final int STATUS_MODIFIED      = 1;
+  /** Page is known to remain unmodified since our last visit. */
+  public static final int STATUS_UNMODIFIED    = 2;
+  
+  public static final float SECONDS_PER_DAY = 3600.0f * 24.0f;
+
+  private float defaultInterval;
+  private float maxInterval;
+  
+  public FetchSchedule() {
+    super(null);
+  }
+  
+  public FetchSchedule(Configuration conf) {
+    super(conf);
+  }
+  
+  public void setConf(Configuration conf) {
+    super.setConf(conf);
+    if (conf == null) return;
+    int oldDefaultInterval = conf.getInt("db.default.fetch.interval", 0);
+    defaultInterval = conf.getFloat("db.fetch.interval.default", 30.0f * SECONDS_PER_DAY);
+    if (oldDefaultInterval != 0) defaultInterval = oldDefaultInterval * SECONDS_PER_DAY;
+    maxInterval = conf.getFloat("db.fetch.interval.max", 30.0f * SECONDS_PER_DAY);
+    LOG.info("defaultInterval=" + defaultInterval);
+    LOG.info("maxInterval=" + maxInterval);
+  }
+  
+  /**
+   * Initialize fetch schedule related data. Implementations should at least
+   * set the <code>fetchTime</code> and <code>fetchInterval</code>. The default
+   * implementation sets the <code>fetchTime</code> to now, using the
+   * default <code>fetchInterval</code>.
+   * 
+   * @param url URL of the page.
+   * @param datum datum instance to be initialized (modified in place).
+   */
+  public void initializeSchedule(UTF8 url, CrawlDatum datum) {
+    datum.setFetchTime(System.currentTimeMillis());
+    datum.setFetchInterval(defaultInterval);
+  }
+  
+  /**
+   * Sets the <code>fetchInterval</code> and <code>fetchTime</code> on a page.
+   * Implementations may use supplied arguments to support different re-fetching
+   * schedules.
+   * 
+   * @param url url of the page
+   * @param datum page description to be adjusted. NOTE: this instance, passed by reference,
+   * may be modified inside the method.
+   * @param prevFetchTime previous value of fetch time, or 0 if not available
+   * @param prevModifiedTime previous value of modifiedTime, or 0 if not available
+   * @param fetchTime the time when the page was last fetched. Most FetchSchedule
+   * implementations should update the value in <code>datum</code> to something greater than this value.
+   * @param modifiedTime last time the content was modified. This information comes from
+   * the protocol implementations, or is set to &lt; 0 if not available. Most FetchSchedule
+   * implementations should update the value in <code>datum</code> to this value.
+   * @param state if {@link #STATUS_MODIFIED}, then the content is considered to be "changed" before the
+   * <code>fetchTime</code>, if {@link #STATUS_UNMODIFIED} then the content is known to be unchanged.
+   * This information may be obtained by comparing page signatures before and after fetching. If this
+   * is set to {@link #STATUS_UNKNOWN}, then it is unknown whether the page was changed; implementations
+   * are free to follow a sensible default behavior.
+   * @return adjusted page information, including all original information. NOTE: this may
+   * be a different instance than <code>datum</code>, but implementations should make sure that
+   * it contains at least all information from <code>datum</code>.
+   */
+  public abstract CrawlDatum setFetchSchedule(UTF8 url, CrawlDatum datum,
+          long prevFetchTime, long prevModifiedTime,
+          long fetchTime, long modifiedTime, int state);
+  
+  /**
+   * This method specifies how to schedule refetching of pages
+   * marked as GONE. Default implementation increases fetchInterval by 50%,
+   * and if it exceeds the <code>maxInterval</code> it calls
+   * {@link #forceRefetch(UTF8, CrawlDatum, boolean)}.
+   * @param url URL of the page
+   * @param datum datum instance to be adjusted
+   * @return adjusted page information, including all original information.
+   * NOTE: this may be a different instance than <code>datum</code>, but
+   * implementations should make sure that it contains at least all
+   * information from <code>datum</code>.
+   */
+  public CrawlDatum setPageGoneSchedule(UTF8 url, CrawlDatum datum,
+          long prevFetchTime, long prevModifiedTime, long fetchTime) {
+    // no page is truly GONE ... just increase the interval by 50%
+    // and try much later.
+    datum.setFetchInterval(datum.getFetchInterval() * 1.5f);
+    if (maxInterval < datum.getFetchInterval()) forceRefetch(url, datum, false);
+    return datum;
+  }
+  
+  /**
+   * This method adjusts the fetch schedule if fetching needs to be
+   * re-tried due to transient errors. The default implementation
+   * sets the next fetch time 1 day in the future.
+   * @param url URL of the page
+   * @param datum page information
+   * @param prevFetchTime previous fetch time
+   * @param prevModifiedTime previous modified time
+   * @param fetchTime current fetch time
+   * @return adjusted page information, including all original information.
+   * NOTE: this may be a different instance than <code>datum</code>, but
+   * implementations should make sure that it contains at least all
+   * information from <code>datum</code>.
+   */
+  public CrawlDatum setPageRetrySchedule(UTF8 url, CrawlDatum datum,
+          long prevFetchTime, long prevModifiedTime, long fetchTime) {
+    datum.setFetchTime(fetchTime + Math.round(SECONDS_PER_DAY * 1000.0f)); // fetchTime is in ms
+    return datum;
+  }
+  
+  /**
+   * This method provides information whether the page is suitable for
+   * selection in the current fetchlist. NOTE: a true return value does not
+   * guarantee that the page will be fetched, it just allows it to be
+   * included in the further selection process based on scores. The default
+   * implementation checks <code>fetchTime</code>: if it is higher than
+   * <code>curTime</code> it returns false, and true otherwise. It will also
+   * check that fetchTime is not too remote (more than <code>maxInterval</code>),
+   * in which case it lowers the interval and returns true.
+   * @param url URL of the page
+   * @param datum datum instance
+   * @param curTime reference time (usually set to the time when the
+   * fetchlist generation process was started).
+   * @return true, if the page should be considered for inclusion in the current
+   * fetchlist, otherwise false.
+   */
+  public boolean shouldFetch(UTF8 url, CrawlDatum datum, long curTime) {
+    // pages are never truly GONE - we have to check them from time to time.
+    // pages with too long fetchInterval are adjusted so that they fit within
+    // maximum fetchInterval (segment retention period).
+    if (datum.getFetchTime() - curTime > maxInterval * 1000L) { // maxInterval is in seconds
+      datum.setFetchInterval(maxInterval * 0.9f);
+      datum.setFetchTime(curTime);
+    }
+    if (datum.getFetchTime() > curTime) {
+      return false;                                   // not time yet
+    }
+    return true;
+  }
+  
+  /**
+   * This method resets fetchTime, fetchInterval, modifiedTime and
+   * page signature, so that it forces refetching.
+   * @param url URL of the page
+   * @param datum datum instance
+   * @param asap if true, force refetch as soon as possible - this sets
+   * the fetchTime to now. If false, force refetch whenever the next fetch
+   * time is set.
+   */
+  public void forceRefetch(UTF8 url, CrawlDatum datum, boolean asap) {
+    // reduce fetchInterval so that it fits within the max value
+    if (datum.getFetchInterval() > maxInterval)
+      datum.setFetchInterval(maxInterval * 0.9f);
+    datum.setStatus(CrawlDatum.STATUS_DB_UNFETCHED);
+    datum.setSignature(null);
+    datum.setModifiedTime(0L);
+    if (asap) datum.setFetchTime(System.currentTimeMillis());
+  }
+
+}

Property changes on: src/java/org/apache/nutch/crawl/FetchSchedule.java
___________________________________________________________________
Name: svn:eol-style
   + native

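For implementors, a hypothetical minimal subclass showing the contract above
(the class name and the fixed one-week interval are invented for illustration;
DefaultFetchSchedule below is the real reference implementation):

    package org.apache.nutch.crawl;

    import org.apache.hadoop.io.UTF8;

    public class FixedFetchSchedule extends FetchSchedule {
      private static final long WEEK_MS = 7L * 24L * 3600L * 1000L;

      public CrawlDatum setFetchSchedule(UTF8 url, CrawlDatum datum,
              long prevFetchTime, long prevModifiedTime,
              long fetchTime, long modifiedTime, int state) {
        // re-fetch a week after the last fetch, ignoring the change state
        datum.setFetchTime(fetchTime + WEEK_MS);
        datum.setModifiedTime(modifiedTime);
        return datum;
      }
    }
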
Index: src/java/org/apache/nutch/crawl/DefaultFetchSchedule.java
===================================================================
--- src/java/org/apache/nutch/crawl/DefaultFetchSchedule.java	(revision 0)
+++ src/java/org/apache/nutch/crawl/DefaultFetchSchedule.java	(revision 0)
@@ -0,0 +1,37 @@
+/**
+ * Copyright 2006 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.crawl;
+
+import org.apache.hadoop.io.UTF8;
+
+/**
+ * This class implements the default re-fetch schedule. That is, whether or
+ * not the page changed, the <code>fetchInterval</code> remains
+ * unchanged, and the updated page fetchTime will always be set to
+ * <code>fetchTime + fetchInterval * 1000</code>.
+ * 
+ * @author Andrzej Bialecki
+ */
+public class DefaultFetchSchedule extends FetchSchedule {
+
+  public CrawlDatum setFetchSchedule(UTF8 url, CrawlDatum datum,
+          long prevFetchTime, long prevModifiedTime,
+          long fetchTime, long modifiedTime, int state) {
+    datum.setFetchTime(fetchTime + Math.round(datum.getFetchInterval() * 1000.0f));
+    datum.setModifiedTime(modifiedTime);
+    return datum;
+  }
+}

Property changes on: src/java/org/apache/nutch/crawl/DefaultFetchSchedule.java
___________________________________________________________________
Name: svn:eol-style
   + native

Index: src/java/org/apache/nutch/crawl/Injector.java
===================================================================
--- src/java/org/apache/nutch/crawl/Injector.java	(revision 417282)
+++ src/java/org/apache/nutch/crawl/Injector.java	(working copy)
@@ -47,7 +47,7 @@
     private float scoreInjected;
     private JobConf jobConf;
     private URLFilters filters;
-    private ScoringFilters scfilters; 
+    private ScoringFilters scfilters;
 
     public void configure(JobConf job) {
       this.jobConf = job;
@@ -93,13 +93,90 @@
 
   /** Combine multiple new entries for a url. */
   public static class InjectReducer implements Reducer {
-    public void configure(JobConf job) {}
+    private static final int RESET_NONE     = 0x0000;
+    private static final int RESET_SCORE    = 0x0001;
+    private static final int RESET_SCHEDULE = 0x0002;
+    private static final int RESET_METADATA = 0x0004;
+    private static final int RESET_ALL      = 0x00ff;
+    
+    private static final int[] masks = {
+      RESET_NONE,
+      RESET_SCORE,
+      RESET_SCHEDULE,
+      RESET_METADATA,
+      RESET_ALL
+    };
+    private static final String[] maskNames = {
+      "none",
+      "score",
+      "schedule",
+      "metadata",
+      "all"
+    };
+    
+    private CrawlDatum injected, existing;
+    private int resetMode;
+    private FetchSchedule schedule;
+    private ScoringFilters scfilters;
+    private float scoreInjected;
+    
+    public void configure(JobConf job) {
+      String mode = job.get("db.injected.reset.mask", "none");
+      List names = Arrays.asList(mode.toLowerCase().split("\\s+"));
+      resetMode = RESET_NONE;
+      for (int i = 0; i < maskNames.length; i++) {
+        if (names.contains(maskNames[i])) resetMode |= masks[i];
+      }
+      scfilters = new ScoringFilters(job);
+      scoreInjected = job.getFloat("db.score.injected", 1.0f);
+      schedule = FetchScheduleFactory.getFetchSchedule(job);
+    }
+    
     public void close() {}
 
     public void reduce(WritableComparable key, Iterator values,
                        OutputCollector output, Reporter reporter)
       throws IOException {
-      output.collect(key, (Writable)values.next()); // just collect first value
+      // instance fields persist across reduce() calls, so reset them per key
+      injected = null;
+      existing = null;
+      // there can be at most one value with status != STATUS_INJECTED,
+      // and we also use only one value with status == STATUS_INJECTED
+      while (values.hasNext()) {
+        CrawlDatum datum = (CrawlDatum)values.next();
+        if (datum.getStatus() != CrawlDatum.STATUS_INJECTED) {
+          existing = datum;
+        } else {
+          injected = datum;
+        }
+      }
+      // set the status properly
+      if (injected != null) injected.setStatus(CrawlDatum.STATUS_DB_UNFETCHED);
+      
+      if (existing != null) {
+        if (injected == null) {
+          output.collect(key, existing);    // no update
+        } else {
+          // check if we need to reset some values in the existing copy
+          if ((resetMode & RESET_SCORE) != 0) {
+            try {
+              scfilters.initialScore((UTF8)key, existing);
+            } catch (Exception e) {
+              LOG.warn("Couldn't filter initial score, key " + key + ": " + e.getMessage());
+              existing.setScore(scoreInjected);
+            }
+          }
+          if ((resetMode & RESET_SCHEDULE) != 0) {
+            schedule.initializeSchedule((UTF8)key, existing);
+          }
+          if ((resetMode & RESET_METADATA) != 0) {
+            existing.setMetaData(new MapWritable());
+          }
+          output.collect(key, existing);
+        }
+      } else {
+        output.collect(key, injected);
+      }
     }
   }
 
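The reset mask handling above ORs whitespace-separated names into a bit field.
A standalone sketch of the same parsing, assuming db.injected.reset.mask is set
to "score schedule" (the class name is illustrative):

    import java.util.Arrays;
    import java.util.List;

    public class ResetMaskDemo {
      static final int RESET_NONE = 0x0000, RESET_SCORE = 0x0001,
          RESET_SCHEDULE = 0x0002, RESET_METADATA = 0x0004, RESET_ALL = 0x00ff;
      static final int[] masks =
          { RESET_NONE, RESET_SCORE, RESET_SCHEDULE, RESET_METADATA, RESET_ALL };
      static final String[] maskNames = { "none", "score", "schedule", "metadata", "all" };

      public static void main(String[] args) {
        List names = Arrays.asList("score schedule".toLowerCase().split("\\s+"));
        int resetMode = RESET_NONE;
        for (int i = 0; i < maskNames.length; i++) {
          if (names.contains(maskNames[i])) resetMode |= masks[i];
        }
        System.out.println("resetMode = 0x" + Integer.toHexString(resetMode)); // 0x3
      }
    }
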
Index: src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java
===================================================================
--- src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java	(revision 0)
+++ src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java	(revision 0)
@@ -0,0 +1,155 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.crawl;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.UTF8;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.util.NutchConfiguration;
+
+/**
+ * This class implements an adaptive re-fetch algorithm. This works as follows:
+ * <ul>
+ * <li>for pages that changed since the last fetchTime, decrease their
+ * fetchInterval by a factor of DEC_RATE (default value is 0.2f).</li>
+ * <li>for pages that did not change since the last fetchTime, increase their
+ * fetchInterval by a factor of INC_RATE (default value is 0.2f).<br>
+ * If SYNC_DELTA property is true, then:
+ * <ul>
+ * <li>calculate a <code>delta = fetchTime - modifiedTime</code></li>
+ * <li>try to synchronize with the time of change, by shifting the next fetchTime
+ * by a fraction of the difference between the last modification time and the last
+ * fetch time. I.e. the next fetch time will be set to
+ * <code>fetchTime + fetchInterval - delta * SYNC_DELTA_RATE</code></li>
+ * <li>if the delta is bigger than the adjusted fetch interval, then <code>fetchInterval = delta</code>.</li>
+ * </ul>
+ * </li>
+ * <li>the minimum value of fetchInterval may not be smaller than MIN_INTERVAL
+ * (default is 60 seconds).</li>
+ * <li>the maximum value of fetchInterval may not be bigger than MAX_INTERVAL
+ * (default is 365 days).</li>
+ * </ul>
+ * <p>NOTE: values of DEC_RATE and INC_RATE higher than 0.4f may destabilize the algorithm,
+ * so that the fetch interval either increases or decreases infinitely, with little
+ * relevance to the page changes. Please use the {@link #main(String[])} method to
+ * test the values before applying them in a production system.</p>
+ * 
+ * @author Andrzej Bialecki
+ */
+public class AdaptiveFetchSchedule extends FetchSchedule {
+
+  private float INC_RATE;
+
+  private float DEC_RATE;
+
+  private float MAX_INTERVAL;
+
+  private float MIN_INTERVAL;
+  
+  private boolean SYNC_DELTA;
+
+  private float SYNC_DELTA_RATE;
+  
+  public void setConf(Configuration conf) {
+    super.setConf(conf);
+    if (conf == null) return;
+    INC_RATE = conf.getFloat("db.fetch.schedule.adaptive.inc_rate", 0.2f);
+    DEC_RATE = conf.getFloat("db.fetch.schedule.adaptive.dec_rate", 0.2f);
+    MIN_INTERVAL = conf.getFloat("db.fetch.schedule.adaptive.min_interval", 60.0f);
+    MAX_INTERVAL = conf.getFloat("db.fetch.schedule.adaptive.max_interval", (float) (3600 * 24 * 365)); // 1 year
+    SYNC_DELTA = conf.getBoolean("db.fetch.schedule.adaptive.sync_delta", true);
+    SYNC_DELTA_RATE = conf.getFloat("db.fetch.schedule.adaptive.sync_delta_rate", 0.2f);
+  }
+
+  public CrawlDatum setFetchSchedule(UTF8 url, CrawlDatum datum,
+          long prevFetchTime, long prevModifiedTime,
+          long fetchTime, long modifiedTime, int state) {
+    long refTime = fetchTime;
+    if (modifiedTime <= 0) modifiedTime = fetchTime;
+    float interval = datum.getFetchInterval();
+    switch (state) {
+      case FetchSchedule.STATUS_MODIFIED:
+        interval *= (1.0f - DEC_RATE);
+        break;
+      case FetchSchedule.STATUS_UNMODIFIED:
+        interval *= (1.0f + INC_RATE);
+        break;
+      case FetchSchedule.STATUS_UNKNOWN:
+        break;
+    }
+    if (SYNC_DELTA) {
+      // try to synchronize with the time of change
+      long delta = fetchTime - modifiedTime;          // in milliseconds
+      if (delta > interval * 1000L) interval = delta / 1000.0f;
+      refTime = fetchTime - Math.round(delta * SYNC_DELTA_RATE);
+    }
+    if (interval < MIN_INTERVAL) interval = MIN_INTERVAL;
+    if (interval > MAX_INTERVAL) interval = MAX_INTERVAL;
+    datum.setFetchInterval(interval);                 // apply the clamped value
+    datum.setFetchTime(refTime + Math.round(1000.0f * interval));
+    datum.setModifiedTime(modifiedTime);
+    return datum;
+  }
+
+  public static void main(String[] args) throws Exception {
+    FetchSchedule fs = new AdaptiveFetchSchedule();
+    fs.setConf(NutchConfiguration.create());
+    // we start the time at 0, for simplicity
+    long curTime = 0;
+    long delta = 1000L * 3600L * 24L; // 1 day
+    // we trigger the update of the page every 30 days
+    long update = 1000L * 3600L * 24L * 30L; // 30 days
+    boolean changed = true;
+    long lastModified = 0;
+    int miss = 0;
+    int totalMiss = 0;
+    int maxMiss = 0;
+    int fetchCnt = 0;
+    int changeCnt = 0;
+    // initial fetchInterval is 30 days
+    CrawlDatum p = new CrawlDatum(1, 3600 * 24 * 30, 1.0f);
+    p.setFetchTime(0);
+    System.out.println(p);
+    // step through the simulated timeline, one delta per iteration
+    for (int i = 0; i < 10000; i++) {
+      if (lastModified + update < curTime) {
+        //System.out.println("i=" + i + ", lastModified=" + lastModified + ", update=" + update + ", curTime=" + curTime);
+        changed = true;
+        changeCnt++;
+        lastModified = curTime;
+      }
+      System.out.println(i + ". " + changed + "\twill fetch at " + (p.getFetchTime() / delta) + "\tinterval "
+              + (p.getFetchInterval() / (float) (3600 * 24)) + " days" + "\t missed " + miss);
+      if (p.getFetchTime() <= curTime) {
+        fetchCnt++;
+        fs.setFetchSchedule(new UTF8("http://www.example.com"), p,
+                p.getFetchTime(), p.getModifiedTime(), curTime, lastModified,
+                changed ? FetchSchedule.STATUS_MODIFIED : FetchSchedule.STATUS_UNMODIFIED);
+        System.out.println("\tfetched & adjusted: " + "\twill fetch at " + (p.getFetchTime() / delta) + "\tinterval "
+                + (p.getFetchInterval() / (float) (3600 * 24)) + " days");
+        if (!changed) miss++;
+        if (miss > maxMiss) maxMiss = miss;
+        changed = false;
+        totalMiss += miss;
+        miss = 0;
+      }
+      if (changed) miss++;
+      curTime += delta;
+    }
+    System.out.println("Total missed: " + totalMiss + ", max miss: " + maxMiss);
+    System.out.println("Page changed " + changeCnt + " times, fetched " + fetchCnt + " times.");
+  }
+}

Property changes on: src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java
___________________________________________________________________
Name: svn:eol-style
   + native

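A worked example of the sync_delta adjustment implemented above (standalone;
all values are illustrative):

    public class SyncDeltaDemo {
      public static void main(String[] args) {
        long day = 24L * 3600L * 1000L;           // one day in milliseconds
        long fetchTime = 100L * day;              // fetched at day 100
        long modifiedTime = 97L * day;            // last changed at day 97
        float syncDeltaRate = 0.3f;               // db.fetch.schedule.adaptive.sync_delta_rate
        long delta = fetchTime - modifiedTime;    // 3 days
        // shift the reference time back toward the change by 0.9 days
        long refTime = fetchTime - Math.round(delta * syncDeltaRate);
        System.out.println("refTime = day " + (refTime / (float) day)); // day 99.1
      }
    }
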
Index: src/java/org/apache/nutch/crawl/Generator.java
===================================================================
--- src/java/org/apache/nutch/crawl/Generator.java	(revision 417282)
+++ src/java/org/apache/nutch/crawl/Generator.java	(working copy)
@@ -72,6 +72,7 @@
     private Partitioner hostPartitioner = new PartitionUrlByHost();
     private URLFilters filters;
     private ScoringFilters scfilters;
+    private FetchSchedule schedule;
     private SelectorEntry entry = new SelectorEntry();
     private FloatWritable sortValue = new FloatWritable();
     private boolean byIP;
@@ -84,6 +85,7 @@
       byIP = job.getBoolean("generate.max.per.host.by.ip", false);
       filters = new URLFilters(job);
       scfilters = new ScoringFilters(job);
+      schedule = FetchScheduleFactory.getFetchSchedule(job);
     }
 
     public void close() {}
@@ -104,12 +106,9 @@
       }
       CrawlDatum crawlDatum = (CrawlDatum)value;
 
-      if (crawlDatum.getStatus() == CrawlDatum.STATUS_DB_GONE)
-        return;                                   // don't retry
+      if (!schedule.shouldFetch((UTF8)key, crawlDatum, curTime))
+        return;
 
-      if (crawlDatum.getFetchTime() > curTime)
-        return;                                   // not time yet
-
       float sort = 1.0f;
       try {
         sort = scfilters.generatorSortValue((UTF8)key, crawlDatum, sort);
Index: src/java/org/apache/nutch/crawl/CrawlDbReducer.java
===================================================================
--- src/java/org/apache/nutch/crawl/CrawlDbReducer.java	(revision 417282)
+++ src/java/org/apache/nutch/crawl/CrawlDbReducer.java	(working copy)
@@ -36,12 +36,16 @@
   private CrawlDatum result = new CrawlDatum();
   private ArrayList linked = new ArrayList();
   private ScoringFilters scfilters = null;
+  private FetchSchedule schedule = null;
   private float scoreInjected;
+  private float maxInterval;
 
   public void configure(JobConf job) {
     retryMax = job.getInt("db.fetch.retry.max", 3);
     scfilters = new ScoringFilters(job);
     scoreInjected = job.getFloat("db.score.injected", 1.0f);
+    schedule = FetchScheduleFactory.getFetchSchedule(job);
+    maxInterval = (float)(job.getInt("db.fetch.interval.max", 30 * 3600 * 24));
   }
 
   public void close() {}
@@ -53,6 +57,7 @@
     CrawlDatum highest = null;
     CrawlDatum old = null;
     byte[] signature = null;
+    UTF8 url = (UTF8)key;
     linked.clear();
 
     while (values.hasNext()) {
@@ -76,7 +81,7 @@
       }
     }
 
-    // initialize with the latest version
+    // initialize with the latest version.
     result.set(highest);
     if (old != null) {
       // copy metadata from old, if exists
@@ -91,57 +96,97 @@
         result.setModifiedTime(old.getModifiedTime());
       }
     }
+    
+    int changedStatus = FetchSchedule.STATUS_UNKNOWN;
+    long prevFetchTime = 0L;
+    long prevModifiedTime = 0L;
+    // set old times as an initial reference
+    if (old != null) {
+      prevFetchTime = old.getFetchTime();
+      prevModifiedTime = old.getModifiedTime();
+    }
 
     switch (highest.getStatus()) {                // determine new status
 
     case CrawlDatum.STATUS_DB_UNFETCHED:          // no new entry
     case CrawlDatum.STATUS_DB_FETCHED:
     case CrawlDatum.STATUS_DB_GONE:
-      result.set(old);                            // use old
       break;
 
     case CrawlDatum.STATUS_LINKED:                // highest was link
-      if (old != null) {                          // if old exists
-        result.set(old);                          // use it
-      } else {
+      if (old == null) {                          // new page
         result.setStatus(CrawlDatum.STATUS_DB_UNFETCHED);
+        result.setModifiedTime(0L);
         try {
-          scfilters.initialScore((UTF8)key, result);
+          scfilters.initialScore(url, result);
         } catch (ScoringFilterException e) {
-          if (LOG.isWarnEnabled()) {
-            LOG.warn("Cannot filter init score for url " + key +
-                     ", using default: " + e.getMessage());
-          }
-          result.setScore(scoreInjected);
+          LOG.warn("Cannot filter init score for url " + key +
+                   ", using default: " + e.getMessage());
         }
+        schedule.initializeSchedule(url, result);
       }
       break;
       
     case CrawlDatum.STATUS_FETCH_SUCCESS:         // succesful fetch
-      if (highest.getSignature() == null) result.setSignature(signature);
+    case CrawlDatum.STATUS_FETCH_UNMODIFIED:
+      int status = result.getStatus();
       result.setStatus(CrawlDatum.STATUS_DB_FETCHED);
-      result.setNextFetchTime();
+      result.setRetriesSinceFetch(0);
+      // apply the FetchSchedule
+      if (status == CrawlDatum.STATUS_FETCH_UNMODIFIED) {
+        changedStatus = FetchSchedule.STATUS_UNMODIFIED;
+        // use old signature, because the new one wasn't computed (no content)
+        result.setSignature(old.getSignature());
+      } else {
+        if (result.getSignature() == null) result.setSignature(signature);
+        // don't believe the protocol layer, check it here ...
+        if (old == null) {
+          changedStatus = FetchSchedule.STATUS_UNKNOWN;
+        } else if (SignatureComparator._compare(old.getSignature(), result.getSignature()) != 0) {
+          changedStatus = FetchSchedule.STATUS_MODIFIED;
+        } else {
+          changedStatus = FetchSchedule.STATUS_UNMODIFIED;
+        }
+        result = schedule.setFetchSchedule(url, result,
+                prevFetchTime, prevModifiedTime,
+                result.getFetchTime(), result.getModifiedTime(), changedStatus);
+        // if fetchInterval is larger than the system-wide maximum, trigger
+        // an unconditional re-crawl as soon as possible (the chances are that
+        // it was a border-case already, so that old copies are about to expire).
+        // This prevents pages from being stuck in the UNMODIFIED state, when
+        // the old fetched copy was already phased-out with old segments.
+        if (maxInterval < result.getFetchInterval()) {
+          schedule.forceRefetch(url, result, true);
+        }
+      }
       break;
 
     case CrawlDatum.STATUS_SIGNATURE:
-      if (LOG.isWarnEnabled()) {
-        LOG.warn("Lone CrawlDatum.STATUS_SIGNATURE: " + key);
-      }   
+      LOG.warn("Lone CrawlDatum.STATUS_SIGNATURE: " + key);   
       return;
     case CrawlDatum.STATUS_FETCH_RETRY:           // temporary failure
-      if (old != null)
+      if (old != null) {
         result.setSignature(old.getSignature());  // use old signature
-      if (highest.getRetriesSinceFetch() < retryMax) {
+      }
+      // increase the retry counter
+      result.setRetriesSinceFetch(result.getRetriesSinceFetch() + 1);
+      if (result.getRetriesSinceFetch() < retryMax) {
         result.setStatus(CrawlDatum.STATUS_DB_UNFETCHED);
       } else {
         result.setStatus(CrawlDatum.STATUS_DB_GONE);
       }
+      result = schedule.setPageRetrySchedule(url, result,
+              prevFetchTime, prevModifiedTime, result.getFetchTime());
       break;
 
     case CrawlDatum.STATUS_FETCH_GONE:            // permanent failure
-      if (old != null)
+      if (old != null) {
         result.setSignature(old.getSignature());  // use old signature
+        result.setModifiedTime(old.getModifiedTime());
+      }
       result.setStatus(CrawlDatum.STATUS_DB_GONE);
+      result = schedule.setPageGoneSchedule(url, result,
+              prevFetchTime, prevModifiedTime, result.getFetchTime());
       break;
 
     default:
@@ -151,11 +196,9 @@
     try {
       scfilters.updateDbScore((UTF8)key, result, linked);
     } catch (Exception e) {
-      if (LOG.isWarnEnabled()) {
-        LOG.warn("Couldn't update score, key=" + key + ": " + e);
-      }
+      LOG.warn("Couldn't update score, key=" + key + ": " + e);
     }
     output.collect(key, result);
   }
 
 }
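
The change-detection decision above can be restated in isolation as follows (a
sketch; the class and method names are illustrative, but SignatureComparator._compare
is the comparator the reducer actually calls):

    import org.apache.nutch.crawl.FetchSchedule;
    import org.apache.nutch.crawl.SignatureComparator;

    public class ChangeStatusDemo {
      // mirrors the decision made in CrawlDbReducer for STATUS_FETCH_SUCCESS
      static int changedStatus(byte[] oldSig, byte[] newSig) {
        if (oldSig == null) return FetchSchedule.STATUS_UNKNOWN;    // no previous fetch
        if (SignatureComparator._compare(oldSig, newSig) != 0)
          return FetchSchedule.STATUS_MODIFIED;
        return FetchSchedule.STATUS_UNMODIFIED;
      }

      public static void main(String[] args) {
        byte[] a = { 1, 2 }, b = { 1, 3 };
        System.out.println(changedStatus(a, b));  // 1 == STATUS_MODIFIED
      }
    }
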
Index: src/java/org/apache/nutch/crawl/FetchScheduleFactory.java
===================================================================
--- src/java/org/apache/nutch/crawl/FetchScheduleFactory.java	(revision 0)
+++ src/java/org/apache/nutch/crawl/FetchScheduleFactory.java	(revision 0)
@@ -0,0 +1,47 @@
+/**
+ * Copyright 2006 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.crawl;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+
+/** Creates and caches a {@link FetchSchedule} implementation. */
+public class FetchScheduleFactory {
+
+  public static final Log LOG = LogFactory.getLog(FetchScheduleFactory.class);
+
+  private FetchScheduleFactory() {}                   // no public ctor
+
+  /** Return the FetchSchedule implementation. */
+  public static FetchSchedule getFetchSchedule(Configuration conf) {
+    String clazz = conf.get("db.fetch.schedule.class", DefaultFetchSchedule.class.getName());
+    FetchSchedule impl = (FetchSchedule)conf.getObject(clazz);
+    if (impl == null) {
+      try {
+        LOG.info("Using FetchSchedule impl: " + clazz);
+        Class implClass = Class.forName(clazz);
+        impl = (FetchSchedule)implClass.newInstance();
+        impl.setConf(conf);
+        conf.setObject(clazz, impl);
+      } catch (Exception e) {
+        throw new RuntimeException("Couldn't create " + clazz, e);
+      }
+    }
+    return impl;
+  }
+}

Property changes on: src/java/org/apache/nutch/crawl/FetchScheduleFactory.java
___________________________________________________________________
Name: svn:eol-style
   + native

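A minimal usage sketch of the factory, mirroring the call sites added to
Generator and CrawlDbReducer in this patch:

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.io.UTF8;
    import org.apache.nutch.crawl.CrawlDatum;
    import org.apache.nutch.crawl.FetchSchedule;
    import org.apache.nutch.crawl.FetchScheduleFactory;
    import org.apache.nutch.util.NutchConfiguration;

    public class FetchScheduleUsage {
      public static void main(String[] args) {
        Configuration conf = NutchConfiguration.create();
        // returns the configured db.fetch.schedule.class, cached per Configuration
        FetchSchedule schedule = FetchScheduleFactory.getFetchSchedule(conf);
        CrawlDatum datum = new CrawlDatum();
        schedule.initializeSchedule(new UTF8("http://www.example.com/"), datum);
        System.out.println(datum);
      }
    }
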
Index: src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java
===================================================================
--- src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java	(revision 417282)
+++ src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java	(working copy)
@@ -21,7 +21,6 @@
 
 import org.apache.nutch.util.NutchConfiguration;
 import org.apache.nutch.util.SuffixStringMatcher;
-import org.apache.nutch.util.TrieStringMatcher;
 
 import org.apache.nutch.plugin.Extension;
 import org.apache.nutch.plugin.PluginRepository;
Index: src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java
===================================================================
--- src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java	(revision 417282)
+++ src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java	(working copy)
@@ -50,18 +50,10 @@
 
   int maxContentLength;
 
-  // 20040412, xing
-  // the following three: HttpDateFormat, MimetypesFileTypeMap, MagicFile
-  // are placed in each thread before we check out if they're thread-safe.
-
-  // http date format
-  HttpDateFormat httpDateFormat = null;
-
   private Configuration conf;
 
   // constructor
   public File() {
-    this.httpDateFormat = new HttpDateFormat();
   }
 
   /** Set the point at which content is truncated. */
Index: src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java
===================================================================
--- src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java	(revision 417282)
+++ src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java	(working copy)
@@ -26,6 +26,7 @@
 import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.protocol.Content;
 import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.net.protocols.HttpDateFormat;
 import org.apache.nutch.net.protocols.Response;
 
 // Hadoop imports
@@ -80,7 +81,7 @@
   public byte[] getContent() { return content; }
 
   public Content toContent() {
-    return new Content(orig, base, content,
+    return new Content(orig, base, (content != null) ? content : new byte[0],
                        getHeader(Response.CONTENT_TYPE),
                        headers, this.conf);
   }
@@ -137,7 +138,11 @@
         this.code = 300;  // http redirect
         return;
       }
-
+      if (f.exists() && f.lastModified() <= datum.getModifiedTime()) {
+        this.code = 304;
+        this.headers.set("Last-Modified", HttpDateFormat.toString(f.lastModified()));
+        return;
+      }
       if (f.isDirectory()) {
         getDirAsHttpResponse(f);
       } else if (f.isFile()) {
@@ -190,9 +195,10 @@
 
     // set headers
     headers.set(Response.CONTENT_LENGTH, new Long(size).toString());
+    headers.set(Response.CONTENT_TYPE, "");   // No Content-Type at file protocol level
     headers.set(Response.LAST_MODIFIED,
-      this.file.httpDateFormat.toString(f.lastModified()));
-    headers.set(Response.CONTENT_TYPE, "");   // No Content-Type at file protocol level
+            HttpDateFormat.toString(f.lastModified()));
+    
 
     // response code
     this.code = 200; // http OK
@@ -210,7 +216,7 @@
       new Integer(this.content.length).toString());
     headers.set(Response.CONTENT_TYPE, "text/html");
     headers.set(Response.LAST_MODIFIED,
-      this.file.httpDateFormat.toString(f.lastModified()));
+            HttpDateFormat.toString(f.lastModified()));
 
     // response code
     this.code = 200; // http OK
@@ -234,7 +240,7 @@
     for (int i=0; i<list.length; i++) {
       f = list[i];
       String name = f.getName();
-      String time = this.file.httpDateFormat.toString(f.lastModified());
+      String time = HttpDateFormat.toString(f.lastModified());
       if (f.isDirectory()) {
         // java 1.4.2 api says dir itself and parent dir are not listed
         // so the following is not needed.
Index: src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
===================================================================
--- src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java	(revision 417282)
+++ src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java	(working copy)
@@ -29,6 +29,7 @@
 
 // Nutch imports
 import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.net.protocols.HttpDateFormat;
 import org.apache.nutch.net.protocols.Response;
 import org.apache.nutch.protocol.Content;
 import org.apache.nutch.protocol.Protocol;
@@ -193,13 +194,17 @@
       
       int code = response.getCode();
       byte[] content = response.getContent();
+      long lastModified = 0L;
+      String modified = response.getHeader("Last-Modified");
+      if (modified != null) lastModified = HttpDateFormat.toLong(modified);
       Content c = new Content(u.toString(), u.toString(),
                               (content == null ? EMPTY_CONTENT : content),
                               response.getHeader("Content-Type"),
                               response.getHeaders(), this.conf);
       
       if (code == 200) { // got a good response
-        return new ProtocolOutput(c); // return it
+        return new ProtocolOutput(c,
+                new ProtocolStatus(ProtocolStatus.SUCCESS, lastModified)); // return it
         
       } else if (code == 410) { // page is gone
         return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.GONE, "Http: " + code + " url=" + url));
@@ -231,7 +236,8 @@
             protocolStatusCode = ProtocolStatus.MOVED;
         }
         // handle this in the higher layer.
-        return new ProtocolOutput(c, new ProtocolStatus(protocolStatusCode, u));
+        return new ProtocolOutput(c,
+                new ProtocolStatus(protocolStatusCode, u, lastModified));
       } else if (code == 400) { // bad request, mark as GONE
         if (logger.isTraceEnabled()) { logger.trace("400 Bad request: " + u); }
         return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.GONE, u));
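
A small sketch of the Last-Modified parsing added above (HttpDateFormat.toLong
is the parser this patch uses; the header value is illustrative):

    import org.apache.nutch.net.protocols.HttpDateFormat;

    public class LastModifiedParseDemo {
      public static void main(String[] args) throws Exception {
        String modified = "Tue, 06 Jun 2006 00:00:00 GMT";  // e.g. response.getHeader("Last-Modified")
        long lastModified = 0L;
        if (modified != null) lastModified = HttpDateFormat.toLong(modified);
        System.out.println("lastModified = " + lastModified + " ms since epoch");
      }
    }
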
Index: src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java
===================================================================
--- src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java	(revision 417282)
+++ src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java	(working copy)
@@ -27,6 +27,7 @@
 import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.protocol.Content;
 import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.net.protocols.HttpDateFormat;
 import org.apache.nutch.net.protocols.Response;
 import org.apache.nutch.util.LogUtil;
 
@@ -77,7 +78,7 @@
   public byte[] getContent() { return content; }
 
   public Content toContent() {
-    return new Content(orig, base, content,
+    return new Content(orig, base, (content != null) ? content : new byte[0],
                        getHeader(Response.CONTENT_TYPE),
                        headers, this.conf);
   }
@@ -256,9 +257,9 @@
       this.content = null;
 
       if (path.endsWith("/")) {
-        getDirAsHttpResponse(path);
+        getDirAsHttpResponse(path, datum.getModifiedTime());
       } else {
-        getFileAsHttpResponse(path);
+        getFileAsHttpResponse(path, datum.getModifiedTime());
       }
 
       // reset next renewalTime, take the lesser
@@ -267,7 +268,7 @@
           + ((ftp.timeout<ftp.serverTimeout) ? ftp.timeout : ftp.serverTimeout);
         if ((ftp.followTalk) && (Ftp.LOG.isInfoEnabled())) {
           Ftp.LOG.info("reset renewalTime to "
-            +ftp.httpDateFormat.toString(ftp.renewalTime));
+            + HttpDateFormat.toString(ftp.renewalTime));
         }
       }
 
@@ -307,7 +308,7 @@
   }
 
   // get ftp file as http response
-  private void getFileAsHttpResponse(String path)
+  private void getFileAsHttpResponse(String path, long lastModified)
     throws IOException {
 
     ByteArrayOutputStream os = null;
@@ -318,15 +319,21 @@
       list = new LinkedList();
       ftp.client.retrieveList(path, list, ftp.maxContentLength, ftp.parser);
 
-      os = new ByteArrayOutputStream(ftp.BUFFER_SIZE);
-      ftp.client.retrieveFile(path, os, ftp.maxContentLength);
-
       FTPFile ftpFile = (FTPFile) list.get(0);
       this.headers.set(Response.CONTENT_LENGTH,
                        new Long(ftpFile.getSize()).toString());
       //this.headers.put("content-type", "text/html");
       this.headers.set(Response.LAST_MODIFIED,
-                       ftp.httpDateFormat.toString(ftpFile.getTimestamp()));
+              HttpDateFormat.toString(ftpFile.getTimestamp()));
+      
+      // don't retrieve the file if not changed.
+      if (ftpFile.getTimestamp().getTimeInMillis() <= lastModified) {
+        code = 304;
+        return;
+      }
+
+      os = new ByteArrayOutputStream(ftp.BUFFER_SIZE);
+      ftp.client.retrieveFile(path, os, ftp.maxContentLength);
       this.content = os.toByteArray();
 
 //      // approximate bytes sent and read
@@ -365,7 +372,7 @@
                        new Long(ftpFile.getSize()).toString());
       //this.headers.put("content-type", "text/html");
       this.headers.set(Response.LAST_MODIFIED,
-                      ftp.httpDateFormat.toString(ftpFile.getTimestamp()));
+                      HttpDateFormat.toString(ftpFile.getTimestamp()));
       this.content = os.toByteArray();
 
 //      // approximate bytes sent and read
@@ -374,7 +381,9 @@
 //        this.httpAccounting.incrementBytesRead(this.content.length);
 //      }
 
-      this.code = 200; // http OK
+      if (ftpFile.getTimestamp().getTimeInMillis() <= lastModified) {
+        this.code = 304; // not modified
+      } else this.code = 200; // http OK
 
     } catch (FtpExceptionCanNotHaveDataConnection e) {
 
@@ -403,7 +412,7 @@
   }
 
   // get ftp dir list as http response
-  private void getDirAsHttpResponse(String path)
+  private void getDirAsHttpResponse(String path, long lastModified)
     throws IOException {
     List list = new LinkedList();
 
@@ -487,7 +496,7 @@
     for (int i=0; i<list.size(); i++) {
       FTPFile f = (FTPFile) list.get(i);
       String name = f.getName();
-      String time = ftp.httpDateFormat.toString(f.getTimestamp());
+      String time = HttpDateFormat.toString(f.getTimestamp());
       if (f.isDirectory()) {
         // some ftp server LIST "." and "..", we skip them here
         if (name.equals(".") || name.equals(".."))
Index: src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java
===================================================================
--- src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java	(revision 417282)
+++ src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java	(working copy)
@@ -76,19 +76,11 @@
   // ftp dir list entry parser
   FTPFileEntryParser parser = null;
 
-  // 20040412, xing
-  // the following three: HttpDateFormat, MimetypesFileTypeMap, MagicFile
-  // are placed in each thread before we check out if they're thread-safe.
-
-  // http date format
-  HttpDateFormat httpDateFormat = null;
-
   private Configuration conf;
 
 
   // constructor
   public Ftp() {
-    this.httpDateFormat = new HttpDateFormat();
   }
 
   /** Set the timeout. */
Index: src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
===================================================================
--- src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java	(revision 417282)
+++ src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java	(working copy)
@@ -31,10 +31,15 @@
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 
+import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.metadata.Metadata;
 import org.apache.nutch.net.protocols.Response;
 import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.ProtocolFactory;
+import org.apache.nutch.protocol.ProtocolOutput;
 import org.apache.hadoop.conf.*;
+import org.apache.hadoop.io.UTF8;
 import org.apache.nutch.parse.*;
 import org.apache.nutch.util.*;
 
@@ -276,18 +281,14 @@
   }
   
   public static void main(String[] args) throws Exception {
-    //LOG.setLevel(Level.FINE);
     String name = args[0];
-    String url = "file:"+name;
-    File file = new File(name);
-    byte[] bytes = new byte[(int)file.length()];
-    DataInputStream in = new DataInputStream(new FileInputStream(file));
-    in.readFully(bytes);
     Configuration conf = NutchConfiguration.create();
+    ProtocolFactory factory = new ProtocolFactory(conf);
+    Protocol p = factory.getProtocol(name);
+    ProtocolOutput output = p.getProtocolOutput(new UTF8(name), new CrawlDatum());
     HtmlParser parser = new HtmlParser();
     parser.setConf(conf);
-    Parse parse = parser.getParse(
-            new Content(url, url, bytes, "text/html", new Metadata(), conf));
+    Parse parse = parser.getParse(output.getContent());
     System.out.println("data: "+parse.getData());
 
     System.out.println("text: "+parse.getText());
Index: src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
===================================================================
--- src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java	(revision 417282)
+++ src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java	(working copy)
@@ -20,7 +20,6 @@
 import java.io.IOException;
 import java.io.InputStream;
 import java.net.URL;
-import java.util.Date;
 
 // Commons Logging imports
 import org.apache.commons.logging.Log;
@@ -36,6 +35,7 @@
 // Nutch imports
 import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.net.protocols.HttpDateFormat;
 import org.apache.nutch.net.protocols.Response;
 import org.apache.nutch.protocol.http.api.HttpBase;
 import org.apache.nutch.util.LogUtil;
@@ -76,6 +76,9 @@
     GetMethod get = new GetMethod(this.orig);
     get.setFollowRedirects(followRedirects);
     get.setRequestHeader("User-Agent", http.getUserAgent());
+    if (datum.getModifiedTime() > 0L) {
+      get.setRequestHeader("If-Modified-Since", HttpDateFormat.toString(datum.getModifiedTime()));
+    }
     HttpMethodParams params = get.getParams();
     if (http.getUseHttp11()) {
       params.setVersion(HttpVersion.HTTP_1_1);
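
Both HTTP plugins now issue a conditional request when a previous modification
time is known. A minimal sketch of the header they construct (the timestamp is
illustrative):

    import org.apache.nutch.net.protocols.HttpDateFormat;

    public class IfModifiedSinceDemo {
      public static void main(String[] args) {
        long modifiedTime = 1149638400000L;       // e.g. datum.getModifiedTime()
        if (modifiedTime > 0L) {                  // only if we have a previous fetch
          System.out.println("If-Modified-Since: " + HttpDateFormat.toString(modifiedTime));
        }
      }
    }
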
Index: src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
===================================================================
--- src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java	(revision 417282)
+++ src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java	(working copy)
@@ -30,6 +30,7 @@
 // Nutch imports
 import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.net.protocols.HttpDateFormat;
 import org.apache.nutch.net.protocols.Response;
 import org.apache.nutch.protocol.ProtocolException;
 import org.apache.nutch.protocol.http.api.HttpBase;
@@ -109,6 +110,10 @@
       reqStr.append(host);
       reqStr.append(portString);
       reqStr.append("\r\n");
+      if (datum.getModifiedTime() > 0) {
+        reqStr.append("If-Modified-Since: " +
+                HttpDateFormat.toString(datum.getModifiedTime()) + "\r\n");
+      }
 
       reqStr.append("Accept-Encoding: x-gzip, gzip\r\n");
 
