Index: src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java
===================================================================
--- src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java	(revision 0)
+++ src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java	(revision 0)
@@ -0,0 +1,161 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.crawl;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.UTF8;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.util.NutchConfiguration;
+
+/**
+ * This class implements an adaptive re-fetch algorithm. This works as follows:
+ * <ul>
+ * <li>for pages that changed since the last fetchTime, decrease the
+ * fetchInterval by a factor of DEC_RATE (default value is 0.2f).</li>
+ * <li>for pages that did not change since the last fetchTime, increase the
+ * fetchInterval by a factor of INC_RATE (default value is 0.2f).<br>
+ * If SYNC_DELTA property is true, then:
+ * <ul>
+ * <li>calculate a <code>delta = fetchTime - modifiedTime</code></li>
+ * <li>try to synchronize with the time of change, by shifting the next fetchTime
+ * by a fraction of the difference between the last modification time and the last
+ * fetch time. I.e. the next fetch time will be set to
+ * <code>fetchTime + fetchInterval - delta * SYNC_DELTA_RATE</code></li>
+ * <li>if the <code>delta</code> is bigger than the adjusted <code>fetchInterval</code>, then <code>fetchInterval = delta</code>.</li>
+ * </ul>
+ * </li>
+ * <li>the value of fetchInterval may not be smaller than MIN_INTERVAL
+ * (default is 60 seconds).</li>
+ * <li>the maximum value of fetchInterval may not be bigger than MAX_INTERVAL
+ * (default is 365 days).</li>
+ * </ul>
+ * <p>NOTE: values of DEC_RATE and INC_RATE higher than 0.4f may destabilize the algorithm,
+ * so that the fetch interval either increases or decreases infinitely, with little
+ * relevance to the page changes. Please use the {@link #main(String[])} method to
+ * test the values before applying them in a production system.</p>
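+ * <p>For example, assuming the default INC_RATE and DEC_RATE of 0.2f and an
+ * initial fetchInterval of 10 days: three consecutive fetches that find the page
+ * unchanged grow the interval to 10 * 1.2<sup>3</sup> = about 17.3 days, while a
+ * subsequent fetch that detects a change shrinks it to 17.3 * 0.8 = about 13.8 days.</p>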
+ * 
+ * @author Andrzej Bialecki
+ */
+public class AdaptiveFetchSchedule implements FetchSchedule {
+
+  private float INC_RATE;
+
+  private float DEC_RATE;
+
+  private float MAX_INTERVAL;
+
+  private float MIN_INTERVAL;
+  
+  private boolean SYNC_DELTA;
+
+  private float SYNC_DELTA_RATE;
+  
+  Configuration conf = null;
+  
+  public Configuration getConf() {
+    return conf;
+  }
+
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+    if (conf == null) return;
+    INC_RATE = conf.getFloat("db.fetch.schedule.adaptive.inc_rate", 0.2f);
+    DEC_RATE = conf.getFloat("db.fetch.schedule.adaptive.dec_rate", 0.2f);
+    MIN_INTERVAL = conf.getFloat("db.fetch.schedule.adaptive.min_interval", 60.0f);
+    MAX_INTERVAL = conf.getFloat("db.fetch.schedule.adaptive.max_interval", (float) (3600 * 24 * 365)); // 1 year
+    SYNC_DELTA = conf.getBoolean("db.fetch.schedule.adaptive.sync_delta", true);
+    SYNC_DELTA_RATE = conf.getFloat("db.fetch.schedule.adaptive.sync_delta_rate", 0.5f);
+  }
+
+  public CrawlDatum setFetchSchedule(UTF8 url, CrawlDatum datum, long fetchTime, long modifiedTime, Boolean changed) {
+    boolean chg = true; // assume it's changed
+    if (changed != null) chg = changed.booleanValue();
+    long refTime = fetchTime;
+    if (modifiedTime <= 0) modifiedTime = fetchTime;
+    float interval = datum.getFetchInterval();
+    if (chg) {
+      interval *= (1.0f - DEC_RATE);
+    } else {
+      interval *= (1.0f + INC_RATE);
+    }
+    if (SYNC_DELTA) {
+      // try to synchronize with the time of change; delta is in seconds
+      long delta = (fetchTime - modifiedTime) / 1000L;
+      if (delta > interval) interval = delta;
+      refTime = fetchTime - Math.round(delta * SYNC_DELTA_RATE) * 1000L;
+    }
+    if (interval < MIN_INTERVAL) interval = MIN_INTERVAL;
+    if (interval > MAX_INTERVAL) interval = MAX_INTERVAL;
+    datum.setFetchInterval(interval);  // store the clamped value
+    datum.setFetchTime(refTime + Math.round(1000.0 * interval)); // 1000.0 (double) so Math.round returns a long
+    datum.setModifiedTime(modifiedTime);
+    return datum;
+  }
+
+  public static void main(String[] args) throws Exception {
+    FetchSchedule fs = new AdaptiveFetchSchedule();
+    fs.setConf(NutchConfiguration.create());
+    // we start the time at 0, for simplicity
+    long curTime = 0;
+    long delta = 1000L * 3600L * 2L; // 2 hours
+    // we trigger the update of the page every 30 days
+    long update = 1000L * 3600L * 24L * 30L; // 30 days
+    boolean changed = true;
+    long lastModified = 0;
+    int miss = 0;
+    int totalMiss = 0;
+    int maxMiss = 0;
+    int fetchCnt = 0;
+    int changeCnt = 0;
+    // initial fetchInterval is 10 days
+    CrawlDatum p = new CrawlDatum(CrawlDatum.STATUS_DB_UNFETCHED, 3600 * 24 * 10, 1.0f);
+    p.setFetchTime(0);
+    System.out.println(p);
+    // let's move the timeline a couple of deltas
+    for (int i = 0; i < 10000; i++) {
+      if (lastModified + update < curTime) {
+        //System.out.println("i=" + i + ", lastModified=" + lastModified + ", update=" + update + ", curTime=" + curTime);
+        changed = true;
+        changeCnt++;
+        lastModified = curTime;
+      }
+      System.out.println(i + ". " + changed + "\twill fetch at " + (p.getFetchTime() / delta) + "\tinterval "
+              + (p.getFetchInterval() / (float) (3600 * 24)) + " days" + "\t missed " + miss);
+      if (p.getFetchTime() <= curTime) {
+        fetchCnt++;
+        fs.setFetchSchedule(new UTF8("http://www.example.com"), p, curTime, lastModified, new Boolean(changed));
+        float interval = p.getFetchInterval();
+        System.out.println("\tfetched & adjusted: " + "\twill fetch at " + (p.getFetchTime() / delta) + "\tinterval "
+                + (p.getFetchInterval() / (float) (3600 * 24)) + " days");
+        if (!changed) miss++;
+        if (miss > maxMiss) maxMiss = miss;
+        changed = false;
+        totalMiss += miss;
+        miss = 0;
+      }
+      if (changed) miss++;
+      curTime += delta;
+    }
+    System.out.println("Total missed: " + totalMiss + ", max miss: " + maxMiss);
+    System.out.println("Page changed " + changeCnt + " times, fetched " + fetchCnt + " times.");
+  }
+}
+
+
Index: src/java/org/apache/nutch/crawl/CrawlDatum.java
===================================================================
--- src/java/org/apache/nutch/crawl/CrawlDatum.java	(revision 492176)
+++ src/java/org/apache/nutch/crawl/CrawlDatum.java	(working copy)
@@ -12,6 +12,9 @@
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
+ *
+ * Modified by Armel T. Nene 12/01/07
+ * Email armel DOT nene AT idna-solutions DOT com
  */
 
 package org.apache.nutch.crawl;
@@ -40,7 +43,8 @@
   public static final byte STATUS_FETCH_SUCCESS = 5;
   public static final byte STATUS_FETCH_RETRY = 6;
   public static final byte STATUS_FETCH_GONE = 7;
+  public static final byte STATUS_FETCH_UNMODIFIED = 8;
   
   public static final String[] statNames = {
     "signature",
     "DB_unfetched",
@@ -49,7 +54,8 @@
     "linked",
     "fetch_success",
     "fetch_retry",
-    "fetch_gone"
+    "fetch_gone",
+    "fetch_unmodified"
   };
 
   private static final float MILLISECONDS_PER_DAY = 24 * 60 * 60 * 1000;
@@ -60,7 +66,7 @@
   private float fetchInterval;
   private float score = 1.0f;
   private byte[] signature = null;
-  private long modifiedTime;
+  private long modifiedTime = 0L;
   private MapWritable metaData;
 
   public CrawlDatum() {}
@@ -85,9 +91,9 @@
   public long getFetchTime() { return fetchTime; }
   public void setFetchTime(long fetchTime) { this.fetchTime = fetchTime; }
 
-  public void setNextFetchTime() {
-    fetchTime += (long)(MILLISECONDS_PER_DAY*fetchInterval);
-  }
+//  public void setNextFetchTime() {
+//    fetchTime += (long)(MILLISECONDS_PER_DAY*fetchInterval);
+//  }
 
   public long getModifiedTime() {
     return modifiedTime;
@@ -288,7 +294,8 @@
     buf.append("Fetch time: " + new Date(getFetchTime()) + "\n");
     buf.append("Modified time: " + new Date(getModifiedTime()) + "\n");
     buf.append("Retries since fetch: " + getRetriesSinceFetch() + "\n");
-    buf.append("Retry interval: " + getFetchInterval() + " days\n");
+    buf.append("Retry interval: " + getFetchInterval() + " seconds (" +
+            (getFetchInterval() / 24.0f / 3600.0f) + " days)\n");
     buf.append("Score: " + getScore() + "\n");
     buf.append("Signature: " + StringUtil.toHexString(getSignature()) + "\n");
     buf.append("Metadata: " + (metaData != null ? metaData.toString() : "null") + "\n");
Index: src/java/org/apache/nutch/crawl/CrawlDbReducer.java
===================================================================
--- src/java/org/apache/nutch/crawl/CrawlDbReducer.java	(revision 492176)
+++ src/java/org/apache/nutch/crawl/CrawlDbReducer.java	(working copy)
@@ -31,129 +31,178 @@
 
 /** Merge new page entries with existing entries. */
 public class CrawlDbReducer implements Reducer {
-  public static final Log LOG = LogFactory.getLog(CrawlDbReducer.class);
-  private int retryMax;
-  private CrawlDatum result = new CrawlDatum();
-  private ArrayList linked = new ArrayList();
-  private ScoringFilters scfilters = null;
-
-  public void configure(JobConf job) {
-    retryMax = job.getInt("db.fetch.retry.max", 3);
-    scfilters = new ScoringFilters(job);
-  }
-
-  public void close() {}
-
-  public void reduce(WritableComparable key, Iterator values,
-                     OutputCollector output, Reporter reporter)
-    throws IOException {
-
-    CrawlDatum highest = null;
-    CrawlDatum old = null;
-    byte[] signature = null;
-    linked.clear();
-
-    while (values.hasNext()) {
-      CrawlDatum datum = (CrawlDatum)values.next();
-
-      if (highest == null || datum.getStatus() > highest.getStatus()) {
-        highest = datum;                          // find highest status
-      }
-
-      switch (datum.getStatus()) {                // find old entry, if any
-      case CrawlDatum.STATUS_DB_UNFETCHED:
-      case CrawlDatum.STATUS_DB_FETCHED:
-      case CrawlDatum.STATUS_DB_GONE:
-        old = datum;
-        break;
-      case CrawlDatum.STATUS_LINKED:
-        linked.add(datum);
-        break;
-      case CrawlDatum.STATUS_SIGNATURE:
-        signature = datum.getSignature();
-      }
+    public static final Log LOG = LogFactory.getLog(CrawlDbReducer.class);
+    private int retryMax;
+    private float maxInterval;
+    private JobConf job;
+    private CrawlDatum result = new CrawlDatum();
+    private ArrayList linked = new ArrayList();
+    private ScoringFilters scfilters = null;
+    
+    public void configure(JobConf job) {
+        this.job = job;
+        retryMax = job.getInt("db.fetch.retry.max", 3);
+        scfilters = new ScoringFilters(job);
+        maxInterval = (float)(job.getInt("db.max.fetch.interval", 30) * 3600 * 24);
     }
-
-    // initialize with the latest version
-    result.set(highest);
-    if (old != null) {
-      // copy metadata from old, if exists
-      if (old.getMetaData().size() > 0) {
-        result.getMetaData().putAll(old.getMetaData());
-        // overlay with new, if any
-        if (highest.getMetaData().size() > 0)
-          result.getMetaData().putAll(highest.getMetaData());
-      }
-      // set the most recent valid value of modifiedTime
-      if (old.getModifiedTime() > 0 && highest.getModifiedTime() == 0) {
-        result.setModifiedTime(old.getModifiedTime());
-      }
-    }
-
-    switch (highest.getStatus()) {                // determine new status
-
-    case CrawlDatum.STATUS_DB_UNFETCHED:          // no new entry
-    case CrawlDatum.STATUS_DB_FETCHED:
-    case CrawlDatum.STATUS_DB_GONE:
-      result.set(old);                            // use old
-      break;
-
-    case CrawlDatum.STATUS_LINKED:                // highest was link
-      if (old != null) {                          // if old exists
-        result.set(old);                          // use it
-      } else {
-        result.setStatus(CrawlDatum.STATUS_DB_UNFETCHED);
+    
+    public void close() {}
+    
+    public void reduce(WritableComparable key, Iterator values,
+            OutputCollector output, Reporter reporter)
+            throws IOException {
+        
+        CrawlDatum highest = null;
+        CrawlDatum old = null;
+        byte[] signature = null;
+        linked.clear();
+        
+        while (values.hasNext()) {
+            CrawlDatum datum = (CrawlDatum)values.next();
+            
+            if (highest == null || datum.getStatus() > highest.getStatus()) {
+                highest = datum;                          // find highest status
+            }
+            
+            switch (datum.getStatus()) {                // find old entry, if any
+                case CrawlDatum.STATUS_DB_UNFETCHED:
+                case CrawlDatum.STATUS_DB_FETCHED:
+                case CrawlDatum.STATUS_DB_GONE:
+                    old = datum;
+                    break;
+                case CrawlDatum.STATUS_LINKED:
+                    linked.add(datum);
+                    break;
+                case CrawlDatum.STATUS_SIGNATURE:
+                    signature = datum.getSignature();
+            }
+        }
+        
+        // initialize with the latest version
+        result.set(highest);
+        if (old != null) {
+            // copy metadata from old, if exists
+            if (old.getMetaData().size() > 0) {
+                result.getMetaData().putAll(old.getMetaData());
+                // overlay with new, if any
+                if (highest.getMetaData().size() > 0)
+                    result.getMetaData().putAll(highest.getMetaData());
+            }
+            // set the most recent valid value of modifiedTime
+            if (old.getModifiedTime() > 0 && highest.getModifiedTime() == 0) {
+                result.setModifiedTime(old.getModifiedTime());
+            }
+        }
+        
+        switch (highest.getStatus()) {                // determine new status
+            
+            case CrawlDatum.STATUS_DB_UNFETCHED:          // no new entry
+            case CrawlDatum.STATUS_DB_FETCHED:
+            case CrawlDatum.STATUS_DB_GONE:
+                result.set(old);                            // use old
+                break;
+                
+            case CrawlDatum.STATUS_LINKED:                // highest was link
+                if (old != null) {                          // if old exists
+                    result.set(old);                          // use it
+                } else {
+                    result.setStatus(CrawlDatum.STATUS_DB_UNFETCHED);
+                    try {
+                        scfilters.initialScore((UTF8)key, result);
+                    } catch (ScoringFilterException e) {
+                        if (LOG.isWarnEnabled()) {
+                            LOG.warn("Cannot filter init score for url " + key +
+                                    ", using default: " + e.getMessage());
+                        }
+                        result.setScore(0.0f);
+                    }
+                }
+                break;
+                
+            case CrawlDatum.STATUS_FETCH_SUCCESS:         // successful fetch
+            case CrawlDatum.STATUS_FETCH_UNMODIFIED:      // successful fetch, not modified
+                // both statuses share the FetchSchedule handling below; they only
+                // differ in how the 'changed' flag is determined
+                int status = highest.getStatus();
+                result.setStatus(CrawlDatum.STATUS_DB_FETCHED);
+                result.setRetriesSinceFetch(0);
+                // apply the FetchSchedule
+                long fetchTime = result.getFetchTime();
+                long modifiedTime = result.getModifiedTime();
+                // set old times as an initial reference, if an old entry exists
+                if (old != null) {
+                    result.setFetchTime(old.getFetchTime());
+                    result.setModifiedTime(old.getModifiedTime());
+                }
+                boolean changed = true;
+                if (status == CrawlDatum.STATUS_FETCH_UNMODIFIED) {
+                    changed = false;
+                    // use the old signature, because the new one wasn't computed (no content)
+                    if (old != null) result.setSignature(old.getSignature());
+                } else {
+                    if (result.getSignature() == null) result.setSignature(signature);
+                    // don't believe the protocol layer blindly, check it here...
+                    changed = SignatureComparator._compare(
+                            (old != null) ? old.getSignature() : null, result.getSignature()) != 0;
+                }
+                FetchSchedule schedule = FetchScheduleFactory.getFetchSchedule(job);
+                result = schedule.setFetchSchedule((UTF8)key, result, fetchTime,
+                        modifiedTime, new Boolean(changed));
+                // if fetchInterval is larger than the system-wide maximum, trigger
+                // an unconditional recrawl. This prevents the page to be stuck at
+                // UNMODIFIED state, when the old fetched copy was already removed with
+                // old segments.
+                if (maxInterval < result.getFetchInterval()) {
+                    // reduce the fetch interval so that it fits within the max period
+                    result.setFetchInterval(maxInterval * 0.9f);
+                    // return to the original state of ignorance
+                    result.setStatus(CrawlDatum.STATUS_DB_UNFETCHED);
+                    result.setSignature(null);
+                    result.setModifiedTime(0);
+                }
+                break;
+                
+                
+            case CrawlDatum.STATUS_SIGNATURE:
+                if (LOG.isWarnEnabled()) {
+                    LOG.warn("Lone CrawlDatum.STATUS_SIGNATURE: " + key);
+                }
+                return;
+            case CrawlDatum.STATUS_FETCH_RETRY:           // temporary failure
+                if (old != null)
+                    result.setSignature(old.getSignature());  // use old signature
+                if (highest.getRetriesSinceFetch() < retryMax) {
+                    result.setStatus(CrawlDatum.STATUS_DB_UNFETCHED);
+                } else {
+                    result.setStatus(CrawlDatum.STATUS_DB_GONE);
+                }
+                break;
+                
+            case CrawlDatum.STATUS_FETCH_GONE:            // permanent failure
+                if (old != null) {
+                    result.setSignature(old.getSignature());  // use old signature
+                    result.setModifiedTime(old.getModifiedTime());  // use old modifiedTime
+                }
+                result.setStatus(CrawlDatum.STATUS_DB_GONE);
+                // no page is truly gone - just increase the interval by 50% and
+                // try much, much later ...
+                result.setFetchInterval(result.getFetchInterval() * 1.5f);
+                result.setFetchTime(result.getFetchTime() + Math.round(1000.0 * result.getFetchInterval()));
+                
+                break;
+                
+            default:
+                throw new RuntimeException("Unknown status: " + highest.getStatus() + " " + key);
+        }
+        
         try {
-          scfilters.initialScore((UTF8)key, result);
-        } catch (ScoringFilterException e) {
-          if (LOG.isWarnEnabled()) {
-            LOG.warn("Cannot filter init score for url " + key +
-                     ", using default: " + e.getMessage());
-          }
-          result.setScore(0.0f);
+            scfilters.updateDbScore((UTF8)key, old, result, linked);
+        } catch (Exception e) {
+            if (LOG.isWarnEnabled()) {
+                LOG.warn("Couldn't update score, key=" + key + ": " + e);
+            }
         }
-      }
-      break;
-      
-    case CrawlDatum.STATUS_FETCH_SUCCESS:         // succesful fetch
-      if (highest.getSignature() == null) result.setSignature(signature);
-      result.setStatus(CrawlDatum.STATUS_DB_FETCHED);
-      result.setNextFetchTime();
-      break;
-
-    case CrawlDatum.STATUS_SIGNATURE:
-      if (LOG.isWarnEnabled()) {
-        LOG.warn("Lone CrawlDatum.STATUS_SIGNATURE: " + key);
-      }   
-      return;
-    case CrawlDatum.STATUS_FETCH_RETRY:           // temporary failure
-      if (old != null)
-        result.setSignature(old.getSignature());  // use old signature
-      if (highest.getRetriesSinceFetch() < retryMax) {
-        result.setStatus(CrawlDatum.STATUS_DB_UNFETCHED);
-      } else {
-        result.setStatus(CrawlDatum.STATUS_DB_GONE);
-      }
-      break;
-
-    case CrawlDatum.STATUS_FETCH_GONE:            // permanent failure
-      if (old != null)
-        result.setSignature(old.getSignature());  // use old signature
-      result.setStatus(CrawlDatum.STATUS_DB_GONE);
-      break;
-
-    default:
-      throw new RuntimeException("Unknown status: " + highest.getStatus() + " " + key);
+        output.collect(key, result);
     }
-
-    try {
-      scfilters.updateDbScore((UTF8)key, old, result, linked);
-    } catch (Exception e) {
-      if (LOG.isWarnEnabled()) {
-        LOG.warn("Couldn't update score, key=" + key + ": " + e);
-      }
-    }
-    output.collect(key, result);
-  }
-
+    
 }
Index: src/java/org/apache/nutch/crawl/DefaultFetchSchedule.java
===================================================================
--- src/java/org/apache/nutch/crawl/DefaultFetchSchedule.java	(revision 0)
+++ src/java/org/apache/nutch/crawl/DefaultFetchSchedule.java	(revision 0)
@@ -0,0 +1,49 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.crawl;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.UTF8;
+import org.apache.nutch.crawl.CrawlDatum;
+
+/**
+ * This class implements the default re-fetch schedule. That is, whether
+ * the page was changed or not, the <code>fetchInterval</code> remains
+ * unchanged, and the updated page fetchTime will always be set to
+ * <code>fetchTime + fetchInterval * 1000</code>.
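+ * For example, a page with a <code>fetchInterval</code> of 86400 seconds
+ * (1 day) is always rescheduled exactly one day after each fetch, changed or not.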
+ * 
+ * @author Andrzej Bialecki
+ */
+public class DefaultFetchSchedule implements FetchSchedule {
+  Configuration conf = null;
+  
+  public Configuration getConf() {
+    return conf;
+  }
+
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+  }
+  
+  public CrawlDatum setFetchSchedule(UTF8 url, CrawlDatum datum, long fetchTime, long modifiedTime, Boolean changed) {
+    datum.setFetchTime(fetchTime + Math.round(datum.getFetchInterval() * 1000.0)); // 1000.0 (double) avoids int overflow in Math.round
+    datum.setModifiedTime(modifiedTime);
+    return datum;
+  }
+}
+
Index: src/java/org/apache/nutch/crawl/FetchSchedule.java
===================================================================
--- src/java/org/apache/nutch/crawl/FetchSchedule.java	(revision 0)
+++ src/java/org/apache/nutch/crawl/FetchSchedule.java	(revision 0)
@@ -0,0 +1,56 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.crawl;
+
+import org.apache.hadoop.conf.Configurable;
+import org.apache.hadoop.io.UTF8;
+import org.apache.nutch.crawl.CrawlDatum;
+
+/**
+ * This interface defines the contract for implementations that calculate
+ * fetch times and re-fetch intervals.
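+ * See {@link DefaultFetchSchedule} for a minimal implementation, and
+ * {@link AdaptiveFetchSchedule} for one that adapts the interval to page changes.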
+ * 
+ * @author Andrzej Bialecki
+ */
+public interface FetchSchedule extends Configurable {
+  
+  /**
+   * Sets the <code>fetchInterval</code> and <code>fetchTime</code> on a page.
+   * Implementations may use supplied arguments to support different re-fetching
+   * schedules.
+   * 
+   * @param url url of the page
+   * @param datum page description to be adjusted. NOTE: this instance, passed by reference,
+   * may be modified inside the method.
+   * @param fetchTime the time when the page was last fetched. Most FetchSchedule
+   * implementations should update the value in <code>datum</code> to this value.
+   * @param modifiedTime last time the content was modified. This information comes from
+   * the protocol implementations, or is set to &lt;= 0 if not available. Most FetchSchedule
+   * implementations should update the value in <code>datum</code> to this value.
+   * @param changed if Boolean.TRUE, then the content is considered to be "changed" before the
+   * <code>fetchTime</code>, if Boolean.FALSE then the content is unchanged. This information may be
+   * obtained by comparing page signatures before and after fetching. If null, then
+   * it is unknown at this moment; implementations are free to choose a sensible default value.
+   * @return adjusted page information, including all original information. NOTE: this may
+   * be a different instance than <code>datum</code>, but implementations should make sure that
+   * it contains all information from <code>datum</code>.
+   */
+  public abstract CrawlDatum setFetchSchedule(UTF8 url, CrawlDatum datum, long fetchTime, long modifiedTime, Boolean changed);
+}
+
Index: src/java/org/apache/nutch/crawl/FetchScheduleFactory.java
===================================================================
--- src/java/org/apache/nutch/crawl/FetchScheduleFactory.java	(revision 0)
+++ src/java/org/apache/nutch/crawl/FetchScheduleFactory.java	(revision 0)
@@ -0,0 +1,53 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.crawl;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+
+/** Creates and caches a {@link FetchSchedule} implementation. */
+public class FetchScheduleFactory {
+
+  public static final Log LOG = LogFactory.getLog(FetchScheduleFactory.class);
+
+  private FetchScheduleFactory() {}                   // no public ctor
+
+  /** Return the FetchSchedule implementation. */
+  public static FetchSchedule getFetchSchedule(Configuration conf) {
+    String clazz = conf.get("db.fetch.schedule.class", DefaultFetchSchedule.class.getName());
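+    // The implementation is chosen via the db.fetch.schedule.class property,
+    // e.g. (a sketch for nutch-site.xml, assuming this patch is applied):
+    //   <property>
+    //     <name>db.fetch.schedule.class</name>
+    //     <value>org.apache.nutch.crawl.AdaptiveFetchSchedule</value>
+    //   </property>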
+    FetchSchedule impl = (FetchSchedule)conf.getObject(clazz);
+    if (impl == null) {
+      try {
+        LOG.info("Using FetchSchedule impl: " + clazz);
+        Class implClass = Class.forName(clazz);
+        impl = (FetchSchedule)implClass.newInstance();
+        impl.setConf(conf);
+        conf.setObject(clazz, impl);
+      } catch (Exception e) {
+        throw new RuntimeException("Couldn't create " + clazz, e);
+      }
+    }
+    return impl;
+  }
+}
Index: src/java/org/apache/nutch/crawl/Generator.java
===================================================================
--- src/java/org/apache/nutch/crawl/Generator.java	(revision 492176)
+++ src/java/org/apache/nutch/crawl/Generator.java	(working copy)
@@ -71,6 +71,7 @@
     private long curTime;
     private long limit;
     private long count;
+    private float maxInterval;  // seconds; assumed to be initialized from db.max.fetch.interval in configure()
     private HashMap hostCounts = new HashMap();
     private int maxPerHost;
     private Partitioner hostPartitioner = new PartitionUrlByHost();
@@ -108,8 +109,19 @@
       }
       CrawlDatum crawlDatum = (CrawlDatum)value;
 
-      if (crawlDatum.getStatus() == CrawlDatum.STATUS_DB_GONE)
-        return;                                   // don't retry
+      /* No page is truly gone - we handle this by increasing fetchInterval
+      if (crawlDatum.getStatus() == CrawlDatum.STATUS_DB_GONE)
+        return;                                   // don't retry
+      */
+
+      if (crawlDatum.getStatus() != CrawlDatum.STATUS_DB_GONE) {
+        // force re-crawl of very old pages (maxInterval is seconds, fetch times are ms)
+        if (crawlDatum.getFetchTime() - curTime > maxInterval * 1000L) {
+          // shave off 10% so that we catch it for sure
+          crawlDatum.setFetchInterval(maxInterval * 0.9f);
+          crawlDatum.setFetchTime(curTime);
+        }
+      }
 
       if (crawlDatum.getFetchTime() > curTime)
         return;                                   // not time yet
Index: src/java/org/apache/nutch/crawl/Injector.java
===================================================================
--- src/java/org/apache/nutch/crawl/Injector.java	(revision 492176)
+++ src/java/org/apache/nutch/crawl/Injector.java	(working copy)
@@ -44,7 +44,7 @@
   public static class InjectMapper implements Mapper {
     private UrlNormalizer urlNormalizer;
     private float interval;
-    private float scoreInjected;
+    private float initialScore;
     private JobConf jobConf;
     private URLFilters filters;
     private ScoringFilters scfilters; 
@@ -55,7 +56,9 @@
-      interval = jobConf.getFloat("db.default.fetch.interval", 30f);
       filters = new URLFilters(jobConf);
       scfilters = new ScoringFilters(jobConf);
-      scoreInjected = jobConf.getFloat("db.score.injected", 1.0f);
+      // NOTE: the interval is now expressed in seconds (default: 30 days);
+      // db.default.fetch.interval in the config must use seconds as well
+      interval = jobConf.getFloat("db.default.fetch.interval", 30.0f * 3600.0f * 24.0f);
+      initialScore = jobConf.getFloat("db.score.injected", 1.0f);
     }
 
     public void close() {}
@@ -76,7 +79,7 @@
       if (url != null) {                          // if it passes
         value.set(url);                           // collect it
         CrawlDatum datum = new CrawlDatum(CrawlDatum.STATUS_DB_UNFETCHED, interval);
-        datum.setScore(scoreInjected);
+        datum.setScore(initialScore);
         try {
           scfilters.injectedScore(value, datum);
         } catch (ScoringFilterException e) {
@@ -84,7 +87,7 @@
             LOG.warn("Cannot filter injected score for url " + url +
                      ", using default (" + e.getMessage() + ")");
           }
-          datum.setScore(scoreInjected);
+          datum.setScore(initialScore);
         }
         output.collect(value, datum);
       }
Index: src/java/org/apache/nutch/indexer/IndexingFilters.java
===================================================================
--- src/java/org/apache/nutch/indexer/IndexingFilters.java	(revision 492176)
+++ src/java/org/apache/nutch/indexer/IndexingFilters.java	(working copy)
@@ -33,14 +33,22 @@
 
 /** Creates and caches {@link IndexingFilter} implementing plugins.*/
 public class IndexingFilters {
-
-  public final static Log LOG = LogFactory.getLog(IndexingFilters.class);
-
-  private IndexingFilter[] indexingFilters;
-
-  public IndexingFilters(Configuration conf) {
-      this.indexingFilters =(IndexingFilter[]) conf.getObject(IndexingFilter.class.getName()); 
-      if (this.indexingFilters == null) {
+    
+    public final static Log LOG = LogFactory.getLog(IndexingFilters.class);
+    
+    private IndexingFilter[] indexingFilters;
+    
+    public IndexingFilters(Configuration conf) {
+        this.indexingFilters =(IndexingFilter[]) conf.getObject(IndexingFilter.class.getName());
+        /* Get indexingfilter.order property */
+        String order = conf.get("indexingfilter.order");
+        if (this.indexingFilters == null) {
+            
+            /* If ordered filters are required, prepare array of filters based on property */
+            String[] orderedFilters = null;
+            if (order != null && !order.trim().equals("")) {
+                orderedFilters = order.split("\\s+");
+            }
             try {
                 ExtensionPoint point = PluginRepository.get(conf).getExtensionPoint(IndexingFilter.X_POINT_ID);
                 if (point == null)
@@ -51,27 +59,38 @@
                     Extension extension = extensions[i];
                     IndexingFilter filter = (IndexingFilter) extension.getExtensionInstance();
                     if (LOG.isInfoEnabled()) {
-                      LOG.info("Adding " + filter.getClass().getName());
+                        LOG.info("Adding " + filter.getClass().getName());
                     }
                     if (!filterMap.containsKey(filter.getClass().getName())) {
                         filterMap.put(filter.getClass().getName(), filter);
                     }
                 }
-                conf.setObject(IndexingFilter.class.getName(), (IndexingFilter[]) filterMap.values().toArray(new IndexingFilter[0]));
+                /* If no order is specified, register the filters in an indeterminate order */
+                if (orderedFilters == null) {
+                    conf.setObject(IndexingFilter.class.getName(), (IndexingFilter[]) filterMap.values().toArray(new IndexingFilter[0]));
+                } else {
+                    /* Otherwise register them in the required order; every class name listed
+                       in indexingfilter.order is expected to be an activated filter plugin */
+                    IndexingFilter[] filter = new IndexingFilter[orderedFilters.length];
+                    for (int i = 0; i < orderedFilters.length; i++) {
+                        filter[i] = (IndexingFilter) filterMap.get(orderedFilters[i]);
+                    }
+                    conf.setObject(IndexingFilter.class.getName(), filter);
+                }
             } catch (PluginRuntimeException e) {
-                throw new RuntimeException(e);
-            }
+                throw new RuntimeException(e);
+            }
             this.indexingFilters =(IndexingFilter[]) conf.getObject(IndexingFilter.class.getName());
         }
-  }                  
-
-  /** Run all defined filters. */
-  public Document filter(Document doc, Parse parse, UTF8 url, CrawlDatum datum, Inlinks inlinks)
+    }
+    
+    /** Run all defined filters. */
+    public Document filter(Document doc, Parse parse, UTF8 url, CrawlDatum datum, Inlinks inlinks)
     throws IndexingException {
-    for (int i = 0; i < this.indexingFilters.length; i++) {
-      doc = this.indexingFilters[i].filter(doc, parse, url, datum, inlinks);
+        for (int i = 0; i < this.indexingFilters.length; i++) {
+            doc = this.indexingFilters[i].filter(doc, parse, url, datum, inlinks);
+        }
+        
+        return doc;
     }
-
-    return doc;
-  }
 }
Index: src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java
===================================================================
--- src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java	(revision 492176)
+++ src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java	(working copy)
@@ -43,124 +43,140 @@
  * @author John Xing
  ***********************************/
 public class File implements Protocol {
-
-  public static final Log LOG = LogFactory.getLog(File.class);
-
-  static final int MAX_REDIRECTS = 5;
-
-  int maxContentLength;
-
-  // 20040412, xing
-  // the following three: HttpDateFormat, MimetypesFileTypeMap, MagicFile
-  // are placed in each thread before we check out if they're thread-safe.
-
-  // http date format
-  HttpDateFormat httpDateFormat = null;
-
-  private Configuration conf;
-
-  // constructor
-  public File() {
-    this.httpDateFormat = new HttpDateFormat();
-  }
-
-  /** Set the point at which content is truncated. */
-  public void setMaxContentLength(int length) {maxContentLength = length;}
-
-  public ProtocolOutput getProtocolOutput(UTF8 url, CrawlDatum datum) {
-    String urlString = url.toString();
-    try {
-      URL u = new URL(urlString);
-  
-      int redirects = 0;
-  
-      while (true) {
-        FileResponse response;
-        response = new FileResponse(u, datum, this, getConf());   // make a request
-  
-        int code = response.getCode();
-  
-        if (code == 200) {                          // got a good response
-          return new ProtocolOutput(response.toContent());              // return it
-  
-        } else if (code >= 300 && code < 400) {     // handle redirect
-          if (redirects == MAX_REDIRECTS)
-            throw new FileException("Too many redirects: " + url);
-          u = new URL(response.getHeader("Location"));
-          redirects++;                
-          if (LOG.isTraceEnabled()) {
-            LOG.trace("redirect to " + u); 
-          }
-  
-        } else {                                    // convert to exception
-          throw new FileError(code);
+    
+    public static final Log LOG = LogFactory.getLog(File.class);
+    
+    static final int MAX_REDIRECTS = 5;
+    
+    int maxContentLength;
+    
+    // 20040412, xing
+    // the following three: HttpDateFormat, MimetypesFileTypeMap, MagicFile
+    // are placed in each thread before we check out if they're thread-safe.
+    
+    // http date format
+    HttpDateFormat httpDateFormat = null;
+    
+    private Configuration conf;
+    
+    // constructor
+    public File() {
+        this.httpDateFormat = new HttpDateFormat();
+    }
+    
+    /** Set the point at which content is truncated. */
+    public void setMaxContentLength(int length) {maxContentLength = length;}
+    
+    public ProtocolOutput getProtocolOutput(UTF8 url, CrawlDatum datum) {
+        String urlString = url.toString();
+        ProtocolOutput res = null;
+        
+        try {
+            URL u = new URL(urlString);
+            
+//      int redirects = 0;
+            FileResponse response;
+            response = new FileResponse(u, datum, this, getConf());   // make a request
+            int code = response.getCode();
+            Content content = response.toContent();
+            // we don't know yet
+            long lastModified = 0L;
+            String modified = response.getHeader("Last-Modified");
+            if (modified != null) lastModified = HttpDateFormat.toLong(modified);
+            
+            switch (code) {
+                case 200:                          // got a good response
+                    res = new ProtocolOutput(content,
+                            new ProtocolStatus(ProtocolStatus.SUCCESS, lastModified));              // return it
+                    break;
+                case 304:                         // not modified
+                    res = new ProtocolOutput(content,
+                            new ProtocolStatus(ProtocolStatus.NOTMODIFIED, lastModified));              // return it
+                    break;
+                case 300:                         // redirect
+                    u = new URL(u, response.getHeader("Location"));
+                    res = new ProtocolOutput(content,
+                            new ProtocolStatus(ProtocolStatus.MOVED, u));
+                    break;
+                case 401:                         // not authorized
+                    res = new ProtocolOutput(content,
+                            new ProtocolStatus(ProtocolStatus.ACCESS_DENIED, "Authentication required: "
+                            + u.toString()));
+                    break;
+                case 404:                         // not found
+                    res = new ProtocolOutput(content,
+                            new ProtocolStatus(ProtocolStatus.NOTFOUND, u));
+                    break;
+                default:                                    // convert to exception
+                    throw new FileError(code);
+            }
+            
+        } catch (Exception e) {
+            return new ProtocolOutput(null, new ProtocolStatus(e));
         }
-      } 
-    } catch (Exception e) {
-      return new ProtocolOutput(null, new ProtocolStatus(e));
+        return res;
     }
-  }
-
+    
 //  protected void finalize () {
 //    // nothing here
 //  }
-
-  /** For debugging. */
-  public static void main(String[] args) throws Exception {
-    int maxContentLength = Integer.MIN_VALUE;
-    String logLevel = "info";
-    boolean dumpContent = false;
-    String urlString = null;
-
-    String usage = "Usage: File [-logLevel level] [-maxContentLength L] [-dumpContent] url";
-
-    if (args.length == 0) {
-      System.err.println(usage);
-      System.exit(-1);
+    
+    /** For debugging. */
+    public static void main(String[] args) throws Exception {
+        int maxContentLength = Integer.MIN_VALUE;
+        String logLevel = "info";
+        boolean dumpContent = false;
+        String urlString = null;
+        
+        String usage = "Usage: File [-logLevel level] [-maxContentLength L] [-dumpContent] url";
+        
+        if (args.length == 0) {
+            System.err.println(usage);
+            System.exit(-1);
+        }
+        
+        for (int i = 0; i < args.length; i++) {
+            if (args[i].equals("-logLevel")) {
+                logLevel = args[++i];
+            } else if (args[i].equals("-maxContentLength")) {
+                maxContentLength = Integer.parseInt(args[++i]);
+            } else if (args[i].equals("-dumpContent")) {
+                dumpContent = true;
+            } else if (i != args.length-1) {
+                System.err.println(usage);
+                System.exit(-1);
+            } else
+                urlString = args[i];
+        }
+        
+        File file = new File();
+        
+        if (maxContentLength != Integer.MIN_VALUE) // set maxContentLength
+            file.setMaxContentLength(maxContentLength);
+        
+        // set log level
+        //LOG.setLevel(Level.parse((new String(logLevel)).toUpperCase()));
+        
+        Content content = file.getProtocolOutput(new UTF8(urlString), new CrawlDatum()).getContent();
+        
+        System.err.println("Content-Type: " + content.getContentType());
+        System.err.println("Content-Length: " +
+                content.getMetadata().get(Response.CONTENT_LENGTH));
+        System.err.println("Last-Modified: " +
+                content.getMetadata().get(Response.LAST_MODIFIED));
+        if (dumpContent) {
+            System.out.print(new String(content.getContent()));
+        }
+        
+        file = null;
     }
-      
-    for (int i = 0; i < args.length; i++) {
-      if (args[i].equals("-logLevel")) {
-        logLevel = args[++i];
-      } else if (args[i].equals("-maxContentLength")) {
-        maxContentLength = Integer.parseInt(args[++i]);
-      } else if (args[i].equals("-dumpContent")) {
-        dumpContent = true;
-      } else if (i != args.length-1) {
-        System.err.println(usage);
-        System.exit(-1);
-      } else
-        urlString = args[i];
+    
+    public void setConf(Configuration conf) {
+        this.conf = conf;
+        this.maxContentLength = conf.getInt("file.content.limit", 64 * 1024);
     }
-
-    File file = new File();
-
-    if (maxContentLength != Integer.MIN_VALUE) // set maxContentLength
-      file.setMaxContentLength(maxContentLength);
-
-    // set log level
-    //LOG.setLevel(Level.parse((new String(logLevel)).toUpperCase()));
-
-    Content content = file.getProtocolOutput(new UTF8(urlString), new CrawlDatum()).getContent();
-
-    System.err.println("Content-Type: " + content.getContentType());
-    System.err.println("Content-Length: " +
-                       content.getMetadata().get(Response.CONTENT_LENGTH));
-    System.err.println("Last-Modified: " +
-                       content.getMetadata().get(Response.LAST_MODIFIED));
-    if (dumpContent) {
-      System.out.print(new String(content.getContent()));
+    
+    public Configuration getConf() {
+        return this.conf;
     }
-
-    file = null;
-  }
-
-  public void setConf(Configuration conf) {
-    this.conf = conf;
-    this.maxContentLength = conf.getInt("file.content.limit", 64 * 1024);
-  }
-
-  public Configuration getConf() {
-    return this.conf;
-  }
 }
Index: src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java
===================================================================
--- src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java	(revision 492176)
+++ src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java	(working copy)
@@ -18,14 +18,13 @@
 
 // JDK imports
 import java.net.URL;
-import java.util.Date;
-import java.util.TreeMap;
 import java.io.IOException;
 
 // Nutch imports
 import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.protocol.Content;
 import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.net.protocols.HttpDateFormat;
 import org.apache.nutch.net.protocols.Response;
 
 // Hadoop imports
@@ -80,9 +79,9 @@
   public byte[] getContent() { return content; }
 
   public Content toContent() {
-    return new Content(orig, base, content,
-                       getHeader(Response.CONTENT_TYPE),
-                       headers, this.conf);
+    return new Content(orig, base, (content != null) ? content : new byte[0],
+                        getHeader(Response.CONTENT_TYPE),
+                        headers, this.conf);
   }
   
   public FileResponse(URL url, CrawlDatum datum, File file, Configuration conf)
@@ -138,6 +137,12 @@
         return;
       }
 
+      // report 304 if the file was not modified since the recorded modification time
+      if (f.lastModified() <= datum.getModifiedTime()) {
+        this.code = 304;
+        this.headers.set("Last-Modified", HttpDateFormat.toString(f.lastModified()));
+        return;
+      }
       if (f.isDirectory()) {
         getDirAsHttpResponse(f);
       } else if (f.isFile()) {
@@ -191,7 +197,7 @@
     // set headers
     headers.set(Response.CONTENT_LENGTH, new Long(size).toString());
     headers.set(Response.LAST_MODIFIED,
-      this.file.httpDateFormat.toString(f.lastModified()));
+      HttpDateFormat.toString(f.lastModified()));
     headers.set(Response.CONTENT_TYPE, "");   // No Content-Type at file protocol level
 
     // response code
@@ -203,14 +209,15 @@
     throws IOException {
 
     String path = f.toString();
-    this.content = list2html(f.listFiles(), path, "/".equals(path) ? false : true);
+    // never include a parent-directory ("..") link in generated listings
+    this.content = list2html(f.listFiles(), path, false);
 
     // set headers
     headers.set(Response.CONTENT_LENGTH,
       new Integer(this.content.length).toString());
     headers.set(Response.CONTENT_TYPE, "text/html");
     headers.set(Response.LAST_MODIFIED,
-      this.file.httpDateFormat.toString(f.lastModified()));
+       HttpDateFormat.toString(f.lastModified()));
 
     // response code
     this.code = 200; // http OK
@@ -234,12 +241,8 @@
     for (int i=0; i<list.length; i++) {
       f = list[i];
       String name = f.getName();
-      String time = this.file.httpDateFormat.toString(f.lastModified());
+      String time = HttpDateFormat.toString(f.lastModified());
       if (f.isDirectory()) {
-        // java 1.4.2 api says dir itself and parent dir are not listed
-        // so the following is not needed.
-        //if (name.equals(".") || name.equals(".."))
-        //  continue;
         x.append("<a href='"+name+"/"+"'>"+name+"/</a>\t");
         x.append(time+"\t-\n");
       } else if (f.isFile()) {
Index: src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java
===================================================================
--- src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java	(revision 492176)
+++ src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java	(working copy)
@@ -46,195 +46,210 @@
  * @author John Xing
  ***********************************/
 public class Ftp implements Protocol {
-
-  public static final Log LOG = LogFactory.getLog(Ftp.class);
-
-  static final int BUFFER_SIZE = 16384; // 16*1024 = 16384
-
-  static final int MAX_REDIRECTS = 5;
-
-  int timeout;
-
-  int maxContentLength;
-
-  String userName;
-  String passWord; 
-
-  // typical/default server timeout is 120*1000 millisec.
-  // better be conservative here
-  int serverTimeout;
-
-  // when to have client start anew
-  long renewalTime = -1;
-
-  boolean keepConnection;
-
-  boolean followTalk;
-
-  // ftp client
-  Client client = null;
-  // ftp dir list entry parser
-  FTPFileEntryParser parser = null;
-
-  // 20040412, xing
-  // the following three: HttpDateFormat, MimetypesFileTypeMap, MagicFile
-  // are placed in each thread before we check out if they're thread-safe.
-
-  // http date format
-  HttpDateFormat httpDateFormat = null;
-
-  private Configuration conf;
-
-
-  // constructor
-  public Ftp() {
-    this.httpDateFormat = new HttpDateFormat();
-  }
-
-  /** Set the timeout. */
-  public void setTimeout(int to) {
-    timeout = to;
-  }
-
-  /** Set the point at which content is truncated. */
-  public void setMaxContentLength(int length) {
-    maxContentLength = length;
-  }
-
-  /** Set followTalk */
-  public void setFollowTalk(boolean followTalk) {
-    this.followTalk = followTalk;
-  }
-
-  /** Set keepConnection */
-  public void setKeepConnection(boolean keepConnection) {
-    this.keepConnection = keepConnection;
-  }
-
-  public ProtocolOutput getProtocolOutput(UTF8 url, CrawlDatum datum) {
-    String urlString = url.toString();
-    try {
-      URL u = new URL(urlString);
-  
-      int redirects = 0;
-  
-      while (true) {
-        FtpResponse response;
-        response = new FtpResponse(u, datum, this, getConf());   // make a request
-  
-        int code = response.getCode();
-  
-        if (code == 200) {                          // got a good response
-          return new ProtocolOutput(response.toContent());              // return it
-  
-        } else if (code >= 300 && code < 400) {     // handle redirect
-          if (redirects == MAX_REDIRECTS)
-            throw new FtpException("Too many redirects: " + url);
-          u = new URL(response.getHeader("Location"));
-          redirects++;                
-          if (LOG.isTraceEnabled()) {
-            LOG.trace("redirect to " + u); 
-          }
-        } else {                                    // convert to exception
-          throw new FtpError(code);
+    
+    public static final Log LOG = LogFactory.getLog(Ftp.class);
+    
+    static final int BUFFER_SIZE = 16384; // 16*1024 = 16384
+    
+    static final int MAX_REDIRECTS = 5;
+    
+    int timeout;
+    
+    int maxContentLength;
+    
+    String userName;
+    String passWord;
+    
+    // typical/default server timeout is 120*1000 millisec.
+    // better be conservative here
+    int serverTimeout;
+    
+    // when to have client start anew
+    long renewalTime = -1;
+    
+    boolean keepConnection;
+    
+    boolean followTalk;
+    
+    // ftp client
+    Client client = null;
+    // ftp dir list entry parser
+    FTPFileEntryParser parser = null;
+    
+    // 20040412, xing
+    // the following three: HttpDateFormat, MimetypesFileTypeMap, MagicFile
+    // are placed in each thread before we check out if they're thread-safe.
+    
+    // http date format
+    HttpDateFormat httpDateFormat = null;
+    
+    private Configuration conf;
+    
+    
+    // constructor
+    public Ftp() {
+        this.httpDateFormat = new HttpDateFormat();
+    }
+    
+    /** Set the timeout. */
+    public void setTimeout(int to) {
+        timeout = to;
+    }
+    
+    /** Set the point at which content is truncated. */
+    public void setMaxContentLength(int length) {
+        maxContentLength = length;
+    }
+    
+    /** Set followTalk */
+    public void setFollowTalk(boolean followTalk) {
+        this.followTalk = followTalk;
+    }
+    
+    /** Set keepConnection */
+    public void setKeepConnection(boolean keepConnection) {
+        this.keepConnection = keepConnection;
+    }
+    
+    public ProtocolOutput getProtocolOutput(UTF8 url, CrawlDatum datum) {
+        String urlString = url.toString();
+        ProtocolOutput res = null;
+        try {
+            URL u = new URL(urlString);
+            
+            FtpResponse response;
+            response = new FtpResponse(u, datum, this, getConf());   // make a request
+            
+            int code = response.getCode();
+            Content c = response.toContent();
+            // we don't know yet
+            long lastModified = 0L;
+            String modified = response.getHeader("Last-Modified");
+            if (modified != null) lastModified = HttpDateFormat.toLong(modified);
+            
+            switch (code) {
+                case 200:                          // got a good response
+                    res = new ProtocolOutput(c,
+                            new ProtocolStatus(ProtocolStatus.SUCCESS, lastModified));
+                    break;
+                case 300:
+                    u = new URL(u, response.getHeader("Location"));
+                    res = new ProtocolOutput(c,
+                            new ProtocolStatus(ProtocolStatus.MOVED, u));
+                    break;
+                case 400:   // bad request
+                    LOG.info("400 Bad request: " + urlString);
+                    res = new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.GONE, u));
+                    break;
+                case 401:   // auth required
+                    LOG.info("401 Authentication Required");
+                    res = new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.ACCESS_DENIED, "Authentication required: "
+                            + urlString));
+                    break;
+                case 404:
+                    res = new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.NOTFOUND, u));
+                    break;
+                default:
+                    throw new FtpError(code);
+            }
+            
+        } catch (Exception e) {
+            return new ProtocolOutput(null, new ProtocolStatus(e));
         }
-      } 
-    } catch (Exception e) {
-      return new ProtocolOutput(null, new ProtocolStatus(e));
+        return res;
     }
-  }
-
-  protected void finalize () {
-    try {
-      if (this.client != null && this.client.isConnected()) {
-        this.client.logout();
-        this.client.disconnect();
-      }
-    } catch (IOException e) {
-      // do nothing
+    
+    protected void finalize() {
+        try {
+            if (this.client != null && this.client.isConnected()) {
+                this.client.logout();
+                this.client.disconnect();
+            }
+        } catch (IOException e) {
+            // do nothing
+        }
     }
-  }
-
-  /** For debugging. */
-  public static void main(String[] args) throws Exception {
-    int timeout = Integer.MIN_VALUE;
-    int maxContentLength = Integer.MIN_VALUE;
-    String logLevel = "info";
-    boolean followTalk = false;
-    boolean keepConnection = false;
-    boolean dumpContent = false;
-    String urlString = null;
-
-    String usage = "Usage: Ftp [-logLevel level] [-followTalk] [-keepConnection] [-timeout N] [-maxContentLength L] [-dumpContent] url";
-
-    if (args.length == 0) {
-      System.err.println(usage);
-      System.exit(-1);
+    
+    /** For debugging. */
+    public static void main(String[] args) throws Exception {
+        int timeout = Integer.MIN_VALUE;
+        int maxContentLength = Integer.MIN_VALUE;
+        String logLevel = "info";
+        boolean followTalk = false;
+        boolean keepConnection = false;
+        boolean dumpContent = false;
+        String urlString = null;
+        
+        String usage = "Usage: Ftp [-logLevel level] [-followTalk] [-keepConnection] [-timeout N] [-maxContentLength L] [-dumpContent] url";
+        
+        if (args.length == 0) {
+            System.err.println(usage);
+            System.exit(-1);
+        }
+        
+        for (int i = 0; i < args.length; i++) {
+            if (args[i].equals("-logLevel")) {
+                logLevel = args[++i];
+            } else if (args[i].equals("-followTalk")) {
+                followTalk = true;
+            } else if (args[i].equals("-keepConnection")) {
+                keepConnection = true;
+            } else if (args[i].equals("-timeout")) {
+                timeout = Integer.parseInt(args[++i]) * 1000;
+            } else if (args[i].equals("-maxContentLength")) {
+                maxContentLength = Integer.parseInt(args[++i]);
+            } else if (args[i].equals("-dumpContent")) {
+                dumpContent = true;
+            } else if (i != args.length-1) {
+                System.err.println(usage);
+                System.exit(-1);
+            } else {
+                urlString = args[i];
+            }
+        }
+        
+        Ftp ftp = new Ftp();
+        // getProtocolOutput() hands getConf() to FtpResponse, so a
+        // Configuration must be set before fetching (assumes the
+        // org.apache.nutch.util.NutchConfiguration import is present)
+        ftp.setConf(NutchConfiguration.create());
+        
+        ftp.setFollowTalk(followTalk);
+        ftp.setKeepConnection(keepConnection);
+        
+        if (timeout != Integer.MIN_VALUE) // set timeout
+            ftp.setTimeout(timeout);
+        
+        if (maxContentLength != Integer.MIN_VALUE) // set maxContentLength
+            ftp.setMaxContentLength(maxContentLength);
+        
+        // set log level (left disabled: LOG is a commons-logging Log and has
+        // no setLevel, so the -logLevel option is parsed but currently unused)
+        //LOG.setLevel(Level.parse((new String(logLevel)).toUpperCase()));
+        
+        Content content = ftp.getProtocolOutput(new UTF8(urlString), new CrawlDatum()).getContent();
+        
+        System.err.println("Content-Type: " + content.getContentType());
+        System.err.println("Content-Length: " +
+                content.getMetadata().get(Response.CONTENT_LENGTH));
+        System.err.println("Last-Modified: " +
+                content.getMetadata().get(Response.LAST_MODIFIED));
+        if (dumpContent) {
+            System.out.print(new String(content.getContent()));
+        }
+        
+        ftp = null;
     }
-      
-    for (int i = 0; i < args.length; i++) {
-      if (args[i].equals("-logLevel")) {
-        logLevel = args[++i];
-      } else if (args[i].equals("-followTalk")) {
-        followTalk = true;
-      } else if (args[i].equals("-keepConnection")) {
-        keepConnection = true;
-      } else if (args[i].equals("-timeout")) {
-        timeout = Integer.parseInt(args[++i]) * 1000;
-      } else if (args[i].equals("-maxContentLength")) {
-        maxContentLength = Integer.parseInt(args[++i]);
-      } else if (args[i].equals("-dumpContent")) {
-        dumpContent = true;
-      } else if (i != args.length-1) {
-        System.err.println(usage);
-        System.exit(-1);
-      } else {
-        urlString = args[i];
-      }
+    
+    
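+    /**
+     * Reads the plugin settings: ftp.content.limit (default 64KB),
+     * ftp.timeout (10s), ftp.username/ftp.password, ftp.server.timeout (60s),
+     * ftp.keep.connection and ftp.follow.talk (both false by default).
+     */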
+    public void setConf(Configuration conf) {
+        this.conf = conf;
+        this.maxContentLength = conf.getInt("ftp.content.limit", 64 * 1024);
+        this.timeout = conf.getInt("ftp.timeout", 10000);
+        this.userName = conf.get("ftp.username", "anonymous");
+        this.passWord = conf.get("ftp.password", "anonymous@example.com");
+        this.serverTimeout = conf.getInt("ftp.server.timeout", 60 * 1000);
+        this.keepConnection = conf.getBoolean("ftp.keep.connection", false);
+        this.followTalk = conf.getBoolean("ftp.follow.talk", false);
     }
-
-    Ftp ftp = new Ftp();
-
-    ftp.setFollowTalk(followTalk);
-    ftp.setKeepConnection(keepConnection);
-
-    if (timeout != Integer.MIN_VALUE) // set timeout
-      ftp.setTimeout(timeout);
-
-    if (maxContentLength != Integer.MIN_VALUE) // set maxContentLength
-      ftp.setMaxContentLength(maxContentLength);
-
-    // set log level
-    //LOG.setLevel(Level.parse((new String(logLevel)).toUpperCase()));
-
-    Content content = ftp.getProtocolOutput(new UTF8(urlString), new CrawlDatum()).getContent();
-
-    System.err.println("Content-Type: " + content.getContentType());
-    System.err.println("Content-Length: " +
-                       content.getMetadata().get(Response.CONTENT_LENGTH));
-    System.err.println("Last-Modified: " +
-                      content.getMetadata().get(Response.LAST_MODIFIED));
-    if (dumpContent) {
-      System.out.print(new String(content.getContent()));
+    
+    public Configuration getConf() {
+        return this.conf;
     }
-
-    ftp = null;
-  }
-
-  
-  public void setConf(Configuration conf) {
-    this.conf = conf;
-    this.maxContentLength = conf.getInt("ftp.content.limit", 64 * 1024);
-    this.timeout = conf.getInt("ftp.timeout", 10000);
-    this.userName = conf.get("ftp.username", "anonymous");
-    this.passWord = conf.get("ftp.password", "anonymous@example.com");
-    this.serverTimeout = conf.getInt("ftp.server.timeout", 60 * 1000);
-    this.keepConnection = conf.getBoolean("ftp.keep.connection", false);
-    this.followTalk = conf.getBoolean("ftp.follow.talk", false);
-  }
-
-  public Configuration getConf() {
-    return this.conf;
-  }
-
+    
 }
Index: src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java
===================================================================
--- src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java	(revision 492176)
+++ src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java	(working copy)
@@ -20,6 +20,7 @@
 import org.apache.commons.net.ftp.FTP;
 import org.apache.commons.net.ftp.FTPFile;
 import org.apache.commons.net.ftp.FTPReply;
+import org.apache.nutch.net.protocols.HttpDateFormat;
 
 import org.apache.commons.net.ftp.parser.DefaultFTPFileEntryParserFactory;
 import org.apache.commons.net.ftp.parser.ParserInitializationException;
@@ -77,9 +78,11 @@
   public byte[] getContent() { return content; }
 
   public Content toContent() {
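+    // content may be null when the fetch was skipped (e.g. the 304
+    // not-modified path), so substitute an empty body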
-    return new Content(orig, base, content,
-                       getHeader(Response.CONTENT_TYPE),
-                       headers, this.conf);
+    return new Content(orig, base, (content != null) ? content : new byte[0],
+                       getHeader(Response.CONTENT_TYPE),
+                       headers, this.conf);
   }
 
   public FtpResponse(URL url, CrawlDatum datum, Ftp ftp, Configuration conf)
@@ -256,9 +257,9 @@
       this.content = null;
 
       if (path.endsWith("/")) {
-        getDirAsHttpResponse(path);
+        getDirAsHttpResponse(path, datum.getModifiedTime());
       } else {
-        getFileAsHttpResponse(path);
+        getFileAsHttpResponse(path, datum.getModifiedTime());
       }
 
       // reset next renewalTime, take the lesser
@@ -267,7 +268,7 @@
           + ((ftp.timeout<ftp.serverTimeout) ? ftp.timeout : ftp.serverTimeout);
         if ((ftp.followTalk) && (Ftp.LOG.isInfoEnabled())) {
           Ftp.LOG.info("reset renewalTime to "
-            +ftp.httpDateFormat.toString(ftp.renewalTime));
+            +HttpDateFormat.toString(ftp.renewalTime));
         }
       }
 
@@ -307,8 +308,8 @@
   }
 
   // get ftp file as http response
-  private void getFileAsHttpResponse(String path)
-    throws IOException {
+  private void getFileAsHttpResponse(String path, long lastModified)
+    throws IOException {
 
     ByteArrayOutputStream os = null;
     List list = null;
@@ -316,17 +317,26 @@
     try {
       // first get its possible attributes
       list = new LinkedList();
-      ftp.client.retrieveList(path, list, ftp.maxContentLength, ftp.parser);
+      ftp.client.retrieveList(path, list, ftp.maxContentLength, ftp.parser);
 
-      os = new ByteArrayOutputStream(ftp.BUFFER_SIZE);
-      ftp.client.retrieveFile(path, os, ftp.maxContentLength);
-
       FTPFile ftpFile = (FTPFile) list.get(0);
       this.headers.set(Response.CONTENT_LENGTH,
                        new Long(ftpFile.getSize()).toString());
       //this.headers.put("content-type", "text/html");
       this.headers.set(Response.LAST_MODIFIED,
-                       ftp.httpDateFormat.toString(ftpFile.getTimestamp()));
+                       HttpDateFormat.toString(ftpFile.getTimestamp()));
+      
+      // conditional fetch: skip the transfer and report 304 if unchanged since lastModified.
+      if (ftpFile.getTimestamp().getTimeInMillis() <= lastModified) {
+        code = 304;
+        return;
+      }
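+      // (the LIST timestamp stands in for Last-Modified here, giving FTP an
+      // equivalent of HTTP's If-Modified-Since handling)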
+
+      os = new ByteArrayOutputStream(ftp.BUFFER_SIZE);
+      ftp.client.retrieveFile(path, os, ftp.maxContentLength);
+
       this.content = os.toByteArray();
 
 //      // approximate bytes sent and read
@@ -335,8 +343,11 @@
 //        this.httpAccounting.incrementBytesRead(this.content.length);
 //      }
 
-      this.code = 200; // http OK
+      // the not-modified case already returned above with code 304, so
+      // reaching this point always means a successful transfer
+      this.code = 200; // http OK
 
+
     } catch (FtpExceptionControlClosedByForcedDataClose e) {
 
       // control connection is off, clean up
@@ -403,7 +414,7 @@
   }
 
   // get ftp dir list as http response
-  private void getDirAsHttpResponse(String path)
+  private void getDirAsHttpResponse(String path, long lastModified)
     throws IOException {
     List list = new LinkedList();
 
@@ -487,7 +498,7 @@
     for (int i=0; i<list.size(); i++) {
       FTPFile f = (FTPFile) list.get(i);
       String name = f.getName();
-      String time = ftp.httpDateFormat.toString(f.getTimestamp());
+      String time = HttpDateFormat.toString(f.getTimestamp());
       if (f.isDirectory()) {
         // some ftp server LIST "." and "..", we skip them here
         if (name.equals(".") || name.equals(".."))
Index: src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
===================================================================
--- src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java	(revision 492176)
+++ src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java	(working copy)
@@ -30,6 +30,7 @@
 // Nutch imports
 import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.net.protocols.HttpDateFormat;
 import org.apache.nutch.net.protocols.Response;
 import org.apache.nutch.protocol.ProtocolException;
 import org.apache.nutch.protocol.http.api.HttpBase;
@@ -109,6 +110,12 @@
       reqStr.append(host);
       reqStr.append(portString);
       reqStr.append("\r\n");
+      if (datum.getModifiedTime() > 0) {
+        reqStr.append("If-Modified-Since: " +
+                HttpDateFormat.toString(datum.getModifiedTime()) + "\r\n");
+      }
 
       reqStr.append("Accept-Encoding: x-gzip, gzip\r\n");
 
Index: src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
===================================================================
--- src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java	(revision 492176)
+++ src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java	(working copy)
@@ -17,11 +17,11 @@
 
 // JDK imports
 import java.io.ByteArrayOutputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.net.URL;
-import java.util.Date;
+import org.apache.nutch.net.protocols.HttpDateFormat;
 
 // Commons Logging imports
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
@@ -76,6 +77,11 @@
     GetMethod get = new GetMethod(this.orig);
     get.setFollowRedirects(followRedirects);
     get.setRequestHeader("User-Agent", http.getUserAgent());
+    if (datum.getModifiedTime() > 0L) {
+      get.setRequestHeader("If-Modified-Since", HttpDateFormat.toString(datum.getModifiedTime()));
+    }
     HttpMethodParams params = get.getParams();
     if (http.getUseHttp11()) {
       params.setVersion(HttpVersion.HTTP_1_1);
