Index: conf/nutch-default.xml
===================================================================
--- conf/nutch-default.xml	(revision 359822)
+++ conf/nutch-default.xml	(working copy)
@@ -209,6 +209,16 @@
 </property>
 
 <property>
+  <name>db.max.fetch.interval</name>
+  <value>60</value>
+  <description>The maximum number of days between re-fetches of a page.
+  There is an assumption that segments older than this number of days will
+  be removed from the system, thus automatically requesting a re-fetch of
+  the corresponding entries in CrawlDb.
+  </description>
+</property>
+
+<property>
   <name>db.ignore.internal.links</name>
   <value>true</value>
   <description>If true, when adding new links to a page, links from
@@ -263,6 +273,57 @@
 </property>
 
 <property>
+  <name>db.fetch.schedule.class</name>
+  <value>org.apache.nutch.crawl.DefaultFetchSchedule</value>
+  <description>The default implementation of fetch schedule. It simply
+  adds the original fetchInterval to the last fetch time, regardless of
+  page changes.</description>
+</property>
+
+<property>
+  <name>db.fetch.schedule.adaptive.inc_rate</name>
+  <value>0.2</value>
+  <description>If a page is unmodified, its fetchInterval will be
+  increased by this rate.</description>
+</property>
+
+<property>
+  <name>db.fetch.schedule.adaptive.dec_rate</name>
+  <value>0.2</value>
+  <description>If a page is modified, its fetchInterval will be
+  decreased by this rate.</description>
+</property>
+
+<property>
+  <name>db.fetch.schedule.adaptive.min_interval</name>
+  <value>60.0</value>
+  <description>Minimum fetchInterval, in seconds.</description>
+</property>
+
+<property>
+  <name>db.fetch.schedule.adaptive.max_interval</name>
+  <value>31536000.0</value>
+  <description>Maximum fetchInterval, in seconds (365 days).
+  NOTE: this is limited by db.max.fetch.interval. Pages with
+  fetchInterval larger than db.max.fetch.interval
+  will be fetched anyway.</description>
+</property>
+
+<property>
+  <name>db.fetch.schedule.adaptive.sync_delta</name>
+  <value>true</value>
+  <description>If true, try to synchronize with the time of page change,
+  by shifting the next fetchTime by a fraction (sync_rate) of the difference
+  between the last modification time, and the last fetch time.</description>
+</property>
+
+<property>
+  <name>db.fetch.schedule.adaptive.sync_delta_rate</name>
+  <value>0.5</value>
+  <description>See sync_delta for description.</description>
+</property>
+
+<property>
   <name>db.signature.class</name>
   <value>org.apache.nutch.crawl.MD5Signature</value>
   <description>The default implementation of a page signature. Signatures
Index: src/java/org/apache/nutch/fetcher/Fetcher.java
===================================================================
--- src/java/org/apache/nutch/fetcher/Fetcher.java	(revision 359822)
+++ src/java/org/apache/nutch/fetcher/Fetcher.java	(working copy)
@@ -113,7 +113,7 @@
               redirecting = false;
               LOG.fine("redirectCount=" + redirectCount);
               Protocol protocol = ProtocolFactory.getProtocol(url);
-              ProtocolOutput output = protocol.getProtocolOutput(key, datum);
+              ProtocolOutput output = protocol.getProtocolOutput(key, datum);              
               ProtocolStatus status = output.getStatus();
               Content content = output.getContent();
 
@@ -124,6 +124,11 @@
                 updateStatus(content.getContent().length);
                 break;
 
+              case ProtocolStatus.NOTMODIFIED:        // got an unmodified page
+                output(key, datum, content, CrawlDatum.STATUS_FETCH_UNMODIFIED);
+                updateStatus(content.getContent().length);
+                break;
+
               case ProtocolStatus.MOVED:         // redirect
               case ProtocolStatus.TEMP_MOVED:
                 String newUrl = status.getMessage();
@@ -142,7 +147,6 @@
               case ProtocolStatus.EXCEPTION:
                 logError(url, status.getMessage());
               case ProtocolStatus.RETRY:          // retry
-                datum.setRetriesSinceFetch(datum.getRetriesSinceFetch()+1);
                 output(key, datum, null, CrawlDatum.STATUS_FETCH_RETRY);
                 break;
                 
@@ -150,7 +154,6 @@
               case ProtocolStatus.NOTFOUND:
               case ProtocolStatus.ACCESS_DENIED:
               case ProtocolStatus.ROBOTS_DENIED:
-              case ProtocolStatus.NOTMODIFIED:
                 output(key, datum, null, CrawlDatum.STATUS_FETCH_GONE);
                 break;
 
@@ -193,6 +196,8 @@
                         Content content, int status) {
 
       datum.setStatus(status);
+      // update the fetchTime
+      datum.setFetchTime(System.currentTimeMillis());
 
       if (content == null) {
         String url = key.toString();
Index: src/java/org/apache/nutch/crawl/CrawlDatum.java
===================================================================
--- src/java/org/apache/nutch/crawl/CrawlDatum.java	(revision 359822)
+++ src/java/org/apache/nutch/crawl/CrawlDatum.java	(working copy)
@@ -41,6 +41,7 @@
   public static final byte STATUS_FETCH_SUCCESS = 5;
   public static final byte STATUS_FETCH_RETRY = 6;
   public static final byte STATUS_FETCH_GONE = 7;
+  public static final byte STATUS_FETCH_UNMODIFIED = 8;
   
   public static final String[] statNames = {
     "signature",
@@ -50,18 +51,17 @@
     "linked",
     "fetch_success",
     "fetch_retry",
-    "fetch_gone"
+    "fetch_gone",
+    "fetch_unmodified"
   };
 
-  private static final float MILLISECONDS_PER_DAY = 24 * 60 * 60 * 1000;
-
   private byte status;
   private long fetchTime = System.currentTimeMillis();
   private byte retries;
   private float fetchInterval;
   private float score = 1.0f;
   private byte[] signature = null;
-  private long modifiedTime;
+  private long modifiedTime = 0L; // unknown
 
   public CrawlDatum() {}
 
@@ -85,10 +85,6 @@
   public long getFetchTime() { return fetchTime; }
   public void setFetchTime(long fetchTime) { this.fetchTime = fetchTime; }
 
-  public void setNextFetchTime() {
-    fetchTime += (long)(MILLISECONDS_PER_DAY*fetchInterval);
-  }
-
   public long getModifiedTime() {
     return modifiedTime;
   }
Index: src/java/org/apache/nutch/crawl/FetchSchedule.java
===================================================================
--- src/java/org/apache/nutch/crawl/FetchSchedule.java	(revision 0)
+++ src/java/org/apache/nutch/crawl/FetchSchedule.java	(revision 0)
@@ -0,0 +1,62 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.crawl;
+
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.io.UTF8;
+import org.apache.nutch.util.NutchConf;
+import org.apache.nutch.util.NutchConfigured;
+
+/**
+ * This abstract class defines the contract for implementations that calculate
+ * fetch times and re-fetch intervals.
+ * 
+ * @author Andrzej Bialecki &lt;ab@getopt.org&gt;
+ */
+public abstract class FetchSchedule extends NutchConfigured {
+  
+  public FetchSchedule() {
+    this(null);
+  }
+  
+  public FetchSchedule(NutchConf conf) {
+    super(conf);
+  }
+  
+  /**
+   * Sets the <code>fetchInterval</code> and <code>fetchTime</code> on a page.
+   * Implementations may use supplied arguments to support different re-fetching
+   * schedules.
+   * 
+   * @param url url of the page
+   * @param datum page description to be adjusted. NOTE: this instance, passed by reference,
+   * may be modified inside the method.
+   * @param fetchTime the latest time, when the page was recently re-fetched. Most FetchSchedule
+   * implementations should update the value in {@code datum} to this value.
+   * @param modifiedTime last time the content was modified. This information comes from
+   * the protocol implementations, or is set to &lt; 0 if not available. Most FetchSchedule
+   * implementations should update the value in {@code datum} to this value.
+   * @param changed if Boolean.TRUE, then the content is considered to be "changed" before the
+   * <code>fetchTime</code>, if Boolean.FALSE then the content is unchanged. This information may be
+   * obtained by comparing the page signatures before and after fetching. If null, then
+   * it is unknown at this moment; implementations are free to choose a sensible default value.
+   * @return adjusted page information, including all original information. NOTE: this may
+   * be a different instance than {@code datum}, but implementations should make sure that
+   * it contains all information from {@code datum}.
+   */
+  public abstract CrawlDatum setFetchSchedule(UTF8 url, CrawlDatum datum, long fetchTime, long modifiedTime, Boolean changed);
+}

Property changes on: src/java/org/apache/nutch/crawl/FetchSchedule.java
___________________________________________________________________
Name: svn:eol-style
   + native

Index: src/java/org/apache/nutch/crawl/DefaultFetchSchedule.java
===================================================================
--- src/java/org/apache/nutch/crawl/DefaultFetchSchedule.java	(revision 0)
+++ src/java/org/apache/nutch/crawl/DefaultFetchSchedule.java	(revision 0)
@@ -0,0 +1,45 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.crawl;
+
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.io.UTF8;
+import org.apache.nutch.util.NutchConf;
+
+/**
+ * This class implements the default re-fetch schedule. That is, no matter
+ * if the page was changed or not, the <code>fetchInterval</code> remains
+ * unchanged, and the updated page fetchTime will always be set to
+ * <code>fetchTime + fetchInterval * 1000</code>.
+ * 
+ * @author Andrzej Bialecki &lt;ab@getopt.org&gt;
+ */
+public class DefaultFetchSchedule extends FetchSchedule {
+  
+  public DefaultFetchSchedule() {
+    this(null);
+  }
+  
+  public DefaultFetchSchedule(NutchConf conf) {
+    super(conf);
+  }
+
+  public CrawlDatum setFetchSchedule(UTF8 url, CrawlDatum datum, long fetchTime, long modifiedTime, Boolean changed) {
+    datum.setFetchTime(fetchTime + Math.round(datum.getFetchInterval() * 1000.0f));
+    datum.setModifiedTime(modifiedTime);
+    return datum;
+  }
+}

Property changes on: src/java/org/apache/nutch/crawl/DefaultFetchSchedule.java
___________________________________________________________________
Name: svn:eol-style
   + native

Index: src/java/org/apache/nutch/crawl/Injector.java
===================================================================
--- src/java/org/apache/nutch/crawl/Injector.java	(revision 359668)
+++ src/java/org/apache/nutch/crawl/Injector.java	(working copy)
@@ -51,7 +51,7 @@
         url = urlNormalizer.normalize(url);       // normalize the url
         url = URLFilters.filter(url);             // filter the url
       } catch (Exception e) {
-        LOG.warning("Skipping " +url+":"+e);
+        LOG.warning("Skipping " + url + ":" + e);
         url = null;
       }
       if (url != null) {                          // if it passes
Index: src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java
===================================================================
--- src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java	(revision 0)
+++ src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java	(revision 0)
@@ -0,0 +1,157 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.crawl;
+
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.io.UTF8;
+import org.apache.nutch.util.NutchConf;
+import org.apache.nutch.util.NutchConfigured;
+
+/**
+ * This class implements an adaptive re-fetch algorithm. This works as follows:
+ * <ul>
+ * <li>for pages that changed since the last fetchTime, decrease the
+ * fetchInterval by a factor of DEC_RATE (default value is 0.2f).</li>
+ * <li>for pages that did not change since the last fetchTime, increase the
+ * fetchInterval by a factor of INC_RATE (default value is 0.2f).<br>
+ * If SYNC_DELTA property is true, then:
+ * <ul>
+ * <li>calculate a <code>delta = fetchTime - modifiedTime</code></li>
+ * <li>try to synchronize with the time of change, by shifting the next fetchTime
+ * by a fraction of the difference between the last modification time and the last
+ * fetch time. I.e. the next fetch time will be set to
+ * <code>fetchTime + fetchInterval - delta * SYNC_DELTA_RATE</code></li>
+ * <li>if the delta is bigger than the adjusted fetch interval, then <code>fetchInterval = delta</code>.</li>
+ * </ul>
+ * </li>
+ * <li>the minimum value of fetchInterval may not be smaller than MIN_INTERVAL
+ * (default is 60 seconds).</li>
+ * <li>the maximum value of fetchInterval may not be bigger than MAX_INTERVAL
+ * (default is 365 days).</li>
+ * </ul>
+ * <p>NOTE: values of DEC_RATE and INC_RATE higher than 0.4f may destabilize the algorithm,
+ * so that the fetch interval either increases or decreases infinitely, with little
+ * relevance to the page changes. Please use the {@link #main(String[])} method to
+ * test the values before applying them in a production system.</p>
+ * 
+ * @author Andrzej Bialecki &lt;ab@getopt.org&gt;
+ */
+public class AdaptiveFetchSchedule extends FetchSchedule {
+
+  private float INC_RATE;
+
+  private float DEC_RATE;
+
+  private float MAX_INTERVAL;
+
+  private float MIN_INTERVAL;
+  
+  private boolean SYNC_DELTA;
+
+  private float SYNC_DELTA_RATE;
+  
+  public AdaptiveFetchSchedule() {
+    super(null);
+  }
+  
+  public AdaptiveFetchSchedule(NutchConf conf) {
+    super(null);
+    setConf(conf);
+  }
+  
+  public void setConf(NutchConf conf) {
+    super.setConf(conf);
+    if (conf == null) return;
+    INC_RATE = conf.getFloat("db.fetch.schedule.adaptive.inc_rate", 0.2f);
+    DEC_RATE = conf.getFloat("db.fetch.schedule.adaptive.dec_rate", 0.2f);
+    MIN_INTERVAL = conf.getFloat("db.fetch.schedule.adaptive.min_interval", 60.0f);
+    MAX_INTERVAL = conf.getFloat("db.fetch.schedule.adaptive.max_interval", (float) (3600 * 24 * 365)); // 1 year
+    SYNC_DELTA = conf.getBoolean("db.fetch.schedule.adaptive.sync_delta", true);
+    SYNC_DELTA_RATE = conf.getFloat("db.fetch.schedule.adaptive.sync_delta_rate", 0.5f);
+  }
+
+  public CrawlDatum setFetchSchedule(UTF8 url, CrawlDatum datum, long fetchTime, long modifiedTime, Boolean changed) {
+    boolean chg = true; // assume it's changed
+    if (changed != null) chg = changed.booleanValue();
+    long refTime = fetchTime;
+    if (modifiedTime <= 0) modifiedTime = fetchTime;
+    float interval = datum.getFetchInterval();
+    if (chg) {
+      interval *= (1.0f - DEC_RATE);
+    } else {
+      interval *= (1.0f + INC_RATE);
+    }
+    datum.setFetchInterval(interval);
+    if (SYNC_DELTA) {
+      // try to synchronize with the time of change
+      long delta = fetchTime - modifiedTime;
+      if (delta > interval) interval = delta;
+      refTime = fetchTime - Math.round(delta * SYNC_DELTA_RATE);
+    }
+    if (interval < MIN_INTERVAL) interval = MIN_INTERVAL;
+    if (interval > MAX_INTERVAL) interval = MAX_INTERVAL;
+    datum.setFetchTime(refTime + Math.round(1000.0f * datum.getFetchInterval()));
+    datum.setModifiedTime(modifiedTime);
+    return datum;
+  }
+
+  public static void main(String[] args) throws Exception {
+    FetchSchedule fs = new AdaptiveFetchSchedule(NutchConf.get());
+    // we start the time at 0, for simplicity
+    long curTime = 0;
+    long delta = 1000L * 3600L * 2L; // 2 hours
+    // we trigger the update of the page every 30 days
+    long update = 1000L * 3600L * 24L * 30L; // 30 days
+    boolean changed = true;
+    long lastModified = 0;
+    int miss = 0;
+    int totalMiss = 0;
+    int maxMiss = 0;
+    int fetchCnt = 0;
+    int changeCnt = 0;
+    // initial fetchInterval is 10 days
+    CrawlDatum p = new CrawlDatum(1, 3600 * 24 * 10, 1.0f);
+    p.setFetchTime(0);
+    System.out.println(p);
+    // let's move the timeline a couple of deltas
+    for (int i = 0; i < 10000; i++) {
+      if (lastModified + update < curTime) {
+        //System.out.println("i=" + i + ", lastModified=" + lastModified + ", update=" + update + ", curTime=" + curTime);
+        changed = true;
+        changeCnt++;
+        lastModified = curTime;
+      }
+      System.out.println(i + ". " + changed + "\twill fetch at " + (p.getFetchTime() / delta) + "\tinterval "
+              + (p.getFetchInterval() / (float) (3600 * 24)) + " days" + "\t missed " + miss);
+      if (p.getFetchTime() <= curTime) {
+        fetchCnt++;
+        fs.setFetchSchedule(new UTF8("http://www.example.com"), p, curTime, lastModified, new Boolean(changed));
+        float interval = p.getFetchInterval();
+        System.out.println("\tfetched & adjusted: " + "\twill fetch at " + (p.getFetchTime() / delta) + "\tinterval "
+                + (p.getFetchInterval() / (float) (3600 * 24)) + " days");
+        if (!changed) miss++;
+        if (miss > maxMiss) maxMiss = miss;
+        changed = false;
+        totalMiss += miss;
+        miss = 0;
+      }
+      if (changed) miss++;
+      curTime += delta;
+    }
+    System.out.println("Total missed: " + totalMiss + ", max miss: " + maxMiss);
+    System.out.println("Page changed " + changeCnt + " times, fetched " + fetchCnt + " times.");
+  }
+}

Property changes on: src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java
___________________________________________________________________
Name: svn:eol-style
   + native

Index: src/java/org/apache/nutch/crawl/Generator.java
===================================================================
--- src/java/org/apache/nutch/crawl/Generator.java	(revision 359668)
+++ src/java/org/apache/nutch/crawl/Generator.java	(working copy)
@@ -39,6 +39,7 @@
     private long curTime;
     private long limit;
     private long count;
+    private float maxInterval;
     private HashMap hostCounts = new HashMap();
     private int maxPerHost;
     private Partitioner hostPartitioner = new PartitionUrlByHost();
@@ -47,6 +48,7 @@
       curTime = job.getLong("crawl.gen.curTime", System.currentTimeMillis());
       limit = job.getLong("crawl.topN",Long.MAX_VALUE)/job.getNumReduceTasks();
       maxPerHost = job.getInt("generate.max.per.host", -1);
+      maxInterval = (float)job.getInt("db.max.fetch.interval", 60) * 3600.0f * 24.0f;
     }
 
     /** Select & invert subset due for fetch. */
@@ -58,9 +60,16 @@
       if (crawlDatum.getStatus() == CrawlDatum.STATUS_DB_GONE)
         return;                                   // don't retry
 
+      // first, force re-crawl of very old pages
+      if (crawlDatum.getFetchTime() - curTime > maxInterval) {
+        crawlDatum.setFetchInterval(maxInterval * 0.9f);
+        crawlDatum.setFetchTime(curTime);
+      }
+      
       if (crawlDatum.getFetchTime() > curTime)
         return;                                   // not time yet
-
+      
+      
       output.collect(crawlDatum, key);          // invert for sort by score
     }
 
Index: src/java/org/apache/nutch/crawl/CrawlDbReducer.java
===================================================================
--- src/java/org/apache/nutch/crawl/CrawlDbReducer.java	(revision 359822)
+++ src/java/org/apache/nutch/crawl/CrawlDbReducer.java	(working copy)
@@ -16,20 +16,20 @@
 
 package org.apache.nutch.crawl;
 
-import java.net.URL;
 import java.util.Iterator;
 import java.io.IOException;
 
 import org.apache.nutch.io.*;
-import org.apache.nutch.util.*;
 import org.apache.nutch.mapred.*;
 
 /** Merge new page entries with existing entries. */
 public class CrawlDbReducer implements Reducer {
   private int retryMax;
+  private JobConf job;
 
   public void configure(JobConf job) {
     retryMax = job.getInt("db.fetch.retry.max", 3);
+    this.job = job;
   }
 
   public void reduce(WritableComparable key, Iterator values,
@@ -83,14 +83,50 @@
       break;
       
     case CrawlDatum.STATUS_FETCH_SUCCESS:         // succesful fetch
+    case CrawlDatum.STATUS_FETCH_UNMODIFIED:      // successful fetch, but not modified
       result = highest;                           // use new entry
-      if (highest.getSignature() == null) highest.setSignature(signature);
+      int status = result.getStatus();
       result.setStatus(CrawlDatum.STATUS_DB_FETCHED);
-      result.setNextFetchTime();
+      result.setRetriesSinceFetch(0);
+      // apply the FetchSchedule
+      long fetchTime = result.getFetchTime();
+      long modifiedTime = result.getModifiedTime();
+      // set old times as an initial reference
+      result.setFetchTime(old.getFetchTime());
+      result.setModifiedTime(old.getModifiedTime());
+      boolean changed = true;
+      if (status == CrawlDatum.STATUS_FETCH_UNMODIFIED) {
+        changed = false;
+        // use the old signature, because the new one wasn't computed (no content)
+        result.setSignature(old.getSignature());
+      } else {
+        if (result.getSignature() == null) result.setSignature(signature);
+        // don't believe the protocol layer blindly, check it here...
+        changed =
+          SignatureComparator._compare(old.getSignature(), result.getSignature()) != 0;
+      }
+      FetchSchedule schedule = FetchScheduleFactory.getFetchSchedule(job);
+      result = schedule.setFetchSchedule((UTF8)key, result, fetchTime,
+              modifiedTime, new Boolean(changed));
+      // if fetchInterval is larger than the system-wide maximum, trigger
+      // an unconditional recrawl. This prevents the page from being stuck in
+      // the UNMODIFIED state, when the old fetched copy was already removed with
+      // old segments.
+      int defaultMax = job.getInt("db.max.fetch.interval", 30); // 30 days
+      float max = (float)defaultMax * 3600.0f * 24.0f;
+      if (max < result.getFetchInterval()) {
+        // reduce the fetch interval so that it fits within the max period
+        result.setFetchInterval(max * 0.9f);
+        // return to the original state of ignorance
+        result.setStatus(CrawlDatum.STATUS_DB_UNFETCHED);
+        result.setSignature(null);
+        result.setModifiedTime(0);
+      }
       break;
-
+      
     case CrawlDatum.STATUS_FETCH_RETRY:           // temporary failure
       result = highest;                           // use new entry
+      result.setRetriesSinceFetch(result.getRetriesSinceFetch() + 1);
       if (old != null)
         result.setSignature(old.getSignature());  // use old signature
       if (highest.getRetriesSinceFetch() < retryMax) {
Index: src/java/org/apache/nutch/crawl/SignatureFactory.java
===================================================================
--- src/java/org/apache/nutch/crawl/SignatureFactory.java	(revision 359822)
+++ src/java/org/apache/nutch/crawl/SignatureFactory.java	(working copy)
@@ -44,6 +44,7 @@
         Class implClass = Class.forName(clazz);
         impl = (Signature)implClass.newInstance();
         impl.setConf(conf);
+        conf.setObject(clazz, impl);
       } catch (Exception e) {
         throw new RuntimeException("Couldn't create " + clazz, e);
       }
Index: src/java/org/apache/nutch/parse/ParseOutputFormat.java
===================================================================
--- src/java/org/apache/nutch/parse/ParseOutputFormat.java	(revision 359822)
+++ src/java/org/apache/nutch/parse/ParseOutputFormat.java	(working copy)
@@ -37,6 +37,7 @@
                                       String name) throws IOException {
 
     final float interval = job.getFloat("db.default.fetch.interval", 30f);
+    final float extscore = job.getFloat("db.score.link.external", 1.0f);
 
     File text =
       new File(new File(job.getOutputDir(), ParseText.DIR_NAME), name);
@@ -81,8 +82,10 @@
           Outlink[] links = parse.getData().getOutlinks();
 
           // compute OPIC score contribution
-          float score =
-            Float.parseFloat(parse.getData().get(Fetcher.SCORE_KEY));
+          String scoreString = parse.getData().get(Fetcher.SCORE_KEY);
+          float score = extscore;
+          // this may happen if there was a fetch error.
+          if (scoreString != null) score = Float.parseFloat(scoreString);
           score /= links.length;
                           
           for (int i = 0; i < links.length; i++) {
Index: src/java/org/apache/nutch/parse/ParseUtil.java
===================================================================
--- src/java/org/apache/nutch/parse/ParseUtil.java	(revision 359668)
+++ src/java/org/apache/nutch/parse/ParseUtil.java	(working copy)
@@ -64,7 +64,7 @@
     
     Parse parse = null;
     for (int i=0; i<parsers.length; i++) {
-      LOG.info("Parsing [" + content.getUrl() + "] with [" + parsers[i] + "]");
+      LOG.fine("Parsing [" + content.getUrl() + "] with [" + parsers[i] + "]");
       parse = parsers[i].getParse(content);
       if ((parse != null) && (parse.getData().getStatus().isSuccess())) {
         return parse;
Index: src/java/org/apache/nutch/parse/ParserFactory.java
===================================================================
--- src/java/org/apache/nutch/parse/ParserFactory.java	(revision 359668)
+++ src/java/org/apache/nutch/parse/ParserFactory.java	(working copy)
@@ -237,6 +237,7 @@
     try {
         type = MimeType.clean(contentType);
     } catch (MimeTypeException mte) {
+      mte.printStackTrace();
         LOG.info("Could not clean the content-type [" + contentType +
                  "], Reason is [" + mte + "]. Using its raw version...");
         type = contentType;
Index: src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java
===================================================================
--- src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java	(revision 359668)
+++ src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java	(working copy)
@@ -52,16 +52,8 @@
 
   static int maxContentLength = NutchConf.get().getInt("file.content.limit", 64 * 1024);
 
-  // 20040412, xing
-  // the following three: HttpDateFormat, MimetypesFileTypeMap, MagicFile
-  // are placed in each thread before we check out if they're thread-safe.
-
-  // http date format
-  HttpDateFormat httpDateFormat = null;
-
   // constructor
   public File() {
-    this.httpDateFormat = new HttpDateFormat();
   }
 
   /** Set the point at which content is truncated. */
@@ -69,35 +61,50 @@
 
   public ProtocolOutput getProtocolOutput(UTF8 url, CrawlDatum datum) {
     String urlString = url.toString();
+    ProtocolOutput res = null;
     try {
       URL u = new URL(urlString);
   
-      int redirects = 0;
+      FileResponse response;
+      response = new FileResponse(u, datum);   // make a request
   
-      while (true) {
-        FileResponse response;
-        response = new FileResponse(u, datum, this);   // make a request
+      int code = response.getCode();
+      Content content = response.toContent();
+      // we don't know yet
+      long lastModified = 0L;
+      String modified = response.getHeader("Last-Modified");
+      if (modified != null) lastModified = HttpDateFormat.toLong(modified);
   
-        int code = response.getCode();
-  
-        if (code == 200) {                          // got a good response
-          return new ProtocolOutput(response.toContent());              // return it
-  
-        } else if (code >= 300 && code < 400) {     // handle redirect
-          if (redirects == MAX_REDIRECTS)
-            throw new FileException("Too many redirects: " + url);
+      switch (code) {
+        case 200:                          // got a good response
+          res = new ProtocolOutput(content,
+                  new ProtocolStatus(ProtocolStatus.SUCCESS, lastModified));              // return it
+          break;
+        case 304:                         // not modified
+          res = new ProtocolOutput(content,
+                  new ProtocolStatus(ProtocolStatus.NOTMODIFIED, lastModified));              // return it
+          break;
+        case 300:                         // redirect
           u = new URL(response.getHeader("Location"));
-          redirects++;                
-          if (LOG.isLoggable(Level.FINE))
-            LOG.fine("redirect to " + u); 
-  
-        } else {                                    // convert to exception
+          res = new ProtocolOutput(content,
+                  new ProtocolStatus(ProtocolStatus.MOVED, u));
+          break;
+        case 401:                         // not authorized
+          res = new ProtocolOutput(content,
+                  new ProtocolStatus(ProtocolStatus.ACCESS_DENIED, "Authentication required: "
+                          + u.toString()));
+          break;
+        case 404:                         // not found
+          res = new ProtocolOutput(content,
+                  new ProtocolStatus(ProtocolStatus.NOTFOUND, u));
+          break;
+        default:                                    // convert to exception
           throw new FileError(code);
-        }
-      } 
+      }
     } catch (Exception e) {
       return new ProtocolOutput(null, new ProtocolStatus(e));
     }
+    return res;
   }
 
 //  protected void finalize () {
Index: src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java
===================================================================
--- src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java	(revision 359668)
+++ src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java	(working copy)
@@ -24,6 +24,7 @@
 
 // Nutch imports
 import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.net.protocols.HttpDateFormat;
 import org.apache.nutch.protocol.Content;
 import org.apache.nutch.protocol.ContentProperties;
 
@@ -62,8 +63,6 @@
   private int code;
   private ContentProperties headers = new ContentProperties();
 
-  private final File file;
-
   /** Returns the response code. */
   public int getCode() { return code; }
 
@@ -80,12 +79,11 @@
                        headers);
   }
 
-  public FileResponse(URL url, CrawlDatum datum, File file)
+  public FileResponse(URL url, CrawlDatum datum)
     throws FileException, IOException {
 
     this.orig = url.toString();
     this.base = url.toString();
-    this.file = file;
 
     if (!"file".equals(url.getProtocol()))
       throw new FileException("Not a file url:" + url);
@@ -130,10 +128,15 @@
         return;
       }
 
+      if (f.lastModified() <= datum.getModifiedTime()) {
+        this.code = 304;
+        this.headers.put("Last-Modified", HttpDateFormat.toString(f.lastModified()));
+        return;
+      }
       if (f.isDirectory()) {
-        getDirAsHttpResponse(f);
+        getDirAsHttpResponse(f, datum.getModifiedTime());
       } else if (f.isFile()) {
-        getFileAsHttpResponse(f);
+        getFileAsHttpResponse(f, datum.getModifiedTime());
       } else {
         this.code = 500; // http Internal Server Error
         return;
@@ -146,7 +149,7 @@
   }
 
   // get file as http response
-  private void getFileAsHttpResponse(java.io.File f)
+  private void getFileAsHttpResponse(java.io.File f, long lastModified)
     throws FileException, IOException {
 
     // ignore file of size larger than
@@ -162,8 +165,8 @@
     // capture content
     int len = (int) size;
     
-    if (this.file.maxContentLength > 0 && len > this.file.maxContentLength)
-      len = this.file.maxContentLength;
+    if (File.maxContentLength > 0 && len > File.maxContentLength)
+      len = File.maxContentLength;
 
     this.content = new byte[len];
 
@@ -183,7 +186,7 @@
     hdrs.put("Content-Length", new Long(size).toString());
 
     hdrs.put("Last-Modified",
-      this.file.httpDateFormat.toString(f.lastModified()));
+      HttpDateFormat.toString(f.lastModified()));
 
     hdrs.put("Content-Type", "");   // No Content-Type at file protocol level
 
@@ -194,7 +197,7 @@
   }
 
   // get dir list as http response
-  private void getDirAsHttpResponse(java.io.File f)
+  private void getDirAsHttpResponse(java.io.File f, long lastModified)
     throws IOException {
 
     String path = f.toString();
@@ -209,7 +212,7 @@
     hdrs.put("Content-Type", "text/html");
 
     hdrs.put("Last-Modified",
-      this.file.httpDateFormat.toString(f.lastModified()));
+      HttpDateFormat.toString(f.lastModified()));
 
     this.headers.putAll(hdrs);
 
@@ -235,7 +238,7 @@
     for (int i=0; i<list.length; i++) {
       f = list[i];
       String name = f.getName();
-      String time = this.file.httpDateFormat.toString(f.lastModified());
+      String time = HttpDateFormat.toString(f.lastModified());
       if (f.isDirectory()) {
         // java 1.4.2 api says dir itself and parent dir are not listed
         // so the following is not needed.
Index: src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java
===================================================================
--- src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java	(revision 359668)
+++ src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java	(working copy)
@@ -25,6 +25,7 @@
 import org.apache.commons.net.ftp.parser.ParserInitializationException;
 
 import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.net.protocols.HttpDateFormat;
 import org.apache.nutch.protocol.Content;
 import org.apache.nutch.protocol.ContentProperties;
 
@@ -76,7 +77,7 @@
   public byte[] getContent() { return content; }
 
   public Content toContent() {
-    return new Content(orig, base, content,
+    return new Content(orig, base, (content != null) ? content : new byte[0],
                        getHeader("Content-Type"),
                        headers);
   }
@@ -126,14 +127,14 @@
         //  + ((ftp.timeout<ftp.serverTimeout) ? ftp.timeout : ftp.serverTimeout);
 
         // timeout for control connection
-        ftp.client.setDefaultTimeout(ftp.timeout);
+        ftp.client.setDefaultTimeout(Ftp.timeout);
         // timeout for data connection
-        ftp.client.setDataTimeout(ftp.timeout);
+        ftp.client.setDataTimeout(Ftp.timeout);
 
         // follow ftp talk?
         if (ftp.followTalk)
           ftp.client.addProtocolCommandListener(
-            new PrintCommandListener(ftp.LOG));
+            new PrintCommandListener(Ftp.LOG));
       }
 
       // quit from previous site if at a different site now
@@ -228,18 +229,18 @@
       this.content = null;
 
       if (path.endsWith("/")) {
-        getDirAsHttpResponse(path);
+        getDirAsHttpResponse(path, datum.getModifiedTime());
       } else {
-        getFileAsHttpResponse(path);
+        getFileAsHttpResponse(path, datum.getModifiedTime());
       }
 
       // reset next renewalTime, take the lesser
       if (ftp.client != null && ftp.keepConnection) {
         ftp.renewalTime = System.currentTimeMillis()
-          + ((ftp.timeout<ftp.serverTimeout) ? ftp.timeout : ftp.serverTimeout);
+          + ((Ftp.timeout < ftp.serverTimeout) ? Ftp.timeout : ftp.serverTimeout);
         if (ftp.followTalk)
           Ftp.LOG.info("reset renewalTime to "
-            +ftp.httpDateFormat.toString(ftp.renewalTime));
+            + HttpDateFormat.toString(ftp.renewalTime));
       }
 
       // getDirAsHttpResponse() or getFileAsHttpResponse() above
@@ -252,10 +253,10 @@
       }
       
     } catch (Exception e) {
-      ftp.LOG.warning(""+e);
+      Ftp.LOG.warning(""+e);
       StackTraceElement stes[] = e.getStackTrace();
       for (int i=0; i<stes.length; i++) {
-        ftp.LOG.warning("   "+stes[i].toString());
+        Ftp.LOG.warning("   "+stes[i].toString());
       }
       // for any un-foreseen exception (run time exception or not),
       // do ultimate clean and leave ftp.client for garbage collection
@@ -277,7 +278,7 @@
   }
 
   // get ftp file as http response
-  private void getFileAsHttpResponse(String path)
+  private void getFileAsHttpResponse(String path, long lastModified)
     throws IOException {
 
     ByteArrayOutputStream os = null;
@@ -288,16 +289,22 @@
       list = new LinkedList();
       ftp.client.retrieveList(path, list, ftp.maxContentLength, ftp.parser);
 
-      os = new ByteArrayOutputStream(ftp.BUFFER_SIZE);
-      ftp.client.retrieveFile(path, os, ftp.maxContentLength);
-
       FTPFile ftpFile = (FTPFile) list.get(0);
       this.headers.put("Content-Length",
         new Long(ftpFile.getSize()).toString());
       //this.headers.put("content-type", "text/html");
       this.headers.put("Last-Modified",
-        ftp.httpDateFormat.toString(ftpFile.getTimestamp()));
+        HttpDateFormat.toString(ftpFile.getTimestamp()));
+      // don't retrieve the file if not changed.
+      if (ftpFile.getTimestamp().getTimeInMillis() <= lastModified) {
+        code = 304;
+        return;
+      }
+      os = new ByteArrayOutputStream(ftp.BUFFER_SIZE);
+      ftp.client.retrieveFile(path, os, ftp.maxContentLength);
+
       this.content = os.toByteArray();
+      
 
 //      // approximate bytes sent and read
 //      if (this.httpAccounting != null) {
@@ -332,17 +339,17 @@
         new Long(ftpFile.getSize()).toString());
       //this.headers.put("content-type", "text/html");
       this.headers.put("Last-Modified",
-        ftp.httpDateFormat.toString(ftpFile.getTimestamp()));
+        HttpDateFormat.toString(ftpFile.getTimestamp()));
       this.content = os.toByteArray();
+      if (ftpFile.getTimestamp().getTimeInMillis() <= lastModified) {
+        this.code = 304;
+      } else this.code = 200;
 
 //      // approximate bytes sent and read
 //      if (this.httpAccounting != null) {
 //        this.httpAccounting.incrementBytesSent(path.length());
 //        this.httpAccounting.incrementBytesRead(this.content.length);
 //      }
-
-      this.code = 200; // http OK
-
     } catch (FtpExceptionCanNotHaveDataConnection e) {
 
       if (FTPReply.isPositiveCompletion(ftp.client.cwd(path))) {
@@ -368,7 +375,7 @@
   }
 
   // get ftp dir list as http response
-  private void getDirAsHttpResponse(String path)
+  private void getDirAsHttpResponse(String path, long lastModified)
     throws IOException {
     List list = new LinkedList();
 
@@ -382,7 +389,7 @@
 
       // fixme, should we do ftp.client.cwd("/"), back to top dir?
 
-      ftp.client.retrieveList(null, list, ftp.maxContentLength, ftp.parser);
+      ftp.client.retrieveList(null, list, Ftp.maxContentLength, ftp.parser);
       this.content = list2html(list, path, "/".equals(path) ? false : true);
       this.headers.put("Content-Length",
         new Integer(this.content.length).toString());
@@ -449,7 +456,7 @@
     for (int i=0; i<list.size(); i++) {
       FTPFile f = (FTPFile) list.get(i);
       String name = f.getName();
-      String time = ftp.httpDateFormat.toString(f.getTimestamp());
+      String time = HttpDateFormat.toString(f.getTimestamp());
       if (f.isDirectory()) {
         // some ftp server LIST "." and "..", we skip them here
         if (name.equals(".") || name.equals(".."))
Index: src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java
===================================================================
--- src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java	(revision 359668)
+++ src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java	(working copy)
@@ -79,17 +79,8 @@
   // ftp dir list entry parser
   FTPFileEntryParser parser = null;
 
-  // 20040412, xing
-  // the following three: HttpDateFormat, MimetypesFileTypeMap, MagicFile
-  // are placed in each thread before we check out if they're thread-safe.
-
-  // http date format
-  HttpDateFormat httpDateFormat = null;
-
-
   // constructor
   public Ftp() {
-    this.httpDateFormat = new HttpDateFormat();
   }
 
   /** Set the timeout. */
@@ -114,35 +105,49 @@
 
   public ProtocolOutput getProtocolOutput(UTF8 url, CrawlDatum datum) {
     String urlString = url.toString();
+    ProtocolOutput res = null;
     try {
       URL u = new URL(urlString);
   
-      int redirects = 0;
-  
-      while (true) {
-        FtpResponse response;
-        response = new FtpResponse(u, datum, this);   // make a request
-  
-        int code = response.getCode();
-  
-        if (code == 200) {                          // got a good response
-          return new ProtocolOutput(response.toContent());              // return it
-  
-        } else if (code >= 300 && code < 400) {     // handle redirect
-          if (redirects == MAX_REDIRECTS)
-            throw new FtpException("Too many redirects: " + url);
-          u = new URL(response.getHeader("Location"));
-          redirects++;                
-          if (LOG.isLoggable(Level.FINE))
-            LOG.fine("redirect to " + u); 
-  
-        } else {                                    // convert to exception
+      FtpResponse response;
+      response = new FtpResponse(u, datum, this);   // make a request
+
+      int code = response.getCode();
+      Content c = response.toContent();
+      // last-modified time defaults to 0 (unknown) until parsed from the response header
+      long lastModified = 0L;
+      String modified = response.getHeader("Last-Modified");
+      if (modified != null) lastModified = HttpDateFormat.toLong(modified);
+
+      switch (code) {
+        case 200:                          // got a good response
+          res = new ProtocolOutput(c,
+                  new ProtocolStatus(ProtocolStatus.SUCCESS, lastModified));
+          break;
+        case 300:
+          u = new URL(u, response.getHeader("Location"));
+          res = new ProtocolOutput(c,
+                  new ProtocolStatus(ProtocolStatus.MOVED, u));
+          break;
+        case 400:   // bad request
+          LOG.fine("400 Bad request: " + urlString);
+          res = new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.GONE, u));
+          break;
+        case 401:   // auth required
+          LOG.fine("401 Authentication Required");
+          res = new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.ACCESS_DENIED, "Authentication required: "
+                  + urlString));
+          break;
+        case 404:
+          res = new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.NOTFOUND, u));
+          break;
+        default:
           throw new FtpError(code);
-        }
-      } 
+      }
     } catch (Exception e) {
       return new ProtocolOutput(null, new ProtocolStatus(e));
     }
+    return res;
   }
 
   protected void finalize () {
Index: src/plugin/protocol-httpclient/lib/commons-httpclient-3.0.jar
===================================================================
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream

Property changes on: src/plugin/protocol-httpclient/lib/commons-httpclient-3.0.jar
___________________________________________________________________
Name: svn:mime-type
   + application/octet-stream

Index: src/plugin/protocol-httpclient/lib/commons-httpclient-3.0-rc2.jar
===================================================================
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Index: src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java
===================================================================
--- src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java	(revision 359668)
+++ src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java	(working copy)
@@ -7,12 +7,14 @@
 import java.net.MalformedURLException;
 import java.net.URL;
 import java.net.UnknownHostException;
+import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.LinkedList;
 import java.util.logging.Level;
 import java.util.logging.Logger;
 
 import org.apache.commons.httpclient.Credentials;
+import org.apache.commons.httpclient.Header;
 import org.apache.commons.httpclient.HostConfiguration;
 import org.apache.commons.httpclient.HttpClient;
 import org.apache.commons.httpclient.MultiThreadedHttpConnectionManager;
@@ -22,6 +24,7 @@
 import org.apache.commons.httpclient.protocol.Protocol;
 import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.io.UTF8;
+import org.apache.nutch.net.protocols.HttpDateFormat;
 import org.apache.nutch.protocol.*;
 import org.apache.nutch.util.LogFormatter;
 import org.apache.nutch.util.NutchConf;
@@ -66,17 +69,17 @@
   static String NTLM_HOST = NutchConf.get().get("http.auth.ntlm.host", "");
 
   static {
-    LOG.info("http.proxy.host = " + PROXY_HOST);
-    LOG.info("http.proxy.port = " + PROXY_PORT);
+    LOG.fine("http.proxy.host = " + PROXY_HOST);
+    LOG.fine("http.proxy.port = " + PROXY_PORT);
 
-    LOG.info("http.timeout = " + TIMEOUT);
-    LOG.info("http.content.limit = " + MAX_CONTENT);
-    LOG.info("http.agent = " + AGENT_STRING);
+    LOG.fine("http.timeout = " + TIMEOUT);
+    LOG.fine("http.content.limit = " + MAX_CONTENT);
+    LOG.fine("http.agent = " + AGENT_STRING);
 
-    LOG.info("http.auth.ntlm.username = " + NTLM_USERNAME);
+    LOG.fine("http.auth.ntlm.username = " + NTLM_USERNAME);
 
-    LOG.info("fetcher.server.delay = " + SERVER_DELAY);
-    LOG.info("http.max.delays = " + MAX_DELAYS);
+    LOG.fine("fetcher.server.delay = " + SERVER_DELAY);
+    LOG.fine("http.max.delays = " + MAX_DELAYS);
   }
 
   /**
@@ -196,9 +199,14 @@
 
         int code = response.getCode();
         Content c = response.toContent();
+        // last-modified time defaults to 0 (unknown) until parsed from the response header
+        long lastModified = 0L;
+        String modified = response.getHeader("Last-Modified");
+        if (modified != null) lastModified = HttpDateFormat.toLong(modified);
 
         if (code == 200) { // got a good response
-          return new ProtocolOutput(c); // return it
+          return new ProtocolOutput(c,
+                  new ProtocolStatus(ProtocolStatus.SUCCESS, lastModified)); // return it
 
         } else if (code == 410) { // page is gone
           return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.GONE, "Http: " + code + " url=" + url));
@@ -230,7 +238,7 @@
               protocolStatusCode = ProtocolStatus.MOVED;
           }
           // handle this in the higher layer.
-          return new ProtocolOutput(c, new ProtocolStatus(protocolStatusCode, u));
+          return new ProtocolOutput(c, new ProtocolStatus(protocolStatusCode, u, lastModified));
         } else if (code == 400) { // bad request, mark as GONE
           LOG.fine("400 Bad request: " + u);
           return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.GONE, u));
@@ -361,6 +369,15 @@
     if (PROXY) {
       hostConf.setProxy(PROXY_HOST, PROXY_PORT);
     }
+    ArrayList headers = new ArrayList();
+    // prefer German, then English (matches the q-values in the Accept-Language header below)
+    headers.add(new Header("Accept-Language", "de-de,de;q=0.8,en-us;q=0.5,en;q=0.3"));
+    // prefer UTF-8
+    headers.add(new Header("Accept-Charset", "ISO-8859-1,utf-8;q=0.7,*;q=0.7"));
+    // prefer understandable formats
+    headers.add(new Header("Accept",
+            "text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5"));
+    hostConf.getParams().setParameter("http.default-headers", headers);
     if (NTLM_USERNAME.length() > 0) {
       Credentials ntCreds = new NTCredentials(NTLM_USERNAME, NTLM_PASSWORD, NTLM_HOST, NTLM_DOMAIN);
       client.getState().setCredentials(new AuthScope(NTLM_HOST, AuthScope.ANY_PORT), ntCreds);
Index: src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
===================================================================
--- src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java	(revision 359668)
+++ src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java	(working copy)
@@ -4,10 +4,12 @@
 package org.apache.nutch.protocol.httpclient;
 
 import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.net.protocols.HttpDateFormat;
 import org.apache.nutch.protocol.Content;
 import org.apache.nutch.protocol.ContentProperties;
 
 import org.apache.commons.httpclient.Header;
+import org.apache.commons.httpclient.HeaderElement;
 import org.apache.commons.httpclient.HttpVersion;
 
 import org.apache.commons.httpclient.cookie.CookiePolicy;
@@ -47,7 +49,7 @@
    * Returns the value of a named header.
    */
   public String getHeader(String name) {
-    return (String) headers.get(name);
+    return headers.getProperty(name);
   }
 
   public byte[] getContent() {
@@ -71,9 +73,12 @@
     GetMethod get = new GetMethod(this.orig);
     get.setFollowRedirects(followRedirects);
     get.setRequestHeader("User-Agent", Http.AGENT_STRING);
+    if (datum.getModifiedTime() > 0) {
+      get.setRequestHeader("If-Modified-Since", HttpDateFormat.toString(datum.getModifiedTime()));
+    }
     HttpMethodParams params = get.getParams();
-    // some servers cannot digest the new protocol
-    params.setVersion(HttpVersion.HTTP_1_0);
+    // some servers cannot digest the new protocol ...
+    params.setVersion(HttpVersion.HTTP_1_1);
     params.makeLenient();
     params.setContentCharset("UTF-8");
     params.setCookiePolicy(CookiePolicy.BROWSER_COMPATIBILITY);
@@ -88,7 +93,7 @@
       Header[] heads = get.getResponseHeaders();
 
       for (int i = 0; i < heads.length; i++) {
-        headers.put(heads[i].getName(), heads[i].getValue());
+        headers.setProperty(heads[i].getName(), heads[i].getValue());
       }
       // always read content. Sometimes content is useful to find a cause
       // for error.
@@ -108,6 +113,7 @@
         content = out.toByteArray();
         in.close();
       } catch (Exception e) {
+        e.printStackTrace();
         if (code == 200) throw new IOException(e.toString());
         // for codes other than 200 OK, we are fine with empty content
       }
Index: src/plugin/protocol-httpclient/plugin.xml
===================================================================
--- src/plugin/protocol-httpclient/plugin.xml	(revision 359668)
+++ src/plugin/protocol-httpclient/plugin.xml	(working copy)
@@ -10,7 +10,7 @@
          <export name="*"/>
       </library>
       <library name="commons-codec.jar" />
-      <library name="commons-httpclient-3.0-rc2.jar" />
+      <library name="commons-httpclient-3.0.jar" />
    </runtime>
 
    <requires>
Index: src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/Http.java
===================================================================
--- src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/Http.java	(revision 359668)
+++ src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/Http.java	(working copy)
@@ -30,6 +30,7 @@
 
 import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.io.UTF8;
+import org.apache.nutch.net.protocols.HttpDateFormat;
 import org.apache.nutch.protocol.*;
 
 /** An implementation of the Http protocol. */
@@ -177,38 +178,74 @@
     try {
       URL u = new URL(urlString);
 
-      int redirects = 0;
-      while (true) {
-        
-        if (!RobotRulesParser.isAllowed(u))
-          throw new HttpException("Blocked by robots.txt");
-        
-        InetAddress addr = blockAddr(u);
-        HttpResponse response;
-        try {
-          response = new HttpResponse(u, datum); // make a request
-        } finally {
-          unblockAddr(addr);
-        }
-        
-        int code = response.getCode();
-        
-        if (code == 200) {                        // got a good response
-          return new ProtocolOutput(response.toContent());            // return it
-          
-        } else if (code == 410) {                 // page is gone
-          throw new HttpException("Http: " + code);
+      if (!RobotRulesParser.isAllowed(u))
+        throw new HttpException("Blocked by robots.txt");
+      
+      InetAddress addr = blockAddr(u);
+      HttpResponse response;
+      try {
+        response = new HttpResponse(u, datum); // make a request
+      } finally {
+        unblockAddr(addr);
+      }
+      
+      int code = response.getCode();
+      
+      Content c = response.toContent();
+      // last-modified time defaults to 0 (unknown) until parsed from the response header
+      long lastModified = 0L;
+      String modified = response.getHeader("Last-Modified");
+      if (modified != null) lastModified = HttpDateFormat.toLong(modified);
 
-        } else if (code >= 300 && code < 400) {   // handle redirect
-          if (redirects == MAX_REDIRECTS)
-            throw new HttpException("Too many redirects: " + urlString);
-          u = new URL(u, response.getHeader("Location"));
-          redirects++;                
-          LOG.fine("redirect to " + u); 
-          
-        } else {                                  // convert to exception
-          throw new HttpError(code);
+      if (code == 200) { // got a good response
+        return new ProtocolOutput(c,
+                new ProtocolStatus(ProtocolStatus.SUCCESS, lastModified)); // return it
+
+      } else if (code == 410) { // page is gone
+        return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.GONE, "Http: " + code + " url=" + url));
+
+      } else if (code >= 300 && code < 400) { // handle redirect
+        String location = response.getHeader("Location");
+        // some broken servers, such as MS IIS, use lowercase header name...
+        if (location == null) location = response.getHeader("location");
+        if (location == null) location = "";
+        u = new URL(u, location);
+        int protocolStatusCode;
+        switch (code) {
+          case 300:   // multiple choices, preferred value in Location
+            protocolStatusCode = ProtocolStatus.MOVED;
+            break;
+          case 301:   // moved permanently
+          case 305:   // use proxy (Location is URL of proxy)
+            protocolStatusCode = ProtocolStatus.MOVED;
+            break;
+          case 302:   // found (temporarily moved)
+          case 303:   // see other (redirect after POST)
+          case 307:   // temporary redirect
+            protocolStatusCode = ProtocolStatus.TEMP_MOVED;
+            break;
+          case 304:   // not modified
+            protocolStatusCode = ProtocolStatus.NOTMODIFIED;
+            break;
+          default:
+            protocolStatusCode = ProtocolStatus.MOVED;
         }
+        // handle this in the higher layer.
+        return new ProtocolOutput(c, new ProtocolStatus(protocolStatusCode, u, lastModified));
+      } else if (code == 400) { // bad request, mark as GONE
+        LOG.fine("400 Bad request: " + u);
+        return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.GONE, u));
+      } else if (code == 401) { // requires authorization, but no valid auth provided.
+        LOG.fine("401 Authentication Required");
+        return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.ACCESS_DENIED, "Authentication required: "
+                + urlString));
+      } else if (code == 404) {
+        return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.NOTFOUND, u));
+      } else if (code == 410) { // permanently GONE
+        return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.GONE, u));
+      } else {
+        return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.EXCEPTION, "Http code=" + code + ", url="
+                + u));
       }
     } catch (Exception e) {
       return new ProtocolOutput(null, new ProtocolStatus(e));
Index: src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
===================================================================
--- src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java	(revision 359668)
+++ src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java	(working copy)
@@ -32,6 +32,7 @@
 import java.util.logging.Level;
 
 import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.net.protocols.HttpDateFormat;
 import org.apache.nutch.protocol.Content;
 import org.apache.nutch.protocol.ContentProperties;
 import org.apache.nutch.protocol.ProtocolException;
@@ -123,6 +124,10 @@
       reqStr.append("\r\n");
 
       reqStr.append("Accept-Encoding: x-gzip, gzip\r\n");
+      if (datum.getModifiedTime() > 0) {
+        reqStr.append("If-Modified-Since: " +
+                HttpDateFormat.toString(datum.getModifiedTime()) + "\r\n");
+      }
 
       if ((Http.AGENT_STRING == null) || (Http.AGENT_STRING.length() == 0)) {
         Http.LOG.severe("User-agent is not set!");
