Index: src/test/org/apache/nutch/crawl/ContinuousCrawlTestUtil.java
===================================================================
--- src/test/org/apache/nutch/crawl/ContinuousCrawlTestUtil.java	(revision 0)
+++ src/test/org/apache/nutch/crawl/ContinuousCrawlTestUtil.java	(revision 0)
@@ -0,0 +1,161 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.crawl;
+
+import java.util.Date;
+import java.util.List;
+
+import junit.framework.TestCase;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.JobConf;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Emulate a continuous crawl for one URL: launch the crawl in regular
+ * intervals and check the resulting CrawlDatum after every round.
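+ * <p>
+ * A minimal usage sketch (this mirrors how {@link TestCrawlDbStates} drives
+ * the utility; the status values are only illustrative):
+ * <pre>
+ *   ContinuousCrawlTestUtil crawlUtil = new ContinuousCrawlTestUtil(
+ *       CrawlDatum.STATUS_FETCH_GONE, CrawlDatum.STATUS_DB_GONE);
+ *   assertTrue(crawlUtil.run(false)); // false = record failures but keep going
+ * </pre>
+ * </p>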
+ */
+public class ContinuousCrawlTestUtil extends TestCase {
+  
+  private static final Logger LOG = LoggerFactory.getLogger(ContinuousCrawlTestUtil.class);
+  
+  protected static Text dummyURL = new Text("http://nutch.apache.org/");
+
+  protected static Configuration defaultConfig = CrawlDBTestUtil
+      .createConfiguration();
+
+  private long interval = FetchSchedule.SECONDS_PER_DAY*1000; // (default) launch crawler every day
+  private long duration = 365L*2*FetchSchedule.SECONDS_PER_DAY*1000L; // run for two years
+
+  protected Configuration configuration;
+  private FetchSchedule schedule;
+  
+  /** status a fetched datum should get */
+  protected byte fetchStatus = CrawlDatum.STATUS_FETCH_SUCCESS;
+  /** expected status of the resulting Db datum */
+  protected byte expectedDbStatus = CrawlDatum.STATUS_DB_FETCHED;
+  
+  protected ContinuousCrawlTestUtil(Configuration conf) {
+    configuration = conf;
+    schedule = FetchScheduleFactory.getFetchSchedule(new JobConf(conf));
+  }
+  
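+  /**
+   * @param conf  configuration used to get the {@link FetchSchedule}
+   * @param fetchStatus  status the emulated fetch returns
+   * @param expectedDbStatus  status the resulting db datum is expected to have
+   */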
+  protected ContinuousCrawlTestUtil(Configuration conf, byte fetchStatus, byte expectedDbStatus) {
+    this(conf);
+    this.fetchStatus = fetchStatus;
+    this.expectedDbStatus = expectedDbStatus;
+  }
+  
+  protected ContinuousCrawlTestUtil() {
+    this(defaultConfig);
+  }
+  
+  protected ContinuousCrawlTestUtil(byte fetchStatus, byte expectedDbStatus) {
+    this(defaultConfig, fetchStatus, expectedDbStatus);
+  }
+  
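+  /** set the interval between two launches of the emulated crawler (in seconds) */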
+  protected void setInterval(int seconds) {
+    interval = seconds*1000L;
+  }
+  
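+  /** set the total duration of the emulated continuous crawl (in seconds) */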
+  protected void setDuration(int seconds) {
+    duration = seconds*1000L;
+  }
+  
+  /**
+   * Default check: compare the status of the resulting CrawlDatum against
+   * {@link #expectedDbStatus}. Subclasses may override this to check further
+   * fields or conditions.
+   */
+  protected boolean checkResult(CrawlDatum result) {
+    return result.getStatus() == expectedDbStatus;
+  }
+  
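+  /**
+   * Run the continuous crawl emulation: fetch and update the dummy URL in
+   * regular intervals and check the CrawlDatum resulting from each update.
+   * @param haltOnFailure
+   *          if true, stop at the first failed check, otherwise record the
+   *          failure and continue until the end of the emulated crawl duration
+   * @return true if every checked CrawlDatum had the expected status
+   */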
+  protected boolean run(boolean haltOnFailure) {
+
+    long now = System.currentTimeMillis();
+
+    CrawlDbUpdateUtil updateDb = new CrawlDbUpdateUtil(configuration);
+          
+    /* start with a db_unfetched */
+    CrawlDatum dbDatum = new CrawlDatum();
+    dbDatum.setStatus(CrawlDatum.STATUS_DB_UNFETCHED);
+    schedule.initializeSchedule(dummyURL, dbDatum); // initialize fetchInterval
+    dbDatum.setFetchTime(now);
+    
+    /* loop emulating a continuous crawl launched in regular intervals:
+     * - emulate a fetch which returns the configured fetch status
+     * - run updatedb and use the resulting CrawlDatum for the next round
+     */
+    LOG.info("Emulate a continuous crawl, launched every "
+        + (interval / (FetchSchedule.SECONDS_PER_DAY * 1000)) + " day(s)");
+    long maxTime = (now + duration);
+    long nextTime = now;
+    long lastFetchTime = -1;
+    boolean ok = true; // record failure but keep going
+    CrawlDatum fetchDatum = new CrawlDatum();
+    /* keep copies because CrawlDbReducer.reduce()
+     * and FetchSchedule.shouldFetch() may alter the references */
+    CrawlDatum copyDbDatum = new CrawlDatum();
+    CrawlDatum copyFetchDatum = new CrawlDatum();
+    CrawlDatum afterShouldFetch = new CrawlDatum();
+    while (nextTime < maxTime) {
+      LOG.info("check: " + new Date(nextTime));
+      fetchDatum.set(dbDatum);
+      copyDbDatum.set(dbDatum);
+      if (schedule.shouldFetch(dummyURL, fetchDatum, nextTime)) {
+        LOG.info("... fetching");
+        if (lastFetchTime > -1) {
+          LOG.info("(last fetch: "
+              + new Date(lastFetchTime)
+              + " : "
+              + ((nextTime - lastFetchTime)
+                  / (FetchSchedule.SECONDS_PER_DAY * 1000))
+              + " days ago)");
+        }
+        lastFetchTime = nextTime;
+        afterShouldFetch.set(fetchDatum);
+        fetchDatum.setStatus(fetchStatus);
+        fetchDatum.setFetchTime(nextTime);
+        copyFetchDatum.set(fetchDatum);
+        List<CrawlDatum> res = updateDb.update(dbDatum, fetchDatum);
+        assertNotNull("null returned", res);
+        assertFalse("no CrawlDatum", 0 == res.size());
+        assertEquals("more than one CrawlDatum", 1, res.size());
+        if (!checkResult(res.get(0))) {
+          LOG.info("CrawlDb: " + copyDbDatum);
+          LOG.info("After shouldFetch(): " + afterShouldFetch);
+          LOG.info("Fetch: " + fetchDatum);
+          LOG.warn("Wrong result (not "
+              + CrawlDatum.getStatusName(expectedDbStatus) + "): "
+              + res.get(0));
+          if (haltOnFailure) {
+            return false;
+          } else {
+            ok = false; // record failure but keep going
+          }
+        }
+        /* use the returned CrawlDatum for the next fetch */
+        dbDatum = res.get(0);
+      }
+      nextTime += interval;
+    }
+    return ok;
+  }
+  
+}
Index: src/test/org/apache/nutch/crawl/CrawlDbUpdateUtil.java
===================================================================
--- src/test/org/apache/nutch/crawl/CrawlDbUpdateUtil.java	(revision 0)
+++ src/test/org/apache/nutch/crawl/CrawlDbUpdateUtil.java	(revision 0)
@@ -0,0 +1,144 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.crawl;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.Counters;
+import org.apache.hadoop.mapred.Counters.Counter;
+import org.apache.hadoop.mapred.InputSplit;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reporter;
+import org.apache.hadoop.util.StringUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Utility to test transitions of {@link CrawlDatum} states during an update of
+ * {@link CrawlDb} (command {@literal updatedb}): call
+ * {@link CrawlDbReducer#reduce(Text, Iterator, OutputCollector, Reporter)} with
+ * the old CrawlDatum (db status) and the new one (fetch status)
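+ * <p>
+ * A minimal usage sketch (assumed typical use from within the same package;
+ * the status values are only illustrative):
+ * <pre>
+ *   CrawlDbUpdateUtil updateDb = new CrawlDbUpdateUtil(conf);
+ *   CrawlDatum dbDatum = new CrawlDatum();
+ *   dbDatum.setStatus(CrawlDatum.STATUS_DB_UNFETCHED);
+ *   CrawlDatum fetchDatum = new CrawlDatum();
+ *   fetchDatum.setStatus(CrawlDatum.STATUS_FETCH_SUCCESS);
+ *   List&lt;CrawlDatum&gt; result = updateDb.update(dbDatum, fetchDatum);
+ * </pre>
+ * </p>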
+ */
+public class CrawlDbUpdateUtil {
+
+  private static final Logger LOG = LoggerFactory.getLogger(CrawlDbUpdateUtil.class);
+  
+  private CrawlDbReducer reducer = new CrawlDbReducer();
+
+  private static Text dummyURL = new Text("http://nutch.apache.org/");
+
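+  /** configure the wrapped {@link CrawlDbReducer} from the given configuration */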
+  protected CrawlDbUpdateUtil(Configuration conf) {
+    reducer.configure(new JobConf(conf));
+  }
+
+  /** {@link OutputCollector} to collect all values in a {@link List} */
+  private class ListOutputCollector implements
+      OutputCollector<Text, CrawlDatum> {
+
+    private List<CrawlDatum> values = new ArrayList<CrawlDatum>();
+
+    public void collect(Text key, CrawlDatum value) throws IOException {
+      values.add(value);
+    }
+
+    /** collected values as list */
+    public List<CrawlDatum> getValues() {
+      return values;
+    }
+
+  }
+
+  /**
+   * Dummy reporter which does nothing and does not return null for
+   * getCounter()
+   * 
+   * @see Reporter#NULL
+   */
+  private class DummyReporter implements Reporter {
+
+    private Counters dummyCounters = new Counters();
+
+    public void progress() {
+    }
+
+    public Counter getCounter(Enum<?> arg0) {
+      return dummyCounters.getGroup("dummy").getCounterForName("dummy");
+    }
+
+    public Counter getCounter(String arg0, String arg1) {
+      return dummyCounters.getGroup("dummy").getCounterForName("dummy");
+    }
+
+    public InputSplit getInputSplit() throws UnsupportedOperationException {
+      throw new UnsupportedOperationException("Dummy reporter without input");
+    }
+
+    public void incrCounter(Enum<?> arg0, long arg1) {
+    }
+
+    public void incrCounter(String arg0, String arg1, long arg2) {
+    }
+
+    public void setStatus(String arg0) {
+    }
+
+  }
+
+  /**
+   * run
+   * {@link CrawlDbReducer#reduce(Text, Iterator, OutputCollector, Reporter)}
+   * and return the CrawlDatum(s) which would have been written into CrawlDb
+   * @param values  list of input CrawlDatums
+   * @return  list of resulting CrawlDatum(s) in CrawlDb
+   */
+  public List<CrawlDatum> update(List<CrawlDatum> values) {
+    if (values == null || values.size() == 0) {
+      return new ArrayList<CrawlDatum>(0);
+    }
+    ListOutputCollector output = new ListOutputCollector();
+    try {
+      reducer.reduce(dummyURL, values.iterator(), output, new DummyReporter());
+    } catch (IOException e) {
+      LOG.error(StringUtils.stringifyException(e));
+    }
+    return output.getValues();
+  }
+
+  /**
+   * run {@link CrawlDbReducer#reduce(Text, Iterator, OutputCollector, Reporter)}
+   * and return the CrawlDatum(s) which would have been written into CrawlDb
+   * @param dbDatum  previous CrawlDatum in CrawlDb
+   * @param fetchDatum  CrawlDatum resulting from fetching
+   * @return  list of resulting CrawlDatum(s) in CrawlDb
+   */
+  public List<CrawlDatum> update(CrawlDatum dbDatum,
+      CrawlDatum fetchDatum) {
+    List<CrawlDatum> values = new ArrayList<CrawlDatum>();
+    if (dbDatum != null)
+      values.add(dbDatum);
+    if (fetchDatum != null)
+      values.add(fetchDatum);
+    return update(values);
+  }
+
+}
Index: src/test/org/apache/nutch/crawl/TestCrawlDbStates.java
===================================================================
--- src/test/org/apache/nutch/crawl/TestCrawlDbStates.java	(revision 0)
+++ src/test/org/apache/nutch/crawl/TestCrawlDbStates.java	(revision 0)
@@ -0,0 +1,135 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.crawl;
+
+import org.apache.hadoop.conf.Configuration;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import junit.framework.TestCase;
+
+
+/**
+ * Test transitions of {@link CrawlDatum} states during an update of {@link CrawlDb}
+ * (command {@literal updatedb}):
+ * <ul>
+ * <li>call updatedb 
+ * with the old CrawlDatum (db status) and the new one (fetch status) and
+ * test whether the resulting CrawlDatum has the appropriate status.</li>
+ * <li>also check for further CrawlDatum fields (signature, etc.)</li>
+ * <li>and additional conditions
+ * <ul>
+ * <li>retry counters</li>
+ * <li>signatures</li>
+ * <li>configuration properties</li>
+ * </ul>
+ * </li>
+ * </ul>
+ */
+public class TestCrawlDbStates extends TestCase {
+
+  private static final Logger LOG = LoggerFactory.getLogger(TestCrawlDbStates.class);
+
+  /**
+   * NUTCH-1245: a fetch_gone should always result in a db_gone.
+   * <p>
+   * {@link FetchSchedule#setPageGoneSchedule(Text, CrawlDatum, long, long, long)}
+ * increases the fetchInterval every time it is called. After a gone page has
+ * been re-fetched several times in a long-running continuous crawl, the
+ * fetchInterval becomes larger than fetchIntervalMax, which caused a call to
+   * {@link FetchSchedule#forceRefetch(Text, CrawlDatum, boolean)}.
+   * </p>
+   */
+  public void testCrawlDbReducerPageGoneSchedule1() {
+    ContinuousCrawlTestUtil crawlUtil = new ContinuousCrawlTestUtil(
+        CrawlDatum.STATUS_FETCH_GONE, CrawlDatum.STATUS_DB_GONE);
+    LOG.info("NUTCH-1245: test long running continuous crawl");
+    if (!crawlUtil.run(false)) {
+      fail("fetch_gone did not result in a db_gone (NUTCH-1245)");
+    }
+  }
+
+  /**
+   * NUTCH-1245: a fetch_gone should always result in a db_gone.
+   * <p>
+   * Emulate a misconfiguration: db.fetch.interval.default is set to a value
+   * greater than (fetchIntervalMax * 1.5).
+   * </p>
+   */
+  public void testCrawlDbReducerPageGoneSchedule2() {
+    Configuration conf = CrawlDBTestUtil.createConfiguration();
+    int fetchIntervalMax = conf.getInt("db.fetch.interval.max", 0);
+    conf.setInt("db.fetch.interval.default",
+        3 + (int) (fetchIntervalMax * 1.5));
+    ContinuousCrawlTestUtil crawlUtil = new ContinuousCrawlTestUtil(conf,
+        CrawlDatum.STATUS_FETCH_GONE, CrawlDatum.STATUS_DB_GONE);
+    LOG.info("NUTCH-1245 (misconfiguration): test with db.fetch.interval.default > (1.5 * db.fetch.interval.max)");
+    if (!crawlUtil.run(true)) {
+      fail("fetch_gone did not result in a db_gone (NUTCH-1245)");
+    }
+  }
+
+  
+  /**
+   * NUTCH-578: a fetch_retry should result in a db_gone if db.fetch.retry.max is reached.
+   * Retry counter has to be reset appropriately.
+   */
+  public void testCrawlDbReducerPageRetrySchedule() {
+    ContinuousCrawlTestUtil crawlUtil = new ContinuousCrawlTestFetchRetry();
+    LOG.info("NUTCH-578: test long running continuous crawl with fetch_retry");
+    if (!crawlUtil.run(false)) {
+      fail("fetch_retry did not result in a db_gone if retry counter > maxRetries (NUTCH-578)");
+    }
+  }
+  
+  private class ContinuousCrawlTestFetchRetry extends ContinuousCrawlTestUtil {
+
+    private int retryMax = 3;
+    
+    ContinuousCrawlTestFetchRetry() {
+      super();
+      fetchStatus = CrawlDatum.STATUS_FETCH_RETRY;
+      retryMax = configuration.getInt("db.fetch.retry.max", 3);
+    }
+    
+    @Override
+    protected boolean checkResult(CrawlDatum result) {
+      if (result.getRetriesSinceFetch() > retryMax) {
+        LOG.warn("Retry counter > db.fetch.retry.max");
+      } else if (result.getRetriesSinceFetch() < 0) {
+        LOG.warn("Retry counter overflow");
+      }
+      if (result.getRetriesSinceFetch() <= retryMax) {
+        if (result.getStatus() == CrawlDatum.STATUS_DB_UNFETCHED) {
+          LOG.info("ok: " + result);
+          return true;
+        }
+      } else {
+        if (result.getStatus() == CrawlDatum.STATUS_DB_GONE) {
+          LOG.info("ok: " + result);
+          return true;
+        }
+      }
+      LOG.warn("wrong: " + result);      
+      return false;
+    }
+    
+  }  
+
+
+}
+
