Index: src/java/org/apache/nutch/storage/WebPage.java
===================================================================
--- src/java/org/apache/nutch/storage/WebPage.java	(revision 1408820)
+++ src/java/org/apache/nutch/storage/WebPage.java	(working copy)
@@ -33,6 +33,8 @@
 import org.apache.gora.persistency.StateManager;
 import org.apache.gora.persistency.impl.PersistentBase;
 import org.apache.gora.persistency.impl.StateManagerImpl;
+import org.apache.gora.persistency.State;
+import org.apache.gora.persistency.StatefulMap;
 import org.apache.gora.persistency.StatefulHashMap;
 import org.apache.gora.persistency.ListGenericArray;
 
@@ -91,11 +93,11 @@
   private ParseStatus parseStatus;
   private float score;
   private Utf8 reprUrl;
-  private Map<Utf8,Utf8> headers;
-  private Map<Utf8,Utf8> outlinks;
-  private Map<Utf8,Utf8> inlinks;
-  private Map<Utf8,Utf8> markers;
-  private Map<Utf8,ByteBuffer> metadata;
+  private StatefulMap<Utf8,Utf8> headers;
+  private StatefulMap<Utf8,Utf8> outlinks;
+  private StatefulMap<Utf8,Utf8> inlinks;
+  private StatefulMap<Utf8,Utf8> markers;
+  private StatefulMap<Utf8,ByteBuffer> metadata;
   public WebPage() {
     this(new StateManagerImpl());
   }
@@ -111,6 +113,26 @@
     return new WebPage(stateManager);
   }
   public Schema getSchema() { return _SCHEMA; }
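+  /** Marks this page as new and every field and map entry dirty, so the
+   *  complete record is rewritten when it is stored under a new row key. */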
+  public void setAllDirty() {
+    StateManager st = getStateManager();
+    st.setNew(this);
+    for (Field f : Field.values())
+      if (st.isReadable(this, f.getIndex()))
+        st.setDirty(this, f.getIndex());
+    setMapDirty(headers);
+    setMapDirty(outlinks);
+    setMapDirty(inlinks);
+    setMapDirty(markers);
+    setMapDirty(metadata);
+  }
+  private static void setMapDirty(StatefulMap<Utf8, ?> map) {
+    // Field-level dirty flags are not enough for stateful maps: each key
+    // must be marked dirty so existing entries are serialized again.
+    for (Utf8 key : map.keySet())
+      map.putState(key, State.DIRTY);
+  }
   public Object get(int _field) {
     switch (_field) {
     case 0: return baseUrl;
@@ -160,11 +182,11 @@
     case 14:parseStatus = (ParseStatus)_value; break;
     case 15:score = (Float)_value; break;
     case 16:reprUrl = (Utf8)_value; break;
-    case 17:headers = (Map<Utf8,Utf8>)_value; break;
-    case 18:outlinks = (Map<Utf8,Utf8>)_value; break;
-    case 19:inlinks = (Map<Utf8,Utf8>)_value; break;
-    case 20:markers = (Map<Utf8,Utf8>)_value; break;
-    case 21:metadata = (Map<Utf8,ByteBuffer>)_value; break;
+    case 17:headers = (StatefulMap<Utf8,Utf8>)_value; break;
+    case 18:outlinks = (StatefulMap<Utf8,Utf8>)_value; break;
+    case 19:inlinks = (StatefulMap<Utf8,Utf8>)_value; break;
+    case 20:markers = (StatefulMap<Utf8,Utf8>)_value; break;
+    case 21:metadata = (StatefulMap<Utf8,ByteBuffer>)_value; break;
     default: throw new AvroRuntimeException("Bad index");
     }
   }
Index: src/java/org/apache/nutch/crawl/DbUpdateReducer.java
===================================================================
--- src/java/org/apache/nutch/crawl/DbUpdateReducer.java	(revision 1408820)
+++ src/java/org/apache/nutch/crawl/DbUpdateReducer.java	(working copy)
@@ -73,7 +73,12 @@
     for (NutchWritable nutchWritable : values) {
       Writable val = nutchWritable.get();
       if (val instanceof WebPageWritable) {
-        page = ((WebPageWritable) val).getWebPage();
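+        // Several versions of a page may arrive under one key, e.g. when a
+        // re-normalized URL collides with an existing row; keep the version
+        // with the most recent fetch time.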
+        WebPage newPage = ((WebPageWritable) val).getWebPage();
+        if (page == null || page.getFetchTime() < newPage.getFetchTime())
+          page = newPage;
       } else {
         inlinkedScoreData.add((ScoreDatum) val);
         if (inlinkedScoreData.size() >= maxLinks) {
Index: src/java/org/apache/nutch/crawl/DbUpdateMapper.java
===================================================================
--- src/java/org/apache/nutch/crawl/DbUpdateMapper.java	(revision 1408820)
+++ src/java/org/apache/nutch/crawl/DbUpdateMapper.java	(working copy)
@@ -17,15 +17,22 @@
 package org.apache.nutch.crawl;
 
 import java.io.IOException;
+import java.net.MalformedURLException;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Map;
+import java.util.HashMap;
 import java.util.Map.Entry;
 
 import org.apache.avro.util.Utf8;
 import org.slf4j.Logger;
+import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.io.FloatWritable;
 import org.apache.hadoop.util.StringUtils;
+import org.apache.nutch.net.URLFilterException;
+import org.apache.nutch.net.URLFilters;
+import org.apache.nutch.net.URLNormalizers;
 import org.apache.nutch.scoring.ScoreDatum;
 import org.apache.nutch.scoring.ScoringFilterException;
 import org.apache.nutch.scoring.ScoringFilters;
+import org.apache.nutch.storage.StorageUtils;
@@ -38,6 +45,11 @@
 extends GoraMapper<String, WebPage, UrlWithScore, NutchWritable> {
   public static final Logger LOG = DbUpdaterJob.LOG;
 
+  private URLFilters filters;
+  private URLNormalizers normalizers;
+  private boolean filter;
+  private boolean normalize;
+
   private ScoringFilters scoringFilters;
 
   private final List<ScoreDatum> scoreData = new ArrayList<ScoreDatum>();
@@ -47,22 +59,76 @@
   private NutchWritable nutchWritable = new NutchWritable();
   private WebPageWritable pageWritable;
 
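+  /**
+   * Normalizes and filters the given url, returning the result, or null if
+   * the url is malformed or rejected by a URL filter.
+   */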
+  private String filterUrl(String url) {
+    try {
+      if (normalize) {
+        url = normalizers.normalize(url, URLNormalizers.SCOPE_CRAWLDB);
+      }
+      if (filter) {
+        url = filters.filter(url);
+      }
+    } catch (URLFilterException e) {
+      if (LOG.isWarnEnabled()) {
+        LOG.warn("Couldn't filter url: " + url + " (" + e.getMessage() + ")");
+      }
+      return null;
+    } catch (MalformedURLException e) {
+      if (LOG.isWarnEnabled()) {
+        LOG.warn("Couldn't filter url: " + url + " (" + e.getMessage() + ")");
+      }
+      return null;
+    }
+    return url;
+  }
+
   @Override
   public void map(String key, WebPage page, Context context)
   throws IOException, InterruptedException {
 
-    String url = TableUtil.unreverseUrl(key);
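+    // Re-apply filters/normalizers to the row's own URL: drop the row if the
+    // URL is now rejected, and move the row to a new key (rewriting the full
+    // record) if its normalized form changed.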
+    String origUrl = TableUtil.unreverseUrl(key);
+    String url = filterUrl(origUrl);
+    if (url == null) {
+      DbUpdaterJob.store.delete(key);
+      return;
+    }
+    if (!url.equals(origUrl)) {
+      DbUpdaterJob.store.delete(key);
+      key = TableUtil.reverseUrl(url);
+      page.setAllDirty();
+    }
 
     scoreData.clear();
     Map<Utf8, Utf8> outlinks = page.getOutlinks();
     if (outlinks != null) {
+      Map<Utf8, Utf8> newNormalizations = new HashMap<Utf8, Utf8>();
-      for (Entry<Utf8, Utf8> e : outlinks.entrySet()) {
+      // Iterate over a copy of the outlinks: removeFromOutlinks() below
+      // structurally modifies the map, which would otherwise throw a
+      // ConcurrentModificationException from the entry iterator.
+      for (Entry<Utf8, Utf8> e : new HashMap<Utf8, Utf8>(outlinks).entrySet()) {
-                int depth=Integer.MAX_VALUE;
-        Utf8 depthUtf8=page.getFromMarkers(DbUpdaterJob.DISTANCE);
-        if (depthUtf8 != null) depth=Integer.parseInt(depthUtf8.toString());
-        scoreData.add(new ScoreDatum(0.0f, e.getKey().toString(), 
-            e.getValue().toString(), depth));
+        String to = filterUrl(e.getKey().toString());
+        if (to != null) {
+          int depth = Integer.MAX_VALUE;
+          Utf8 depthUtf8 = page.getFromMarkers(DbUpdaterJob.DISTANCE);
+          if (depthUtf8 != null) depth = Integer.parseInt(depthUtf8.toString());
+          scoreData.add(new ScoreDatum(0.0f, to, e.getValue().toString(), depth));
+        }
+        if (to == null || !to.equals(e.getKey().toString())) {
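+          // Remove the outlink under its old form; if it was merely
+          // re-normalized (not rejected), it is re-added after the loop.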
+          page.removeFromOutlinks(e.getKey());
+          if (to != null)
+            newNormalizations.put(new Utf8(to), e.getValue());
+        }
       }
+      for (Entry<Utf8, Utf8> e : newNormalizations.entrySet())
+        page.putToOutlinks(e.getKey(), e.getValue());
     }
 
     // TODO: Outlink filtering (i.e. "only keep the first n outlinks")
@@ -91,6 +157,28 @@
 
   @Override
   public void setup(Context context) {
+    Configuration conf = context.getConfiguration();
+    filter = conf.getBoolean(DbUpdaterJob.DBUPDATER_FILTER, true);
+    normalize = conf.getBoolean(DbUpdaterJob.DBUPDATER_NORMALIZE, true);
+    if (filter) {
+      filters = new URLFilters(conf);
+    }
+    if (normalize) {
+      normalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_CRAWLDB);
+    }
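+    if (filter || normalize) {
+      // DbUpdaterJob.run() creates the store in the driver JVM, which the
+      // mapper only shares in local mode; in distributed mode each map task
+      // runs in its own JVM, so create the store here when it is missing.
+      if (DbUpdaterJob.store == null) {
+        try {
+          DbUpdaterJob.store = StorageUtils.createWebStore(conf,
+              String.class, WebPage.class);
+        } catch (Exception e) {
+          throw new RuntimeException(e);
+        }
+      }
+    }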
     scoringFilters = new ScoringFilters(context.getConfiguration());
     pageWritable = new WebPageWritable(context.getConfiguration(), null);
   }
Index: src/java/org/apache/nutch/crawl/DbUpdaterJob.java
===================================================================
--- src/java/org/apache/nutch/crawl/DbUpdaterJob.java	(revision 1408820)
+++ src/java/org/apache/nutch/crawl/DbUpdaterJob.java	(working copy)
@@ -17,6 +17,7 @@
 package org.apache.nutch.crawl;
 
 import java.util.Collection;
+import java.util.EnumSet;
 import java.util.HashSet;
 import java.util.Map;
 
@@ -35,12 +36,18 @@
 import org.apache.nutch.util.NutchJob;
 import org.apache.nutch.util.NutchTool;
 import org.apache.nutch.util.ToolUtil;
+import org.apache.gora.store.DataStore;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 public class DbUpdaterJob extends NutchTool implements Tool {
 
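+  /** Config keys enabling URL filtering and normalization during the db
+   *  update; the mapper treats both as true when unset. */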
+  public static final String DBUPDATER_FILTER = "crawldb.url.filters";
+  public static final String DBUPDATER_NORMALIZE = "crawldb.url.normalize";
   public static final Logger LOG = LoggerFactory.getLogger(DbUpdaterJob.class);
+  public static DataStore<String, WebPage> store;
 
 
   private static final Collection<WebPage.Field> FIELDS =
@@ -73,16 +80,32 @@
     
   public Map<String,Object> run(Map<String,Object> args) throws Exception {
     String crawlId = (String)args.get(Nutch.ARG_CRAWL);
+    Boolean filter = (Boolean)args.get(Nutch.ARG_FILTER);
+    Boolean normalize = (Boolean)args.get(Nutch.ARG_NORMALIZE);
     numJobs = 1;
     currentJobNum = 0;
     currentJob = new NutchJob(getConf(), "update-table");
     if (crawlId != null) {
       currentJob.getConfiguration().set(Nutch.CRAWL_ID_KEY, crawlId);
     }
+    if (filter != null)
+      currentJob.getConfiguration().setBoolean(DBUPDATER_FILTER, filter);
+    if (normalize != null)
+      currentJob.getConfiguration().setBoolean(DBUPDATER_NORMALIZE, normalize);
+    // Both options default to true in the mapper, so the store it uses to
+    // delete filtered and re-keyed rows is needed unless both are disabled.
+    boolean filterOrNormalize = currentJob.getConfiguration().getBoolean(DBUPDATER_FILTER, true)
+        || currentJob.getConfiguration().getBoolean(DBUPDATER_NORMALIZE, true);
+    if (filterOrNormalize)
+      store = StorageUtils.createWebStore(currentJob.getConfiguration(), String.class, WebPage.class);
     //job.setBoolean(ALL, updateAll);
     ScoringFilters scoringFilters = new ScoringFilters(getConf());
     HashSet<WebPage.Field> fields = new HashSet<WebPage.Field>(FIELDS);
     fields.addAll(scoringFilters.getFields());
+    // Re-keyed pages are rewritten in full via WebPage.setAllDirty(), so all
+    // fields must be read whenever filtering or normalization is enabled.
+    if (filterOrNormalize)
+      fields.addAll(EnumSet.allOf(WebPage.Field.class));
     
     // Partition by {url}, sort by {url,score} and group by {url}.
     // This ensures that the inlinks are sorted by score when they enter
@@ -97,26 +120,37 @@
     StorageUtils.initReducerJob(currentJob, DbUpdateReducer.class);
     currentJob.waitForCompletion(true);
     ToolUtil.recordJobStatus(null, currentJob, results);
+    if (store != null)
+      store.close();
     return results;
   }
   
-  private int updateTable(String crawlId) throws Exception {
+  private int updateTable(String crawlId, boolean filter, boolean normalize) throws Exception {
     LOG.info("DbUpdaterJob: starting");
-    run(ToolUtil.toArgMap(Nutch.ARG_CRAWL, crawlId));
+    run(ToolUtil.toArgMap(
+        Nutch.ARG_CRAWL, crawlId,
+        Nutch.ARG_FILTER, filter,
+        Nutch.ARG_NORMALIZE, normalize));
     LOG.info("DbUpdaterJob: done");
     return 0;
   }
 
   public int run(String[] args) throws Exception {
     String crawlId = null;
-    if (args.length == 0) {
-      //
-    } else if (args.length == 2 && "-crawlId".equals(args[0])) {
-      crawlId = args[1];
-    } else {
-      throw new IllegalArgumentException("usage: " + "(-crawlId <id>)");
+    boolean normalize = false;
+    boolean filter = false;
+    for (int i = 0; i < args.length; i++) {
+      if ("-crawlId".equals(args[i])) {
+        crawlId = args[++i];
+      } else if ("-normalize".equals(args[i])) {
+        normalize = true;
+      } else if ("-filter".equals(args[i])) {
+        filter = true;
+      } else {
+        throw new IllegalArgumentException("usage: [-crawlId <id>] [-normalize] [-filter]");
+      }
     }
-    return updateTable(crawlId);
+    return updateTable(crawlId, filter, normalize);
   }
 
   public static void main(String[] args) throws Exception {
