Index: lib/hadoop-0.5.0.jar
===================================================================
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Index: lib/hadoop-0.7.0.jar
===================================================================
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream

Property changes on: lib/hadoop-0.7.0.jar
___________________________________________________________________
Name: svn:mime-type
   + application/octet-stream

Index: src/test/org/apache/nutch/fetcher/TestFetcher.java
===================================================================
--- src/test/org/apache/nutch/fetcher/TestFetcher.java	(revision 449288)
+++ src/test/org/apache/nutch/fetcher/TestFetcher.java	(working copy)
@@ -23,7 +23,7 @@
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.SequenceFile;
-import org.apache.hadoop.io.UTF8;
+import org.apache.hadoop.io.Text;
 import org.apache.nutch.crawl.CrawlDBTestUtil;
 import org.apache.nutch.crawl.Generator;
 import org.apache.nutch.crawl.Injector;
@@ -108,7 +108,7 @@
     
     READ:
       do {
-      UTF8 key=new UTF8();
+      Text key=new Text();
       Content value=new Content();
       if(!reader.next(key, value)) break READ;
       String contentString=new String(value.getContent());
Index: src/test/org/apache/nutch/indexer/TestDeleteDuplicates.java
===================================================================
--- src/test/org/apache/nutch/indexer/TestDeleteDuplicates.java	(revision 0)
+++ src/test/org/apache/nutch/indexer/TestDeleteDuplicates.java	(revision 0)
@@ -0,0 +1,152 @@
+package org.apache.nutch.indexer;
+
+import java.util.Random;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.MD5Hash;
+import org.apache.lucene.document.DateTools;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.DateTools.Resolution;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.FSDirectory;
+import org.apache.nutch.analysis.NutchDocumentAnalyzer;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.util.NutchConfiguration;
+
+import junit.framework.TestCase;
+
+public class TestDeleteDuplicates extends TestCase {
+  Configuration conf;
+  FileSystem fs;
+  Path root;
+  Path index1;
+  Path index2;
+  
+  public void setUp() throws Exception {
+    conf = NutchConfiguration.create();
+    conf.set("fs.default.name", "local");
+    fs = FileSystem.get(conf);
+    root = new Path("dedup2-test-" + new Random().nextInt());
+    // create test indexes
+    index1 = createIndex("index1", true, 1.0f, 10L);
+    index2 = createIndex("index2", false, 2.0f, 20L);
+  }
+  
+  private Path createIndex(String name, boolean hashDup, float inc, long time) throws Exception {
+    Path idx = new Path(root, name);
+    Path sub = new Path(idx, "part-0000");
+    Directory dir = FSDirectory.getDirectory(sub.toString(), true);
+    IndexWriter writer = new IndexWriter(dir, new NutchDocumentAnalyzer(conf), true);
+    Document doc = makeDoc(name,
+        MD5Hash.digest("1").toString(),
+        "http://www.example.com/1",
+        1.0f, time);
+    writer.addDocument(doc);
+    if (hashDup) {
+      doc = makeDoc(name,
+          MD5Hash.digest("1").toString(),
+          "http://www.example.com/2",
+          1.0f + inc, time + 1);
+    } else {
+      doc = makeDoc(name,
+          MD5Hash.digest("2").toString(),
+          "http://www.example.com/1",
+          1.0f + inc, time + 1);
+    }
+    writer.addDocument(doc);
+    writer.close();
+    return idx;
+  }
+  
+  private Document makeDoc(String segment, String digest, String url, float boost, long time) {
+    Document doc = new Document();
+    doc.add(new Field("segment", segment, Field.Store.YES, Field.Index.NO));
+    doc.add(new Field("digest", digest, Field.Store.YES, Field.Index.NO));
+    doc.add(new Field("url", url, Field.Store.YES, Field.Index.TOKENIZED));
+    doc.setBoost(boost);
+    doc.add(new Field("boost", "" + boost, Field.Store.YES, Field.Index.NO));
+    doc.add(new Field("tstamp", DateTools.timeToString(time, Resolution.MILLISECOND), Field.Store.YES, Field.Index.NO));
+    return doc;
+  }
+  
+  public void tearDown() throws Exception {
+    fs.delete(root);
+  }
+
+  public void testHashDuplicates() throws Exception {
+    DeleteDuplicates dedup = new DeleteDuplicates(conf);
+    dedup.dedup(new Path[]{index1});
+    FsDirectory dir = new FsDirectory(fs, new Path(index1, "part-0000"), false, conf);
+    IndexReader reader = IndexReader.open(dir);
+    assertEquals("only one doc left", reader.numDocs(), 1);
+    for (int i = 0; i < reader.maxDoc(); i++) {
+      if (reader.isDeleted(i)) {
+        System.out.println("-doc " + i + " deleted");
+        continue;
+      }
+      Document doc = reader.document(i);
+      // make sure we got the right one
+      assertEquals("check url", "http://www.example.com/2", doc.get("url"));
+      System.out.println(doc);
+    }
+  }
+  
+  public void testUrlDuplicates() throws Exception {
+    DeleteDuplicates dedup = new DeleteDuplicates(conf);
+    dedup.dedup(new Path[]{index2});
+    FsDirectory dir = new FsDirectory(fs, new Path(index2, "part-0000"), false, conf);
+    IndexReader reader = IndexReader.open(dir);
+    assertEquals("only one doc left", reader.numDocs(), 1);
+    MD5Hash hash = MD5Hash.digest("2");
+    for (int i = 0; i < reader.maxDoc(); i++) {
+      if (reader.isDeleted(i)) {
+        System.out.println("-doc " + i + " deleted");
+        continue;
+      }
+      Document doc = reader.document(i);
+      // make sure we got the right one
+      assertEquals("check hash", hash.toString(), doc.get("digest"));
+      System.out.println(doc);
+    }
+  }
+  
+  public void testMixedDuplicates() throws Exception {
+    DeleteDuplicates dedup = new DeleteDuplicates(conf);
+    dedup.dedup(new Path[]{index1, index2});
+    FsDirectory dir = new FsDirectory(fs, new Path(index1, "part-0000"), false, conf);
+    IndexReader reader = IndexReader.open(dir);
+    assertEquals("only one doc left", reader.numDocs(), 1);
+    for (int i = 0; i < reader.maxDoc(); i++) {
+      if (reader.isDeleted(i)) {
+        System.out.println("-doc " + i + " deleted");
+        continue;
+      }
+      Document doc = reader.document(i);
+      // make sure we got the right one
+      assertEquals("check url", "http://www.example.com/2", doc.get("url"));
+      System.out.println(doc);
+    }
+    reader.close();
+    dir = new FsDirectory(fs, new Path(index2, "part-0000"), false, conf);
+    reader = IndexReader.open(dir);
+    assertEquals("only one doc left", reader.numDocs(), 1);
+    MD5Hash hash = MD5Hash.digest("2");
+    for (int i = 0; i < reader.maxDoc(); i++) {
+      if (reader.isDeleted(i)) {
+        System.out.println("-doc " + i + " deleted");
+        continue;
+      }
+      Document doc = reader.document(i);
+      // make sure we got the right one
+      assertEquals("check hash", hash.toString(), doc.get("digest"));
+      System.out.println(doc);
+    }
+    reader.close();
+  }
+  
+}
Index: src/test/org/apache/nutch/crawl/TestCrawlDbMerger.java
===================================================================
--- src/test/org/apache/nutch/crawl/TestCrawlDbMerger.java	(revision 449084)
+++ src/test/org/apache/nutch/crawl/TestCrawlDbMerger.java	(working copy)
@@ -25,7 +25,7 @@
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.MapFile;
-import org.apache.hadoop.io.UTF8;
+import org.apache.hadoop.io.Text;
 import org.apache.nutch.util.NutchConfiguration;
 
 import junit.framework.TestCase;
@@ -61,12 +61,12 @@
     cd1 = new CrawlDatum();
     cd1.setFetchInterval(1.0f);
     cd1.setFetchTime(time);
-    cd1.getMetaData().put(new UTF8("name"), new UTF8("cd1"));
-    cd1.getMetaData().put(new UTF8("cd1"), new UTF8("cd1"));
+    cd1.getMetaData().put(new Text("name"), new Text("cd1"));
+    cd1.getMetaData().put(new Text("cd1"), new Text("cd1"));
     cd2 = new CrawlDatum();
     cd2.setFetchInterval(1.0f);
     cd2.setFetchTime(time + 10000);
-    cd2.getMetaData().put(new UTF8("name"), new UTF8("cd2"));
+    cd2.getMetaData().put(new Text("name"), new Text("cd2"));
     cd3 = new CrawlDatum();
     cd3.setFetchInterval(1.0f);
     cd3.setFetchTime(time + 10000);
@@ -125,11 +125,11 @@
   private void createCrawlDb(FileSystem fs, Path crawldb, TreeSet init, CrawlDatum cd) throws Exception {
     LOG.fine("* creating crawldb: " + crawldb);
     Path dir = new Path(crawldb, CrawlDatum.DB_DIR_NAME);
-    MapFile.Writer writer = new MapFile.Writer(fs, new Path(dir, "part-00000").toString(), UTF8.class, CrawlDatum.class);
+    MapFile.Writer writer = new MapFile.Writer(fs, new Path(dir, "part-00000").toString(), Text.class, CrawlDatum.class);
     Iterator it = init.iterator();
     while (it.hasNext()) {
       String key = (String)it.next();
-      writer.append(new UTF8(key), cd);
+      writer.append(new Text(key), cd);
     }
     writer.close();
   }
Index: src/test/org/apache/nutch/crawl/TestLinkDbMerger.java
===================================================================
--- src/test/org/apache/nutch/crawl/TestLinkDbMerger.java	(revision 449084)
+++ src/test/org/apache/nutch/crawl/TestLinkDbMerger.java	(working copy)
@@ -26,7 +26,7 @@
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.MapFile;
-import org.apache.hadoop.io.UTF8;
+import org.apache.hadoop.io.Text;
 import org.apache.nutch.util.NutchConfiguration;
 
 import junit.framework.TestCase;
@@ -122,7 +122,7 @@
       String url = (String)it.next();
       LOG.fine("url=" + url);
       String[] vals = (String[])expected.get(url);
-      Inlinks inlinks = reader.getInlinks(new UTF8(url));
+      Inlinks inlinks = reader.getInlinks(new Text(url));
       // may not be null
       assertNotNull(inlinks);
       ArrayList links = new ArrayList();
@@ -143,7 +143,7 @@
   private void createLinkDb(FileSystem fs, Path linkdb, TreeMap init) throws Exception {
     LOG.fine("* creating linkdb: " + linkdb);
     Path dir = new Path(linkdb, LinkDb.CURRENT_NAME);
-    MapFile.Writer writer = new MapFile.Writer(fs, new Path(dir, "part-00000").toString(), UTF8.class, Inlinks.class);
+    MapFile.Writer writer = new MapFile.Writer(fs, new Path(dir, "part-00000").toString(), Text.class, Inlinks.class);
     Iterator it = init.keySet().iterator();
     while (it.hasNext()) {
       String key = (String)it.next();
@@ -153,7 +153,7 @@
         Inlink in = new Inlink(vals[i], vals[i]);
         inlinks.add(in);
       }
-      writer.append(new UTF8(key), inlinks);
+      writer.append(new Text(key), inlinks);
     }
     writer.close();
   }
Index: src/test/org/apache/nutch/crawl/TestGenerator.java
===================================================================
--- src/test/org/apache/nutch/crawl/TestGenerator.java	(revision 449084)
+++ src/test/org/apache/nutch/crawl/TestGenerator.java	(working copy)
@@ -24,7 +24,7 @@
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.SequenceFile;
-import org.apache.hadoop.io.UTF8;
+import org.apache.hadoop.io.Text;
 import org.apache.nutch.crawl.CrawlDBTestUtil.URLCrawlDatum;
 
 import junit.framework.TestCase;
@@ -80,7 +80,7 @@
     ArrayList<URLCrawlDatum> list = new ArrayList<URLCrawlDatum>();
     
     for(int i=0;i<=100;i++){
-      list.add(new CrawlDBTestUtil.URLCrawlDatum(new UTF8("http://aaa/" + pad(i)),
+      list.add(new CrawlDBTestUtil.URLCrawlDatum(new Text("http://aaa/" + pad(i)),
         new CrawlDatum(CrawlDatum.STATUS_DB_UNFETCHED, 1, i)));
     }
     
@@ -105,7 +105,7 @@
     
     READ:
       do {
-      UTF8 key=new UTF8();
+      Text key=new Text();
       CrawlDatum value=new CrawlDatum();
       if(!reader.next(key, value)) break READ;
       l.add(new URLCrawlDatum(key, value));
Index: src/test/org/apache/nutch/crawl/TestMapWritable.java
===================================================================
--- src/test/org/apache/nutch/crawl/TestMapWritable.java	(revision 449084)
+++ src/test/org/apache/nutch/crawl/TestMapWritable.java	(working copy)
@@ -25,7 +25,7 @@
 import org.apache.hadoop.io.IntWritable;
 import org.apache.hadoop.io.LongWritable;
 import org.apache.hadoop.io.SequenceFile;
-import org.apache.hadoop.io.UTF8;
+import org.apache.hadoop.io.Text;
 import org.apache.hadoop.io.Writable;
 import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.crawl.MapWritable;
@@ -39,11 +39,11 @@
     MapWritable map = new MapWritable();
     assertTrue(map.isEmpty());
     for (int i = 0; i < 100; i++) {
-      UTF8 key = new UTF8("" + i);
+      Text key = new Text("" + i);
       IntWritable value = new IntWritable(i);
       map.put(key, value);
       assertEquals(i + 1, map.size());
-      assertTrue(map.containsKey(new UTF8("" + i)));
+      assertTrue(map.containsKey(new Text("" + i)));
       assertTrue(map.containsValue(new IntWritable(i)));
       map.remove(key);
       assertEquals(i, map.size());
@@ -64,14 +64,14 @@
     map.clear();
     assertTrue(map.isEmpty());
     assertEquals(0, map.size());
-    assertFalse(map.containsKey(new UTF8("" + 1)));
+    assertFalse(map.containsKey(new Text("" + 1)));
 
   }
 
   public void testWritable() throws Exception {
     MapWritable datum1 = new MapWritable();
     for (int i = 0; i < 100; i++) {
-      datum1.put(new LongWritable(i), new UTF8("" + 1));
+      datum1.put(new LongWritable(i), new Text("" + 1));
     }
     assertEquals(100, datum1.size());
     testWritable(datum1);
@@ -86,7 +86,7 @@
     CrawlDatum c = new CrawlDatum(CrawlDatum.STATUS_DB_FETCHED, 1f);
     c.setMetaData(new MapWritable());
     for (int i = 0; i < 100; i++) {
-      c.getMetaData().put(new LongWritable(i), new UTF8("" + 1));
+      c.getMetaData().put(new LongWritable(i), new Text("" + 1));
     }
     testWritable(c);
   }
@@ -94,10 +94,10 @@
   public void testEquals() {
     MapWritable map1 = new MapWritable();
     MapWritable map2 = new MapWritable();
-    map1.put(new UTF8("key1"), new UTF8("val1"));
-    map1.put(new UTF8("key2"), new UTF8("val2"));
-    map2.put(new UTF8("key2"), new UTF8("val2"));
-    map2.put(new UTF8("key1"), new UTF8("val1"));
+    map1.put(new Text("key1"), new Text("val1"));
+    map1.put(new Text("key2"), new Text("val2"));
+    map2.put(new Text("key2"), new Text("val2"));
+    map2.put(new Text("key1"), new Text("val1"));
     assertTrue(map1.equals(map2));
   }
 
@@ -137,13 +137,13 @@
     System.out.println("needed time for reading map's: " + needed);
     fs.delete(file);
 
-    // UTF8
+    // Text
     System.out.println("start writing utf8's");
-    writer = new SequenceFile.Writer(fs, file, IntWritable.class, UTF8.class);
+    writer = new SequenceFile.Writer(fs, file, IntWritable.class, Text.class);
     // write map
     start = System.currentTimeMillis();
     key = new IntWritable();
-    UTF8 value = new UTF8();
+    Text value = new Text();
     String s = "15726:15726";
     for (int i = 0; i < 1000000; i++) {
       key.set(i);
@@ -181,9 +181,9 @@
   }
 
   public void testRecycling() throws Exception {
-    UTF8 value = new UTF8("value");
-    UTF8 key1 = new UTF8("a");
-    UTF8 key2 = new UTF8("b");
+    Text value = new Text("value");
+    Text key1 = new Text("a");
+    Text key2 = new Text("b");
 
     MapWritable writable = new MapWritable();
     writable.put(key1, value);
Index: src/test/org/apache/nutch/crawl/CrawlDBTestUtil.java
===================================================================
--- src/test/org/apache/nutch/crawl/CrawlDBTestUtil.java	(revision 449084)
+++ src/test/org/apache/nutch/crawl/CrawlDBTestUtil.java	(working copy)
@@ -28,7 +28,7 @@
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.MapFile;
-import org.apache.hadoop.io.UTF8;
+import org.apache.hadoop.io.Text;
 import org.mortbay.http.HttpContext;
 import org.mortbay.http.SocketListener;
 import org.mortbay.http.handler.ResourceHandler;
@@ -54,12 +54,12 @@
     LOG.trace("* creating crawldb: " + crawldb);
     Path dir = new Path(crawldb, CrawlDatum.DB_DIR_NAME);
     MapFile.Writer writer = new MapFile.Writer(fs, new Path(dir, "part-00000")
-        .toString(), UTF8.class, CrawlDatum.class);
+        .toString(), Text.class, CrawlDatum.class);
     Iterator<URLCrawlDatum> it = init.iterator();
     while (it.hasNext()) {
       URLCrawlDatum row = it.next();
       LOG.info("adding:" + row.url.toString());
-      writer.append(new UTF8(row.url), row.datum);
+      writer.append(new Text(row.url), row.datum);
     }
     writer.close();
   }
@@ -92,11 +92,11 @@
 
   public static class URLCrawlDatum {
 
-    UTF8 url;
+    Text url;
 
     CrawlDatum datum;
 
-    public URLCrawlDatum(UTF8 url, CrawlDatum datum) {
+    public URLCrawlDatum(Text url, CrawlDatum datum) {
       this.url = url;
       this.datum = datum;
     }
Index: src/test/org/apache/nutch/crawl/TestInjector.java
===================================================================
--- src/test/org/apache/nutch/crawl/TestInjector.java	(revision 449088)
+++ src/test/org/apache/nutch/crawl/TestInjector.java	(working copy)
@@ -24,7 +24,7 @@
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.SequenceFile;
-import org.apache.hadoop.io.UTF8;
+import org.apache.hadoop.io.Text;
 
 import junit.framework.TestCase;
 
@@ -111,7 +111,7 @@
     
     READ:
       do {
-      UTF8 key=new UTF8();
+      Text key=new Text();
       CrawlDatum value=new CrawlDatum();
       if(!reader.next(key, value)) break READ;
       read.add(key.toString());
Index: src/java/org/apache/nutch/fetcher/Fetcher.java
===================================================================
--- src/java/org/apache/nutch/fetcher/Fetcher.java	(revision 449293)
+++ src/java/org/apache/nutch/fetcher/Fetcher.java	(working copy)
@@ -101,7 +101,7 @@
       synchronized (Fetcher.this) {activeThreads++;} // count threads
       
       try {
-        UTF8 key = new UTF8();
+        Text key = new Text();
         CrawlDatum datum = new CrawlDatum();
         
         while (true) {
@@ -128,7 +128,7 @@
           }
 
           // url may be changed through redirects.
-          UTF8 url = new UTF8();
+          Text url = new Text();
           url.set(key);
           try {
             if (LOG.isInfoEnabled()) { LOG.info("fetching " + url); }
@@ -158,7 +158,7 @@
                   newUrl = normalizers.normalize(newUrl, URLNormalizers.SCOPE_FETCHER);
                   newUrl = this.urlFilters.filter(newUrl);
                   if (newUrl != null && !newUrl.equals(url.toString())) {
-                    url = new UTF8(newUrl);
+                    url = new Text(newUrl);
                     redirecting = true;
                     redirectCount++;
                     if (LOG.isDebugEnabled()) {
@@ -177,7 +177,7 @@
                 newUrl = normalizers.normalize(newUrl, URLNormalizers.SCOPE_FETCHER);
                 newUrl = this.urlFilters.filter(newUrl);
                 if (newUrl != null && !newUrl.equals(url.toString())) {
-                  url = new UTF8(newUrl);
+                  url = new Text(newUrl);
                   redirecting = true;
                   redirectCount++;
                   if (LOG.isDebugEnabled()) {
@@ -245,7 +245,7 @@
       }
     }
 
-    private void logError(UTF8 url, String message) {
+    private void logError(Text url, String message) {
       if (LOG.isInfoEnabled()) {
         LOG.info("fetch of " + url + " failed with: " + message);
       }
@@ -254,7 +254,7 @@
       }
     }
 
-    private ParseStatus output(UTF8 key, CrawlDatum datum,
+    private ParseStatus output(Text key, CrawlDatum datum,
                         Content content, int status) {
 
       datum.setStatus(status);
@@ -435,14 +435,14 @@
 
     job.setInputPath(new Path(segment, CrawlDatum.GENERATE_DIR_NAME));
     job.setInputFormat(InputFormat.class);
-    job.setInputKeyClass(UTF8.class);
+    job.setInputKeyClass(Text.class);
     job.setInputValueClass(CrawlDatum.class);
 
     job.setMapRunnerClass(Fetcher.class);
 
     job.setOutputPath(segment);
     job.setOutputFormat(FetcherOutputFormat.class);
-    job.setOutputKeyClass(UTF8.class);
+    job.setOutputKeyClass(Text.class);
     job.setOutputValueClass(FetcherOutput.class);
 
     JobClient.runJob(job);
Index: src/java/org/apache/nutch/fetcher/FetcherOutputFormat.java
===================================================================
--- src/java/org/apache/nutch/fetcher/FetcherOutputFormat.java	(revision 449084)
+++ src/java/org/apache/nutch/fetcher/FetcherOutputFormat.java	(working copy)
@@ -25,7 +25,7 @@
 import org.apache.hadoop.io.MapFile;
 import org.apache.hadoop.io.WritableComparable;
 import org.apache.hadoop.io.Writable;
-import org.apache.hadoop.io.UTF8;
+import org.apache.hadoop.io.Text;
 
 import org.apache.hadoop.mapred.OutputFormat;
 import org.apache.hadoop.mapred.RecordWriter;
@@ -55,7 +55,7 @@
       new Path(new Path(job.getOutputPath(), Content.DIR_NAME), name);
 
     final MapFile.Writer fetchOut =
-      new MapFile.Writer(fs, fetch.toString(), UTF8.class, CrawlDatum.class);
+      new MapFile.Writer(fs, fetch.toString(), Text.class, CrawlDatum.class);
     
     return new RecordWriter() {
         private MapFile.Writer contentOut;
@@ -64,7 +64,7 @@
         {
           if (Fetcher.isStoringContent(job)) {
             contentOut = new MapFile.Writer(fs, content.toString(),
-                                            UTF8.class, Content.class);
+                                            Text.class, Content.class);
           }
 
           if (Fetcher.isParsing(job)) {
Index: src/java/org/apache/nutch/metadata/Metadata.java
===================================================================
--- src/java/org/apache/nutch/metadata/Metadata.java	(revision 449084)
+++ src/java/org/apache/nutch/metadata/Metadata.java	(working copy)
@@ -34,7 +34,7 @@
 import org.apache.commons.lang.StringUtils;
 
 // Hadoop imports
-import org.apache.hadoop.io.UTF8;
+import org.apache.hadoop.io.Text;
 import org.apache.hadoop.io.Writable;
 
 
@@ -305,11 +305,11 @@
     String[] values = null;
     String[] names = names();
     for (int i=0; i<names.length; i++) {
-      UTF8.writeString(out, names[i]);
+      Text.writeString(out, names[i]);
       values = getValues(names[i]);
       out.writeInt(values.length);
       for (int j=0; j<values.length; j++) {
-        UTF8.writeString(out, values[j]);
+        Text.writeString(out, values[j]);
       }
     }
   }
@@ -319,10 +319,10 @@
     int keySize = in.readInt();
     String key;
     for (int i=0; i<keySize; i++) {
-      key = UTF8.readString(in);
+      key = Text.readString(in);
       int valueSize = in.readInt();
       for (int j=0; j<valueSize; j++) {
-        add(key, UTF8.readString(in));
+        add(key, Text.readString(in));
       }
     }
   }
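
Note: UTF8.writeString and Text.writeString are not wire-compatible (roughly, a 2-byte length prefix with Java's modified UTF-8 versus a vint length prefix with standard UTF-8), so metadata and crawldb entries written by the old code cannot be read back unchanged after this patch; that is what the CrawlDbConverter tool added below is for. A minimal sketch of the difference, assuming a local Hadoop classpath (not part of this patch):

import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.UTF8;

public class WriteStringFormats {
  public static void main(String[] args) throws Exception {
    String s = "http://www.example.com/";
    DataOutputBuffer oldOut = new DataOutputBuffer();
    UTF8.writeString(oldOut, s);   // old serialization used before this patch
    DataOutputBuffer newOut = new DataOutputBuffer();
    Text.writeString(newOut, s);   // new serialization used after this patch
    // the two buffers differ in length prefix and encoding, so they are not interchangeable
    System.out.println("UTF8.writeString wrote " + oldOut.getLength() + " bytes");
    System.out.println("Text.writeString wrote " + newOut.getLength() + " bytes");
  }
}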
Index: src/java/org/apache/nutch/searcher/Hits.java
===================================================================
--- src/java/org/apache/nutch/searcher/Hits.java	(revision 449084)
+++ src/java/org/apache/nutch/searcher/Hits.java	(working copy)
@@ -22,7 +22,7 @@
 
 import org.apache.hadoop.io.Writable;
 import org.apache.hadoop.io.WritableComparable;
-import org.apache.hadoop.io.UTF8;
+import org.apache.hadoop.io.Text;
 
 /** A set of hits matching a query. */
 public final class Hits implements Writable {
@@ -69,13 +69,13 @@
     out.writeLong(total);                         // write total hits
     out.writeInt(top.length);                     // write hits returned
     if (top.length > 0)                           // write sort value class
-      UTF8.writeString(out, top[0].getSortValue().getClass().getName());
+      Text.writeString(out, top[0].getSortValue().getClass().getName());
                       
     for (int i = 0; i < top.length; i++) {
       Hit h = top[i];
       out.writeInt(h.getIndexDocNo());            // write indexDocNo
       h.getSortValue().write(out);                // write sortValue
-      UTF8.writeString(out, h.getDedupValue());   // write dedupValue
+      Text.writeString(out, h.getDedupValue());   // write dedupValue
     }
   }
 
@@ -85,7 +85,7 @@
     Class sortClass = null;
     if (top.length > 0) {                         // read sort value class
       try {
-        sortClass = Class.forName(UTF8.readString(in));
+        sortClass = Class.forName(Text.readString(in));
       } catch (ClassNotFoundException e) {
         throw new IOException(e.toString());
       }
@@ -102,7 +102,7 @@
       }
       sortValue.readFields(in);                   // read sortValue
 
-      String dedupValue = UTF8.readString(in);    // read dedupValue
+      String dedupValue = Text.readString(in);    // read dedupValue
 
       top[i] = new Hit(indexDocNo, sortValue, dedupValue);
     }
Index: src/java/org/apache/nutch/searcher/FetchedSegments.java
===================================================================
--- src/java/org/apache/nutch/searcher/FetchedSegments.java	(revision 449084)
+++ src/java/org/apache/nutch/searcher/FetchedSegments.java	(working copy)
@@ -53,7 +53,7 @@
       this.conf = conf;
     }
 
-    public CrawlDatum getCrawlDatum(UTF8 url) throws IOException {
+    public CrawlDatum getCrawlDatum(Text url) throws IOException {
       synchronized (this) {
         if (crawl == null)
           crawl = getReaders(CrawlDatum.FETCH_DIR_NAME);
@@ -61,7 +61,7 @@
       return (CrawlDatum)getEntry(crawl, url, new CrawlDatum());
     }
     
-    public byte[] getContent(UTF8 url) throws IOException {
+    public byte[] getContent(Text url) throws IOException {
       synchronized (this) {
         if (content == null)
           content = getReaders(Content.DIR_NAME);
@@ -69,7 +69,7 @@
       return ((Content)getEntry(content, url, new Content())).getContent();
     }
 
-    public ParseData getParseData(UTF8 url) throws IOException {
+    public ParseData getParseData(Text url) throws IOException {
       synchronized (this) {
         if (parseData == null)
           parseData = getReaders(ParseData.DIR_NAME);
@@ -77,7 +77,7 @@
       return (ParseData)getEntry(parseData, url, new ParseData());
     }
 
-    public ParseText getParseText(UTF8 url) throws IOException {
+    public ParseText getParseText(Text url) throws IOException {
       synchronized (this) {
         if (parseText == null)
           parseText = getReaders(ParseText.DIR_NAME);
@@ -89,7 +89,7 @@
       return MapFileOutputFormat.getReaders(fs, new Path(segmentDir, subDir), this.conf);
     }
 
-    private Writable getEntry(MapFile.Reader[] readers, UTF8 url,
+    private Writable getEntry(MapFile.Reader[] readers, Text url,
                               Writable entry) throws IOException {
       return MapFileOutputFormat.getEntry(readers, PARTITIONER, url, entry);
     }
@@ -212,8 +212,8 @@
     return (Segment)segments.get(details.getValue("segment"));
   }
 
-  private UTF8 getUrl(HitDetails details) {
-    return new UTF8(details.getValue("url"));
+  private Text getUrl(HitDetails details) {
+    return new Text(details.getValue("url"));
   }
 
   public void close() throws IOException {
Index: src/java/org/apache/nutch/searcher/IndexSearcher.java
===================================================================
--- src/java/org/apache/nutch/searcher/IndexSearcher.java	(revision 449084)
+++ src/java/org/apache/nutch/searcher/IndexSearcher.java	(working copy)
@@ -152,7 +152,7 @@
         } else if (raw instanceof Float) {
           sortValue = new FloatWritable(((Float)raw).floatValue());
         } else if (raw instanceof String) {
-          sortValue = new UTF8((String)raw);
+          sortValue = new Text((String)raw);
         } else {
           throw new RuntimeException("Unknown sort value type!");
         }
Index: src/java/org/apache/nutch/searcher/LinkDbInlinks.java
===================================================================
--- src/java/org/apache/nutch/searcher/LinkDbInlinks.java	(revision 449084)
+++ src/java/org/apache/nutch/searcher/LinkDbInlinks.java	(working copy)
@@ -12,7 +12,7 @@
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.io.UTF8;
+import org.apache.hadoop.io.Text;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.Path;
 
@@ -30,11 +30,11 @@
   }
 
   public String[] getAnchors(HitDetails details) throws IOException {
-    return linkdb.getAnchors(new UTF8(details.getValue("url")));
+    return linkdb.getAnchors(new Text(details.getValue("url")));
   }
 
   public Inlinks getInlinks(HitDetails details) throws IOException {
-    return linkdb.getInlinks(new UTF8(details.getValue("url")));
+    return linkdb.getInlinks(new Text(details.getValue("url")));
   }
 
   public void close() throws IOException {
Index: src/java/org/apache/nutch/searcher/Summary.java
===================================================================
--- src/java/org/apache/nutch/searcher/Summary.java	(revision 449084)
+++ src/java/org/apache/nutch/searcher/Summary.java	(working copy)
@@ -23,7 +23,7 @@
 import java.util.ArrayList;
 
 // Hadoop imports
-import org.apache.hadoop.io.UTF8;
+import org.apache.hadoop.io.Text;
 import org.apache.hadoop.io.Writable;
 
 // Nutch imports
@@ -187,12 +187,12 @@
       fragment = (Fragment) fragments.get(i);
       if (fragment.isHighlight()) {
         out.writeByte(HIGHLIGHT);
-        UTF8.writeString(out, fragment.getText());
+        Text.writeString(out, fragment.getText());
       } else if (fragment.isEllipsis()) {
         out.writeByte(ELLIPSIS);
       } else {
         out.writeByte(FRAGMENT);
-        UTF8.writeString(out, fragment.getText());
+        Text.writeString(out, fragment.getText());
       }
     }
   }
@@ -204,11 +204,11 @@
     for (int i=0; i<nbFragments; i++) {
       int type = in.readByte();
       if (type == HIGHLIGHT) {
-        fragment = new Highlight(UTF8.readString(in));
+        fragment = new Highlight(Text.readString(in));
       } else if (type == ELLIPSIS) {
         fragment = new Ellipsis();
       } else {
-        fragment = new Fragment(UTF8.readString(in));
+        fragment = new Fragment(Text.readString(in));
       }
       fragments.add(fragment);
     }
Index: src/java/org/apache/nutch/indexer/DeleteDuplicates.java
===================================================================
--- src/java/org/apache/nutch/indexer/DeleteDuplicates.java	(revision 449084)
+++ src/java/org/apache/nutch/indexer/DeleteDuplicates.java	(working copy)
@@ -17,6 +17,7 @@
 package org.apache.nutch.indexer;
 
 import java.io.*;
+import java.text.SimpleDateFormat;
 import java.util.*;
 
 import org.apache.commons.logging.Log;
@@ -34,102 +35,111 @@
 import org.apache.nutch.util.ToolBase;
 
 import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.document.DateTools;
 import org.apache.lucene.document.Document;
 
-/******************************************************************
- * Deletes duplicate documents in a set of Lucene indexes.
+/**
+ * Delete duplicate documents in a set of Lucene indexes.
  * Duplicates have either the same contents (via MD5 hash) or the same URL.
- ******************************************************************/
+ * 
+ * This tool uses the following algorithm:
+ * 
+ * <ul>
+ * <li><b>Phase 1 - remove URL duplicates:</b><br/>
+ * In this phase documents with the same URL
+ * are compared, and only the most recent document is retained -
+ * all other URL duplicates are scheduled for deletion.</li>
+ * <li><b>Phase 2 - remove content duplicates:</b><br/>
+ * In this phase documents with the same content hash are compared. If
+ * property "dedup.keep.highest.score" is set to true (default) then only
+ * the document with the highest score is retained. If this property is set
+ * to false, only the document with the shortest URL is retained - all other
+ * content duplicates are scheduled for deletion.</li>
+ * <li><b>Phase 3 - delete documents:</b><br/>
+ * In this phase documents scheduled for deletion are deleted from
+ * Lucene index(es).</li>
+ * </ul>
+ * 
+ * @author Andrzej Bialecki
+ */
 public class DeleteDuplicates extends ToolBase
   implements Mapper, Reducer, OutputFormat {
   private static final Log LOG = LogFactory.getLog(DeleteDuplicates.class);
 
 //   Algorithm:
 //      
-//   1. map indexes -> <<md5, score, urlLen>, <index,doc>>
+//   1. map indexes -> <url, <md5, url, time, urlLen, index,doc>>
+//      reduce, deleting all but most recent
+//
+//   2. map phase 1 output -> <md5, <md5, url, time, urlLen, index,doc>>
 //      partition by md5
-//      reduce, deleting all but largest score w/ shortest url
-//
-//   2. map indexes -> <<url, fetchdate>, <index,doc>>
-//      partition by url
-//      reduce, deleting all but most recent.
-//
-//   Part 2 is not yet implemented, but the Indexer currently only indexes one
-//   URL per page, so this is not a critical problem.
+//      reduce, deleting all but the one with the highest score (or the shortest url).
 
   public static class IndexDoc implements WritableComparable {
-    private UTF8 index;                           // the segment index
+    private Text url = new Text();
+    private int urlLen;
+    private float score;
+    private long time;
+    private MD5Hash hash = new MD5Hash();
+    private Text index = new Text();              // the segment index
     private int doc;                              // within the index
+    private boolean keep = true;                  // keep or discard
 
+    public String toString() {
+      return "[url=" + url + ",score=" + score + ",time=" + time
+        + ",hash=" + hash + ",index=" + index + ",doc=" + doc
+        + ",keep=" + keep + "]";
+    }
+    
     public void write(DataOutput out) throws IOException {
+      url.write(out);
+      out.writeFloat(score);
+      out.writeLong(time);
+      hash.write(out);
       index.write(out);
       out.writeInt(doc);
+      out.writeBoolean(keep);
     }
 
     public void readFields(DataInput in) throws IOException {
-      if (index == null) {
-        index = new UTF8();
-      }
+      url.readFields(in);
+      urlLen = url.getLength();
+      score = in.readFloat();
+      time = in.readLong();
+      hash.readFields(in);
       index.readFields(in);
-      this.doc = in.readInt();
+      doc = in.readInt();
+      keep = in.readBoolean();
     }
 
     public int compareTo(Object o) {
       IndexDoc that = (IndexDoc)o;
-      int indexCompare = this.index.compareTo(that.index);
-      if (indexCompare != 0) {                    // prefer later indexes
-        return indexCompare;
+      if (this.keep != that.keep) {
+        return this.keep ? 1 : -1; 
+      } else if (!this.hash.equals(that.hash)) {       // order first by hash
+        return this.hash.compareTo(that.hash);
+      } else if (this.time != that.time) {      // prefer more recent docs
+        return this.time > that.time ? 1 : -1 ;
+      } else if (this.urlLen != that.urlLen) {  // prefer shorter urls
+        return this.urlLen - that.urlLen;
       } else {
-        return this.doc - that.doc;               // prefer later docs
+        return this.score > that.score ? 1 : -1;
       }
     }
 
     public boolean equals(Object o) {
       IndexDoc that = (IndexDoc)o;
-      return this.index.equals(that.index) && this.doc == that.doc;
+      return this.keep == that.keep
+        && this.hash.equals(that.hash)
+        && this.time == that.time
+        && this.score == that.score
+        && this.urlLen == that.urlLen
+        && this.index.equals(that.index) 
+        && this.doc == that.doc;
     }
 
   }
 
-  public static class HashScore implements WritableComparable {
-    private MD5Hash hash;
-    private float score;
-    private int urlLen;
-
-    public void write(DataOutput out) throws IOException {
-      hash.write(out);
-      out.writeFloat(score);
-      out.writeInt(urlLen);
-    }
-
-    public void readFields(DataInput in) throws IOException {
-      if (hash == null) {
-        hash = new MD5Hash();
-      }
-      hash.readFields(in);
-      score = in.readFloat();
-      urlLen = in.readInt();
-    }
-
-    public int compareTo(Object o) {
-      HashScore that = (HashScore)o;
-      if (!this.hash.equals(that.hash)) {         // order first by hash
-        return this.hash.compareTo(that.hash);
-      } else if (this.score != that.score) {      // prefer larger scores
-        return this.score < that.score ? 1 : -1 ;
-      } else {                                    // prefer shorter urls
-        return this.urlLen - that.urlLen;
-      }
-    }
-
-    public boolean equals(Object o) {
-      HashScore that = (HashScore)o;
-      return this.hash.equals(that.hash)
-        && this.score == that.score
-        && this.urlLen == that.urlLen;
-    }
-  }
-
   public static class InputFormat extends InputFormatBase {
     private static final long INDEX_LENGTH = Integer.MAX_VALUE;
 
@@ -145,94 +155,177 @@
       return splits;
     }
 
-    /** Return each index as a split. */
-    public RecordReader getRecordReader(final FileSystem fs,
-                                        final FileSplit split,
-                                        final JobConf job,
-                                        Reporter reporter) throws IOException {
-      final UTF8 index = new UTF8(split.getPath().toString());
-      reporter.setStatus(index.toString());
-      return new RecordReader() {
+    public class DDRecordReader implements RecordReader {
 
-          private IndexReader indexReader =
-            IndexReader.open(new FsDirectory(fs, split.getPath(), false, job));
+      private IndexReader indexReader;
+      private int maxDoc;
+      private int doc;
+      private Text index;
+      
+      public DDRecordReader(FileSystem fs, FileSplit split, JobConf job,
+          Text index) throws IOException {
+        indexReader = IndexReader.open(new FsDirectory(fs, split.getPath(), false, job));
+        indexReader.undeleteAll();
+        maxDoc = indexReader.maxDoc();
+        this.index = index;
+      }
 
-          { indexReader.undeleteAll(); }
+      public boolean next(Writable key, Writable value)
+        throws IOException {
 
-          private final int maxDoc = indexReader.maxDoc();
-          private int doc;
+        if (doc >= maxDoc)
+          return false;
 
-          public boolean next(Writable key, Writable value)
-            throws IOException {
+        Document document = indexReader.document(doc);
 
-            if (doc >= maxDoc)
-              return false;
+        // fill in key
+        ((Text)key).set(document.get("url"));
+        // fill in value
+        IndexDoc indexDoc = (IndexDoc)value;
+        indexDoc.keep = true;
+        indexDoc.url.set(document.get("url"));
+        indexDoc.hash.setDigest(document.get("digest"));
+        indexDoc.score = Float.parseFloat(document.get("boost"));
+        try {
+          indexDoc.time = DateTools.stringToTime(document.get("tstamp"));
+        } catch (Exception e) {
+          // try to figure out the time from segment name
+          try {
+            String segname = document.get("segment");
+            indexDoc.time = new SimpleDateFormat("yyyyMMddHHmmss").parse(segname).getTime();
+            // make it unique
+            indexDoc.time += doc;
+          } catch (Exception e1) {
+            // use current time
+            indexDoc.time = System.currentTimeMillis();
+          }
+        }
+        indexDoc.index = index;
+        indexDoc.doc = doc;
 
-            Document document = indexReader.document(doc);
+        doc++;
 
-            // fill in key
-            if (key instanceof UTF8) {
-              ((UTF8)key).set(document.get("url"));
-            } else {
-              HashScore hashScore = (HashScore)key;
-              if (hashScore.hash == null) {
-                hashScore.hash = new MD5Hash();
-              }
-              hashScore.hash.setDigest(document.get("digest"));
-              hashScore.score = Float.parseFloat(document.get("boost"));
-              hashScore.urlLen = document.get("url").length();
-            }
+        return true;
+      }
 
-            // fill in value
-            IndexDoc indexDoc = (IndexDoc)value;
-            if (indexDoc.index == null) {
-              indexDoc.index = new UTF8();
-            }
-            indexDoc.index.set(index);
-            indexDoc.doc = doc;
+      public long getPos() throws IOException {
+        return maxDoc==0 ? 0 : (doc*INDEX_LENGTH)/maxDoc;
+      }
 
-            doc++;
-
-            return true;
-          }
-
-          public long getPos() throws IOException {
-            return maxDoc==0 ? 0 : (doc*INDEX_LENGTH)/maxDoc;
-          }
-
-          public void close() throws IOException {
-            indexReader.close();
-          }
-        };
+      public void close() throws IOException {
+        indexReader.close();
+      }
+      
+      public WritableComparable createKey() {
+        return new Text();
+      }
+      
+      public Writable createValue() {
+        return new IndexDoc();
+      }
     }
+    
+    /** Return each index as a split. */
+    public RecordReader getRecordReader(final FileSystem fs,
+                                        final FileSplit split,
+                                        final JobConf job,
+                                        Reporter reporter) throws IOException {
+      final Text index = new Text(split.getPath().toString());
+      reporter.setStatus(index.toString());
+      return new DDRecordReader(fs, split, job, index);
+    }
   }
-
+  
   public static class HashPartitioner implements Partitioner {
     public void configure(JobConf job) {}
     public void close() {}
     public int getPartition(WritableComparable key, Writable value,
                             int numReduceTasks) {
-      int hashCode = ((HashScore)key).hash.hashCode();
+      int hashCode = ((MD5Hash)key).hashCode();
       return (hashCode & Integer.MAX_VALUE) % numReduceTasks;
     }
   }
 
-  public static class HashReducer implements Reducer {
-    private MD5Hash prevHash = new MD5Hash();
+  public static class UrlsReducer implements Reducer {
+    
     public void configure(JobConf job) {}
+    
     public void close() {}
+    
     public void reduce(WritableComparable key, Iterator values,
+        OutputCollector output, Reporter reporter) throws IOException {
+      IndexDoc latest = null;
+      while (values.hasNext()) {
+        IndexDoc value = (IndexDoc)values.next();
+        if (latest == null) {
+          latest = value;
+          continue;
+        }
+        if (value.time > latest.time) {
+          // discard current and use more recent
+          latest.keep = false;
+          LOG.debug("-discard " + latest + ", keep " + value);
+          output.collect(latest.hash, latest);
+          latest = value;
+        } else {
+          // discard
+          value.keep = false;
+          LOG.debug("-discard " + value + ", keep " + latest);
+          output.collect(value.hash, value);
+        }
+        
+      }
+      // keep the latest
+      latest.keep = true;
+      output.collect(latest.hash, latest);
+      
+    }
+  }
+  
+  public static class HashReducer implements Reducer {
+    boolean byScore;
+    
+    public void configure(JobConf job) {
+      byScore = job.getBoolean("dedup.keep.highest.score", true);
+    }
+    
+    public void close() {}
+    public void reduce(WritableComparable key, Iterator values,
                        OutputCollector output, Reporter reporter)
       throws IOException {
-      MD5Hash hash = ((HashScore)key).hash;
+      IndexDoc highest = null;
       while (values.hasNext()) {
-        Writable value = (Writable)values.next();
-        if (hash.equals(prevHash)) {                // collect all but first
-          output.collect(key, value);
+        IndexDoc value = (IndexDoc)values.next();
+        // skip already deleted
+        if (!value.keep) {
+          LOG.debug("-discard " + value + " (already marked)");
+          output.collect(value.url, value);
+          continue;
+        }
+        if (highest == null) {
+          highest = value;
+          continue;
+        }
+        if (byScore) {
+          if (value.score > highest.score) {
+            highest.keep = false;
+            LOG.debug("-discard " + highest + ", keep " + value);
+            output.collect(highest.url, highest);     // delete highest
+            highest = value;
+          }
         } else {
-          prevHash.set(hash);
+          if (value.urlLen < highest.urlLen) {
+            highest.keep = false;
+            LOG.debug("-discard " + highest + ", keep " + value);
+            output.collect(highest.url, highest);     // delete highest
+            highest = value;
+          }
         }
       }
+      LOG.debug("-keep " + highest);
+      // no need to add this - in phase 2 we only process docs to delete them
+      // highest.keep = true;
+      // output.collect(key, highest);
     }
   }
     
@@ -240,8 +333,12 @@
 
   public void configure(JobConf job) {
     setConf(job);
+  }
+  
+  public void setConf(Configuration conf) {
+    super.setConf(conf);
     try {
-      fs = FileSystem.get(job);
+      fs = FileSystem.get(conf);
     } catch (IOException e) {
       throw new RuntimeException(e);
     }
@@ -254,6 +351,9 @@
                   OutputCollector output, Reporter reporter)
     throws IOException {
     IndexDoc indexDoc = (IndexDoc)value;
+    // don't delete these
+    if (indexDoc.keep) return;
+    // delete all others
     output.collect(indexDoc.index, new IntWritable(indexDoc.doc));
   }
 
@@ -265,7 +365,9 @@
     IndexReader reader = IndexReader.open(new FsDirectory(fs, index, false, getConf()));
     try {
       while (values.hasNext()) {
-        reader.deleteDocument(((IntWritable)values.next()).get());
+        IntWritable value = (IntWritable)values.next();
+        LOG.debug("-delete " + index + " doc=" + value);
+        reader.deleteDocument(value.get());
       }
     } finally {
       reader.close();
@@ -301,8 +403,8 @@
 
     if (LOG.isInfoEnabled()) { LOG.info("Dedup: starting"); }
 
-    Path hashDir =
-      new Path("dedup-hash-"+
+    Path outDir1 =
+      new Path("dedup-urls-"+
                Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
 
     JobConf job = new NutchJob(getConf());
@@ -313,44 +415,67 @@
       }
       job.addInputPath(indexDirs[i]);
     }
-    job.setJobName("dedup phase 1");
+    job.setJobName("dedup 1: urls by time");
 
-    job.setInputKeyClass(HashScore.class);
-    job.setInputValueClass(IndexDoc.class);
     job.setInputFormat(InputFormat.class);
-    job.setBoolean("mapred.speculative.execution", false);
+    job.setMapOutputKeyClass(Text.class);
+    job.setMapOutputValueClass(IndexDoc.class);
 
+    job.setReducerClass(UrlsReducer.class);
+    job.setOutputPath(outDir1);
+
+    job.setOutputKeyClass(MD5Hash.class);
+    job.setOutputValueClass(IndexDoc.class);
+    job.setOutputFormat(SequenceFileOutputFormat.class);
+
+    JobClient.runJob(job);
+
+    Path outDir2 =
+      new Path("dedup-hash-"+
+               Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
+    job = new NutchJob(getConf());
+    job.setJobName("dedup 2: content by hash");
+
+    job.addInputPath(outDir1);
+    //job.setInputKeyClass(MD5Hash.class);
+    //job.setInputValueClass(IndexDoc.class);
+    job.setInputFormat(SequenceFileInputFormat.class);
+    job.setMapOutputKeyClass(MD5Hash.class);
+    job.setMapOutputValueClass(IndexDoc.class);
     job.setPartitionerClass(HashPartitioner.class);
+    job.setSpeculativeExecution(false);
+    
     job.setReducerClass(HashReducer.class);
+    job.setOutputPath(outDir2);
 
-    job.setOutputPath(hashDir);
-
-    job.setOutputKeyClass(HashScore.class);
+    job.setOutputKeyClass(Text.class);
     job.setOutputValueClass(IndexDoc.class);
     job.setOutputFormat(SequenceFileOutputFormat.class);
 
     JobClient.runJob(job);
 
+    // remove outDir1 - no longer needed
+    fs.delete(outDir1);
+    
     job = new NutchJob(getConf());
-    job.setJobName("dedup phase 2");
+    job.setJobName("dedup 3: delete from index(es)");
 
-    job.addInputPath(hashDir);
-
+    job.addInputPath(outDir2);
     job.setInputFormat(SequenceFileInputFormat.class);
-    job.setInputKeyClass(HashScore.class);
-    job.setInputValueClass(IndexDoc.class);
+    //job.setInputKeyClass(Text.class);
+    //job.setInputValueClass(IndexDoc.class);
 
     job.setInt("io.file.buffer.size", 4096);
     job.setMapperClass(DeleteDuplicates.class);
     job.setReducerClass(DeleteDuplicates.class);
 
     job.setOutputFormat(DeleteDuplicates.class);
-    job.setOutputKeyClass(UTF8.class);
+    job.setOutputKeyClass(Text.class);
     job.setOutputValueClass(IntWritable.class);
 
     JobClient.runJob(job);
 
-    new JobClient(getConf()).getFs().delete(hashDir);
+    fs.delete(outDir2);
 
     if (LOG.isInfoEnabled()) { LOG.info("Dedup: done"); }
   }
@@ -363,7 +488,7 @@
   public int run(String[] args) throws Exception {
     
     if (args.length < 1) {
-      System.err.println("Usage: <indexes> ...");
+      System.err.println("Usage: DeleteDuplicates <indexes> ...");
       return -1;
     }
     
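For reference, the reworked dedup is still driven the same way as before; a minimal sketch of invoking it programmatically, mirroring the new TestDeleteDuplicates above (the index path and the dedup.keep.highest.score setting are illustrative, not part of this patch):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.nutch.indexer.DeleteDuplicates;
import org.apache.nutch.util.NutchConfiguration;

public class DedupExample {
  public static void main(String[] args) throws Exception {
    Configuration conf = NutchConfiguration.create();
    // phase 2 keeps the highest-scoring doc by default; set to false to keep the shortest URL instead
    conf.setBoolean("dedup.keep.highest.score", false);
    DeleteDuplicates dedup = new DeleteDuplicates(conf);
    dedup.dedup(new Path[] { new Path("crawl/indexes") });  // illustrative path
  }
}
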
Index: src/java/org/apache/nutch/indexer/IndexingFilter.java
===================================================================
--- src/java/org/apache/nutch/indexer/IndexingFilter.java	(revision 449084)
+++ src/java/org/apache/nutch/indexer/IndexingFilter.java	(working copy)
@@ -21,7 +21,7 @@
 
 // Hadoop imports
 import org.apache.hadoop.conf.Configurable;
-import org.apache.hadoop.io.UTF8;
+import org.apache.hadoop.io.Text;
 
 // Nutch imports
 import org.apache.nutch.parse.Parse;
@@ -50,6 +50,6 @@
    * @return modified (or a new) document instance
    * @throws IndexingException
    */
-  Document filter(Document doc, Parse parse, UTF8 url, CrawlDatum datum, Inlinks inlinks)
+  Document filter(Document doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
     throws IndexingException;
 }
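
Plugin authors implementing IndexingFilter need to make the matching signature change (UTF8 url becomes Text url). A minimal sketch of an updated filter; the class, package and field names are hypothetical, not part of this patch:

package org.example.indexer;  // hypothetical plugin package

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.Inlinks;
import org.apache.nutch.indexer.IndexingException;
import org.apache.nutch.indexer.IndexingFilter;
import org.apache.nutch.parse.Parse;

public class ExampleIndexingFilter implements IndexingFilter {
  private Configuration conf;

  // the url parameter is now a Text instead of a UTF8
  public Document filter(Document doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
      throws IndexingException {
    // store the raw url in a hypothetical extra field, just to show the new signature in use
    doc.add(new Field("example.url", url.toString(), Field.Store.YES, Field.Index.NO));
    return doc;
  }

  public void setConf(Configuration conf) { this.conf = conf; }
  public Configuration getConf() { return conf; }
}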
Index: src/java/org/apache/nutch/indexer/Indexer.java
===================================================================
--- src/java/org/apache/nutch/indexer/Indexer.java	(revision 449084)
+++ src/java/org/apache/nutch/indexer/Indexer.java	(working copy)
@@ -233,7 +233,7 @@
     Parse parse = new ParseImpl(parseText, parseData);
     try {
       // run indexing filters
-      doc = this.filters.filter(doc, parse, (UTF8)key, fetchDatum, inlinks);
+      doc = this.filters.filter(doc, parse, (Text)key, fetchDatum, inlinks);
     } catch (IndexingException e) {
       if (LOG.isWarnEnabled()) { LOG.warn("Error indexing "+key+": "+e); }
       return;
@@ -242,7 +242,7 @@
     float boost = 1.0f;
     // run scoring filters
     try {
-      boost = this.scfilters.indexerScore((UTF8)key, doc, dbDatum,
+      boost = this.scfilters.indexerScore((Text)key, doc, dbDatum,
               fetchDatum, parse, inlinks, boost);
     } catch (ScoringFilterException e) {
       if (LOG.isWarnEnabled()) {
@@ -283,7 +283,7 @@
     job.addInputPath(new Path(linkDb, LinkDb.CURRENT_NAME));
 
     job.setInputFormat(InputFormat.class);
-    job.setInputKeyClass(UTF8.class);
+    job.setInputKeyClass(Text.class);
     job.setInputValueClass(ObjectWritable.class);
 
     //job.setCombinerClass(Indexer.class);
@@ -291,7 +291,7 @@
 
     job.setOutputPath(indexDir);
     job.setOutputFormat(OutputFormat.class);
-    job.setOutputKeyClass(UTF8.class);
+    job.setOutputKeyClass(Text.class);
     job.setOutputValueClass(ObjectWritable.class);
 
     JobClient.runJob(job);
Index: src/java/org/apache/nutch/indexer/IndexingFilters.java
===================================================================
--- src/java/org/apache/nutch/indexer/IndexingFilters.java	(revision 449084)
+++ src/java/org/apache/nutch/indexer/IndexingFilters.java	(working copy)
@@ -29,7 +29,7 @@
 import org.apache.hadoop.conf.Configuration;
 import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.crawl.Inlinks;
-import org.apache.hadoop.io.UTF8;
+import org.apache.hadoop.io.Text;
 
 /** Creates and caches {@link IndexingFilter} implementing plugins.*/
 public class IndexingFilters {
@@ -66,7 +66,7 @@
   }                  
 
   /** Run all defined filters. */
-  public Document filter(Document doc, Parse parse, UTF8 url, CrawlDatum datum, Inlinks inlinks)
+  public Document filter(Document doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
     throws IndexingException {
     for (int i = 0; i < this.indexingFilters.length; i++) {
       doc = this.indexingFilters[i].filter(doc, parse, url, datum, inlinks);
Index: src/java/org/apache/nutch/tools/compat/CrawlDbConverter.java
===================================================================
--- src/java/org/apache/nutch/tools/compat/CrawlDbConverter.java	(revision 0)
+++ src/java/org/apache/nutch/tools/compat/CrawlDbConverter.java	(revision 0)
@@ -0,0 +1,115 @@
+package org.apache.nutch.tools.compat;
+
+import java.io.IOException;
+import java.util.Iterator;
+import java.util.Random;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.UTF8;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.io.WritableComparable;
+import org.apache.hadoop.mapred.JobClient;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.Mapper;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reporter;
+import org.apache.hadoop.mapred.SequenceFileInputFormat;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.hadoop.util.ToolBase;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.CrawlDb;
+import org.apache.nutch.crawl.MapWritable;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.util.NutchJob;
+
+public class CrawlDbConverter extends ToolBase implements Mapper {
+  private static final Log LOG = LogFactory.getLog(CrawlDbConverter.class);
+  
+  private static final String CONVERT_META_KEY = "db.converter.with.metadata";
+
+  private boolean withMetadata;
+  private Text newKey;
+  
+  public void configure(JobConf job) {
+    setConf(job);
+    withMetadata = job.getBoolean(CONVERT_META_KEY, false);
+    newKey = new Text();
+  }
+
+  public void map(WritableComparable key, Writable value, OutputCollector output,
+      Reporter reporter) throws IOException {
+    newKey.set(key.toString());
+    if (withMetadata) {
+      CrawlDatum datum = (CrawlDatum)value;
+      MapWritable meta = datum.getMetaData();
+      if (meta.size() > 0) {
+        MapWritable newMeta = new MapWritable();
+        Iterator it = meta.keySet().iterator();
+        while (it.hasNext()) {
+          WritableComparable k = (WritableComparable)it.next();
+          Writable v = meta.get(k);
+          if (k instanceof UTF8) {
+            Text t = new Text(k.toString());
+            k = t;
+          }
+          newMeta.put(k, v);
+        }
+        datum.setMetaData(newMeta);
+      }
+    }
+    output.collect(newKey, value);
+  }
+
+  public void close() throws IOException {
+  }
+
+  /**
+   * @param args
+   */
+  public static void main(String[] args) throws Exception {
+    int res = new CrawlDbConverter().doMain(NutchConfiguration.create(), args);
+  }
+
+  public int run(String[] args) throws Exception {
+    if (args.length < 2) {
+      System.err.println("Usage: CrawlDbConverter <oldDb> <newDb> [-withMetadata]");
+      System.err.println("\toldDb\tname of the crawldb that uses UTF8 class.");
+      System.err.println("\tnewDb\tname of the crawldb that will use Text class.");
+      System.err.println("\twithMetadata\tconvert also all metadata keys using UTF8 to Text.");
+      return -1;
+    }
+    JobConf job = new NutchJob(getConf());
+    FileSystem fs = FileSystem.get(getConf());
+    Path oldDb = new Path(args[0], CrawlDatum.DB_DIR_NAME);
+    Path newDb =
+      new Path(oldDb,
+               Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
+    if (!fs.exists(oldDb)) {
+      LOG.fatal("Old db doesn't exist in '" + args[0] + "'");
+      return -1;
+    }
+    boolean withMetadata = false;
+    if (args.length > 2 && args[2].equalsIgnoreCase("-withMetadata"))
+      withMetadata = true;
+    
+    job.setBoolean(CONVERT_META_KEY, withMetadata);
+    job.setInputPath(oldDb);
+    job.setInputFormat(SequenceFileInputFormat.class);
+    job.setMapperClass(CrawlDbConverter.class);
+    job.setOutputKeyClass(Text.class);
+    job.setOutputValueClass(CrawlDatum.class);
+    job.setOutputPath(newDb);
+    try {
+      JobClient.runJob(job);
+      CrawlDb.install(job, new Path(args[1]));
+      return 0;
+    } catch (Exception e) {
+      LOG.fatal("Error: " + StringUtils.stringifyException(e));
+      return -1;
+    }
+  }
+}

Property changes on: src/java/org/apache/nutch/tools/compat/CrawlDbConverter.java
___________________________________________________________________
Name: svn:eol-style
   + native
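
A usage sketch for the new compat tool above (illustration only, not part of the patch): the converter is driven through ToolBase.doMain(), so besides the command-line form shown in its usage message it can also be launched programmatically. The paths "crawl-old/crawldb" and "crawl-new/crawldb" below are placeholder names, not values defined anywhere in this patch.

    // Hypothetical driver class, mirroring the argument layout printed by run():
    //   CrawlDbConverter <oldDb> <newDb> [-withMetadata]
    import org.apache.nutch.tools.compat.CrawlDbConverter;
    import org.apache.nutch.util.NutchConfiguration;

    public class ConvertOldCrawlDb {
      public static void main(String[] args) throws Exception {
        int res = new CrawlDbConverter().doMain(NutchConfiguration.create(),
            new String[] { "crawl-old/crawldb", "crawl-new/crawldb", "-withMetadata" });
        System.exit(res);   // propagate the job's exit status
      }
    }
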

Index: src/java/org/apache/nutch/tools/DmozParser.java
===================================================================
--- src/java/org/apache/nutch/tools/DmozParser.java	(revision 449084)
+++ src/java/org/apache/nutch/tools/DmozParser.java	(working copy)
@@ -291,8 +291,8 @@
     if (LOG.isInfoEnabled()) { LOG.info("skew = " + rp.hashSkew); }
 
     //
-    // Open filtered text stream.  The UTF8Filter makes sure that
-    // only appropriate XML-approved UTF8 characters are received.
+    // Open filtered text stream.  The XMLCharFilter makes sure that
+    // only appropriate XML-approved UTF-8 characters are received.
     // Any non-conforming characters are silently skipped.
     //
     XMLCharFilter in = new XMLCharFilter(new BufferedReader(new InputStreamReader(new BufferedInputStream(new FileInputStream(dmozFile)), "UTF-8")));
Index: src/java/org/apache/nutch/protocol/Protocol.java
===================================================================
--- src/java/org/apache/nutch/protocol/Protocol.java	(revision 449084)
+++ src/java/org/apache/nutch/protocol/Protocol.java	(working copy)
@@ -18,7 +18,7 @@
 
 // Hadoop imports
 import org.apache.hadoop.conf.Configurable;
-import org.apache.hadoop.io.UTF8;
+import org.apache.hadoop.io.Text;
 
 // Nutch imports
 import org.apache.nutch.crawl.CrawlDatum;
@@ -32,5 +32,5 @@
 
   /** Returns the {@link Content} for a fetchlist entry.
    */
-  ProtocolOutput getProtocolOutput(UTF8 url, CrawlDatum datum);
+  ProtocolOutput getProtocolOutput(Text url, CrawlDatum datum);
 }
Index: src/java/org/apache/nutch/protocol/Content.java
===================================================================
--- src/java/org/apache/nutch/protocol/Content.java	(revision 449084)
+++ src/java/org/apache/nutch/protocol/Content.java	(working copy)
@@ -67,13 +67,13 @@
     if (version > VERSION)
       throw new VersionMismatchException(VERSION, version);
 
-    url = UTF8.readString(in);                    // read url
-    base = UTF8.readString(in);                   // read base
+    url = Text.readString(in);                    // read url
+    base = Text.readString(in);                   // read base
 
     content = new byte[in.readInt()];             // read content
     in.readFully(content);
 
-    contentType = UTF8.readString(in);            // read contentType
+    contentType = Text.readString(in);            // read contentType
 
     metadata = new Metadata();
     metadata.readFields(in);                    // read meta data
@@ -82,13 +82,13 @@
   protected final void writeCompressed(DataOutput out) throws IOException {
     out.writeByte(version);
 
-    UTF8.writeString(out, url);                   // write url
-    UTF8.writeString(out, base);                  // write base
+    Text.writeString(out, url);                   // write url
+    Text.writeString(out, base);                  // write base
 
     out.writeInt(content.length);                 // write content
     out.write(content);
 
-    UTF8.writeString(out, contentType);           // write contentType
+    Text.writeString(out, contentType);           // write contentType
     
     metadata.write(out);                           // write metadata
   }
Index: src/java/org/apache/nutch/segment/SegmentMerger.java
===================================================================
--- src/java/org/apache/nutch/segment/SegmentMerger.java	(revision 449084)
+++ src/java/org/apache/nutch/segment/SegmentMerger.java	(working copy)
@@ -89,10 +89,10 @@
 public class SegmentMerger extends Configured implements Mapper, Reducer {
   private static final Log LOG = LogFactory.getLog(SegmentMerger.class);
 
-  private static final UTF8 SEGMENT_PART_KEY = new UTF8("_PaRt_");
-  private static final UTF8 SEGMENT_NAME_KEY = new UTF8("_NaMe_");
+  private static final Text SEGMENT_PART_KEY = new Text("_PaRt_");
+  private static final Text SEGMENT_NAME_KEY = new Text("_NaMe_");
   private static final String nameMarker = SEGMENT_NAME_KEY.toString();
-  private static final UTF8 SEGMENT_SLICE_KEY = new UTF8("_SlIcE_");
+  private static final Text SEGMENT_SLICE_KEY = new Text("_SlIcE_");
   private static final String sliceMarker = SEGMENT_SLICE_KEY.toString();
 
   private URLFilters filters = null;
@@ -140,8 +140,8 @@
           Object o = wrapper.get();
           if (o instanceof CrawlDatum) {
             // record which part of segment this comes from
-            ((CrawlDatum)o).getMetaData().put(SEGMENT_PART_KEY, new UTF8(part));
-            ((CrawlDatum)o).getMetaData().put(SEGMENT_NAME_KEY, new UTF8(segment));
+            ((CrawlDatum)o).getMetaData().put(SEGMENT_PART_KEY, new Text(part));
+            ((CrawlDatum)o).getMetaData().put(SEGMENT_NAME_KEY, new Text(segment));
           } else if (o instanceof Content) {
             if (((Content)o).getMetadata() == null) {
               ((Content)o).setMetadata(new Metadata());
@@ -186,12 +186,12 @@
           String slice = null;
           if (o instanceof CrawlDatum) {
             // check which output dir it should go into
-            UTF8 part = (UTF8)((CrawlDatum)o).getMetaData().get(SEGMENT_PART_KEY);
+            Text part = (Text)((CrawlDatum)o).getMetaData().get(SEGMENT_PART_KEY);
             ((CrawlDatum)o).getMetaData().remove(SEGMENT_PART_KEY);
             ((CrawlDatum)o).getMetaData().remove(SEGMENT_NAME_KEY);
             if (part == null)
               throw new IOException("Null segment part, key=" + key);
-            UTF8 uSlice = (UTF8)((CrawlDatum)o).getMetaData().get(SEGMENT_SLICE_KEY);
+            Text uSlice = (Text)((CrawlDatum)o).getMetaData().get(SEGMENT_SLICE_KEY);
             ((CrawlDatum)o).getMetaData().remove(SEGMENT_SLICE_KEY);
             if (uSlice != null) slice = uSlice.toString();
             String partString = part.toString();
@@ -267,7 +267,7 @@
           } else {
             wname = new Path(new Path(new Path(job.getOutputPath(), segmentName + "-" + slice), dirName), name);
           }
-          res = new SequenceFile.Writer(fs, wname, UTF8.class, CrawlDatum.class);
+          res = new SequenceFile.Writer(fs, wname, Text.class, CrawlDatum.class);
           sliceWriters.put(slice + dirName, res);
           return res;
         }
@@ -283,7 +283,7 @@
           } else {
             wname = new Path(new Path(new Path(job.getOutputPath(), segmentName + "-" + slice), dirName), name);
           }
-          res = new MapFile.Writer(fs, wname.toString(), UTF8.class, clazz);
+          res = new MapFile.Writer(fs, wname.toString(), Text.class, clazz);
           sliceWriters.put(slice + dirName, res);
           return res;
         }
@@ -335,7 +335,7 @@
   public void map(WritableComparable key, Writable value, OutputCollector output, Reporter reporter) throws IOException {
     if (filters != null) {
       try {
-        if (filters.filter(((UTF8)key).toString()) == null) {
+        if (filters.filter(((Text)key).toString()) == null) {
           return;
         }
       } catch (Exception e) {
@@ -373,10 +373,10 @@
       if (o instanceof CrawlDatum) {
         CrawlDatum val = (CrawlDatum)o;
         // check which output dir it belongs to
-        UTF8 part = (UTF8)val.getMetaData().get(SEGMENT_PART_KEY);
+        Text part = (Text)val.getMetaData().get(SEGMENT_PART_KEY);
         if (part == null)
           throw new IOException("Null segment part, key=" + key);
-        UTF8 uName = (UTF8)val.getMetaData().get(SEGMENT_NAME_KEY);
+        Text uName = (Text)val.getMetaData().get(SEGMENT_NAME_KEY);
         if (uName == null)
           throw new IOException("Null segment name, key=" + key);
         String name = uName.toString();
@@ -470,10 +470,10 @@
       }
     }
     curCount++;
-    UTF8 sliceName = null;
+    Text sliceName = null;
     ObjectWritable wrapper = new ObjectWritable();
     if (sliceSize > 0) {
-      sliceName = new UTF8(String.valueOf(curCount / sliceSize));
+      sliceName = new Text(String.valueOf(curCount / sliceSize));
     }
     // now output the latest values
     if (lastG != null) {
@@ -613,12 +613,12 @@
       }
     }
     job.setInputFormat(ObjectInputFormat.class);
-    job.setInputKeyClass(UTF8.class);
+    job.setInputKeyClass(Text.class);
     job.setInputValueClass(ObjectWritable.class);
     job.setMapperClass(SegmentMerger.class);
     job.setReducerClass(SegmentMerger.class);
     job.setOutputPath(out);
-    job.setOutputKeyClass(UTF8.class);
+    job.setOutputKeyClass(Text.class);
     job.setOutputValueClass(ObjectWritable.class);
     job.setOutputFormat(SegmentOutputFormat.class);
     
Index: src/java/org/apache/nutch/segment/SegmentReader.java
===================================================================
--- src/java/org/apache/nutch/segment/SegmentReader.java	(revision 449084)
+++ src/java/org/apache/nutch/segment/SegmentReader.java	(working copy)
@@ -180,7 +180,7 @@
     if (pt) job.addInputPath(new Path(segment, ParseText.DIR_NAME));
 
     job.setInputFormat(InputFormat.class);
-    job.setInputKeyClass(UTF8.class);
+    job.setInputKeyClass(Text.class);
     job.setInputValueClass(ObjectWritable.class);
 
     job.setReducerClass(SegmentReader.class);
@@ -190,7 +190,7 @@
     
     job.setOutputPath(tempDir);
     job.setOutputFormat(TextOutputFormat.class);
-    job.setOutputKeyClass(UTF8.class);
+    job.setOutputKeyClass(Text.class);
     job.setOutputValueClass(ObjectWritable.class);
 
     JobClient.runJob(job);
@@ -255,7 +255,7 @@
           {"pt", "ParseText::\n"}
   };
 
-  public void get(final Path segment, final UTF8 key, Writer writer,
+  public void get(final Path segment, final Text key, Writer writer,
           final Map results) throws Exception {
     if (LOG.isInfoEnabled()) { LOG.info("SegmentReader: get '" + key + "'"); }
     ArrayList threads = new ArrayList();
@@ -346,12 +346,12 @@
     }
   }
   
-  private List getMapRecords(Path dir, UTF8 key) throws Exception {
+  private List getMapRecords(Path dir, Text key) throws Exception {
     MapFile.Reader[] readers = MapFileOutputFormat.getReaders(fs, dir, getConf());
     ArrayList res = new ArrayList();
     Class keyClass = readers[0].getKeyClass();
     Class valueClass = readers[0].getValueClass();
-    if (!keyClass.getName().equals("org.apache.hadoop.io.UTF8"))
+    if (!keyClass.getName().equals("org.apache.hadoop.io.Text"))
       throw new IOException("Incompatible key (" + keyClass.getName() + ")");
     Writable value = (Writable)valueClass.newInstance();
     // we don't know the partitioning schema
@@ -363,12 +363,12 @@
     return res;
   }
 
-  private List getSeqRecords(Path dir, UTF8 key) throws Exception {
+  private List getSeqRecords(Path dir, Text key) throws Exception {
     SequenceFile.Reader[] readers = SequenceFileOutputFormat.getReaders(getConf(), dir);
     ArrayList res = new ArrayList();
     Class keyClass = readers[0].getKeyClass();
     Class valueClass = readers[0].getValueClass();
-    if (!keyClass.getName().equals("org.apache.hadoop.io.UTF8"))
+    if (!keyClass.getName().equals("org.apache.hadoop.io.Text"))
       throw new IOException("Incompatible key (" + keyClass.getName() + ")");
     Writable aKey = (Writable)keyClass.newInstance();
     Writable value = (Writable)valueClass.newInstance();
@@ -423,7 +423,7 @@
   public void getStats(Path segment, final SegmentReaderStats stats) throws Exception {
     SequenceFile.Reader[] readers = SequenceFileOutputFormat.getReaders(getConf(), new Path(segment, CrawlDatum.GENERATE_DIR_NAME));
     long cnt = 0L;
-    UTF8 key = new UTF8();
+    Text key = new Text();
     for (int i = 0; i < readers.length; i++) {
       while (readers[i].next(key)) cnt++;
       readers[i].close();
@@ -566,7 +566,7 @@
           usage();
           return;
         }
-        segmentReader.get(new Path(input), new UTF8(key), new OutputStreamWriter(System.out, "UTF-8"), new HashMap());
+        segmentReader.get(new Path(input), new Text(key), new OutputStreamWriter(System.out, "UTF-8"), new HashMap());
         return;
       default:
         System.err.println("Invalid operation: " + args[0]);
Index: src/java/org/apache/nutch/scoring/ScoringFilter.java
===================================================================
--- src/java/org/apache/nutch/scoring/ScoringFilter.java	(revision 449274)
+++ src/java/org/apache/nutch/scoring/ScoringFilter.java	(working copy)
@@ -18,7 +18,7 @@
 import java.util.List;
 
 import org.apache.hadoop.conf.Configurable;
-import org.apache.hadoop.io.UTF8;
+import org.apache.hadoop.io.Text;
 import org.apache.lucene.document.Document;
 import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.crawl.Inlinks;
@@ -49,7 +49,7 @@
    * @param datum new datum. Filters will modify it in-place.
    * @throws ScoringFilterException
    */
-  public void injectedScore(UTF8 url, CrawlDatum datum) throws ScoringFilterException;
+  public void injectedScore(Text url, CrawlDatum datum) throws ScoringFilterException;
   
   /**
    * Set an initial score for newly discovered pages. Note: newly discovered pages
@@ -60,7 +60,7 @@
    * @param datum new datum. Filters will modify it in-place.
    * @throws ScoringFilterException
    */
-  public void initialScore(UTF8 url, CrawlDatum datum) throws ScoringFilterException;
+  public void initialScore(Text url, CrawlDatum datum) throws ScoringFilterException;
   
   /**
    * This method prepares a sort value for the purpose of sorting and
@@ -69,7 +69,7 @@
    * @param datum page's datum, should not be modified
    * @param initSort initial sort value, or a value from previous filters in chain
    */
-  public float generatorSortValue(UTF8 url, CrawlDatum datum, float initSort) throws ScoringFilterException;
+  public float generatorSortValue(Text url, CrawlDatum datum, float initSort) throws ScoringFilterException;
   
   /**
    * This method takes all relevant score information from the current datum
@@ -82,7 +82,7 @@
    * @param content instance of content. Implementations may modify this
    * in-place, primarily by setting some metadata properties.
    */
-  public void passScoreBeforeParsing(UTF8 url, CrawlDatum datum, Content content) throws ScoringFilterException;
+  public void passScoreBeforeParsing(Text url, CrawlDatum datum, Content content) throws ScoringFilterException;
   
   /**
    * Currently a part of score distribution is performed using only data coming
@@ -93,7 +93,7 @@
    * @param parse target instance to copy the score information to. Implementations
    * may modify this in-place, primarily by setting some metadata properties.
    */
-  public void passScoreAfterParsing(UTF8 url, Content content, Parse parse) throws ScoringFilterException;
+  public void passScoreAfterParsing(Text url, Content content, Parse parse) throws ScoringFilterException;
   
   /**
    * Distribute score value from the current page to all its outlinked pages.
@@ -116,7 +116,7 @@
    * be null if not needed.
    * @throws ScoringFilterException
    */
-  public CrawlDatum distributeScoreToOutlink(UTF8 fromUrl, UTF8 toUrl,
+  public CrawlDatum distributeScoreToOutlink(Text fromUrl, Text toUrl,
           ParseData parseData, CrawlDatum target, CrawlDatum adjust,
           int allCount, int validCount) throws ScoringFilterException;
 
@@ -136,7 +136,7 @@
    * links pointing to this page, found in the current update batch.
    * @throws ScoringFilterException
    */
-  public void updateDbScore(UTF8 url, CrawlDatum old, CrawlDatum datum, List inlinked) throws ScoringFilterException;
+  public void updateDbScore(Text url, CrawlDatum old, CrawlDatum datum, List inlinked) throws ScoringFilterException;
   
   /**
    * This method calculates a Lucene document boost.
@@ -156,6 +156,6 @@
    * other scoring strategies by modifying Lucene document directly.
    * @throws ScoringFilterException
    */
-  public float indexerScore(UTF8 url, Document doc, CrawlDatum dbDatum,
+  public float indexerScore(Text url, Document doc, CrawlDatum dbDatum,
           CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore) throws ScoringFilterException;
 }
Index: src/java/org/apache/nutch/scoring/ScoringFilters.java
===================================================================
--- src/java/org/apache/nutch/scoring/ScoringFilters.java	(revision 449274)
+++ src/java/org/apache/nutch/scoring/ScoringFilters.java	(working copy)
@@ -32,7 +32,7 @@
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.conf.Configured;
-import org.apache.hadoop.io.UTF8;
+import org.apache.hadoop.io.Text;
 
 /**
  * Creates and caches {@link ScoringFilter} implementing plugins.
@@ -85,7 +85,7 @@
   }
 
   /** Calculate a sort value for Generate. */
-  public float generatorSortValue(UTF8 url, CrawlDatum datum, float initSort) throws ScoringFilterException {
+  public float generatorSortValue(Text url, CrawlDatum datum, float initSort) throws ScoringFilterException {
     for (int i = 0; i < this.filters.length; i++) {
       initSort = this.filters[i].generatorSortValue(url, datum, initSort);
     }
@@ -93,46 +93,46 @@
   }
 
   /** Calculate a new initial score, used when adding newly discovered pages. */
-  public void initialScore(UTF8 url, CrawlDatum datum) throws ScoringFilterException {
+  public void initialScore(Text url, CrawlDatum datum) throws ScoringFilterException {
     for (int i = 0; i < this.filters.length; i++) {
       this.filters[i].initialScore(url, datum);
     }
   }
 
   /** Calculate a new initial score, used when injecting new pages. */
-  public void injectedScore(UTF8 url, CrawlDatum datum) throws ScoringFilterException {
+  public void injectedScore(Text url, CrawlDatum datum) throws ScoringFilterException {
     for (int i = 0; i < this.filters.length; i++) {
       this.filters[i].injectedScore(url, datum);
     }
   }
 
   /** Calculate updated page score during CrawlDb.update(). */
-  public void updateDbScore(UTF8 url, CrawlDatum old, CrawlDatum datum, List inlinked) throws ScoringFilterException {
+  public void updateDbScore(Text url, CrawlDatum old, CrawlDatum datum, List inlinked) throws ScoringFilterException {
     for (int i = 0; i < this.filters.length; i++) {
       this.filters[i].updateDbScore(url, old, datum, inlinked);
     }
   }
 
-  public void passScoreBeforeParsing(UTF8 url, CrawlDatum datum, Content content) throws ScoringFilterException {
+  public void passScoreBeforeParsing(Text url, CrawlDatum datum, Content content) throws ScoringFilterException {
     for (int i = 0; i < this.filters.length; i++) {
       this.filters[i].passScoreBeforeParsing(url, datum, content);
     }
   }
   
-  public void passScoreAfterParsing(UTF8 url, Content content, Parse parse) throws ScoringFilterException {
+  public void passScoreAfterParsing(Text url, Content content, Parse parse) throws ScoringFilterException {
     for (int i = 0; i < this.filters.length; i++) {
       this.filters[i].passScoreAfterParsing(url, content, parse);
     }
   }
   
-  public CrawlDatum distributeScoreToOutlink(UTF8 fromUrl, UTF8 toUrl, ParseData parseData, CrawlDatum target, CrawlDatum adjust, int allCount, int validCount) throws ScoringFilterException {
+  public CrawlDatum distributeScoreToOutlink(Text fromUrl, Text toUrl, ParseData parseData, CrawlDatum target, CrawlDatum adjust, int allCount, int validCount) throws ScoringFilterException {
     for (int i = 0; i < this.filters.length; i++) {
       adjust = this.filters[i].distributeScoreToOutlink(fromUrl, toUrl, parseData, target, adjust, allCount, validCount);
     }
     return adjust;
   }
 
-  public float indexerScore(UTF8 url, Document doc, CrawlDatum dbDatum, CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore) throws ScoringFilterException {
+  public float indexerScore(Text url, Document doc, CrawlDatum dbDatum, CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore) throws ScoringFilterException {
     for (int i = 0; i < this.filters.length; i++) {
       initScore = this.filters[i].indexerScore(url, doc, dbDatum, fetchDatum, parse, inlinks, initScore);
     }
Index: src/java/org/apache/nutch/crawl/CrawlDbReader.java
===================================================================
--- src/java/org/apache/nutch/crawl/CrawlDbReader.java	(revision 449084)
+++ src/java/org/apache/nutch/crawl/CrawlDbReader.java	(working copy)
@@ -33,7 +33,7 @@
 import org.apache.hadoop.io.LongWritable;
 import org.apache.hadoop.io.MapFile;
 import org.apache.hadoop.io.SequenceFile;
-import org.apache.hadoop.io.UTF8;
+import org.apache.hadoop.io.Text;
 import org.apache.hadoop.io.Writable;
 import org.apache.hadoop.io.WritableComparable;
 import org.apache.hadoop.mapred.JobClient;
@@ -88,10 +88,10 @@
     public void map(WritableComparable key, Writable value, OutputCollector output, Reporter reporter)
             throws IOException {
       CrawlDatum cd = (CrawlDatum) value;
-      output.collect(new UTF8("T"), COUNT_1);
-      output.collect(new UTF8("status " + cd.getStatus()), COUNT_1);
-      output.collect(new UTF8("retry " + cd.getRetriesSinceFetch()), COUNT_1);
-      output.collect(new UTF8("s"), new LongWritable((long) (cd.getScore() * 1000.0)));
+      output.collect(new Text("T"), COUNT_1);
+      output.collect(new Text("status " + cd.getStatus()), COUNT_1);
+      output.collect(new Text("retry " + cd.getRetriesSinceFetch()), COUNT_1);
+      output.collect(new Text("s"), new LongWritable((long) (cd.getScore() * 1000.0)));
     }
   }
   
@@ -104,7 +104,7 @@
     public void reduce(WritableComparable key, Iterator values, OutputCollector output, Reporter reporter)
         throws IOException {
       val.set(0L);
-      String k = ((UTF8)key).toString();
+      String k = ((Text)key).toString();
       if (!k.equals("s")) {
         while (values.hasNext()) {
           LongWritable cnt = (LongWritable)values.next();
@@ -121,9 +121,9 @@
           if (cnt.get() > max) max = cnt.get();
           total += cnt.get();
         }
-        output.collect(new UTF8("scn"), new LongWritable(min));
-        output.collect(new UTF8("scx"), new LongWritable(max));
-        output.collect(new UTF8("sct"), new LongWritable(total));
+        output.collect(new Text("scn"), new LongWritable(min));
+        output.collect(new Text("scx"), new LongWritable(max));
+        output.collect(new Text("sct"), new LongWritable(total));
       }
     }
   }
@@ -134,7 +134,7 @@
     public void reduce(WritableComparable key, Iterator values, OutputCollector output, Reporter reporter)
             throws IOException {
 
-      String k = ((UTF8) key).toString();
+      String k = ((Text) key).toString();
       if (k.equals("T")) {
         // sum all values for this key
         long sum = 0;
@@ -244,7 +244,7 @@
 
     job.addInputPath(new Path(crawlDb, CrawlDatum.DB_DIR_NAME));
     job.setInputFormat(SequenceFileInputFormat.class);
-    job.setInputKeyClass(UTF8.class);
+    job.setInputKeyClass(Text.class);
     job.setInputValueClass(CrawlDatum.class);
 
     job.setMapperClass(CrawlDbStatMapper.class);
@@ -253,7 +253,7 @@
 
     job.setOutputPath(tmpFolder);
     job.setOutputFormat(SequenceFileOutputFormat.class);
-    job.setOutputKeyClass(UTF8.class);
+    job.setOutputKeyClass(Text.class);
     job.setOutputValueClass(LongWritable.class);
 
     JobClient.runJob(job);
@@ -262,7 +262,7 @@
     FileSystem fileSystem = FileSystem.get(config);
     SequenceFile.Reader[] readers = SequenceFileOutputFormat.getReaders(config, tmpFolder);
 
-    UTF8 key = new UTF8();
+    Text key = new Text();
     LongWritable value = new LongWritable();
 
     TreeMap stats = new TreeMap();
@@ -315,7 +315,7 @@
   }
   
   public CrawlDatum get(String crawlDb, String url, Configuration config) throws IOException {
-    UTF8 key = new UTF8(url);
+    Text key = new Text(url);
     CrawlDatum val = new CrawlDatum();
     openReaders(crawlDb, config);
     CrawlDatum res = (CrawlDatum)MapFileOutputFormat.getEntry(readers, new HashPartitioner(), key, val);
@@ -346,12 +346,12 @@
 
     job.addInputPath(new Path(crawlDb, CrawlDatum.DB_DIR_NAME));
     job.setInputFormat(SequenceFileInputFormat.class);
-    job.setInputKeyClass(UTF8.class);
+    job.setInputKeyClass(Text.class);
     job.setInputValueClass(CrawlDatum.class);
 
     job.setOutputPath(outFolder);
     job.setOutputFormat(TextOutputFormat.class);
-    job.setOutputKeyClass(UTF8.class);
+    job.setOutputKeyClass(Text.class);
     job.setOutputValueClass(CrawlDatum.class);
 
     JobClient.runJob(job);
@@ -375,7 +375,7 @@
     job.setJobName("topN prepare " + crawlDb);
     job.addInputPath(new Path(crawlDb, CrawlDatum.DB_DIR_NAME));
     job.setInputFormat(SequenceFileInputFormat.class);
-    job.setInputKeyClass(UTF8.class);
+    job.setInputKeyClass(Text.class);
     job.setInputValueClass(CrawlDatum.class);
     job.setMapperClass(CrawlDbTopNMapper.class);
     job.setReducerClass(IdentityReducer.class);
@@ -383,7 +383,7 @@
     job.setOutputPath(tempDir);
     job.setOutputFormat(SequenceFileOutputFormat.class);
     job.setOutputKeyClass(FloatWritable.class);
-    job.setOutputValueClass(UTF8.class);
+    job.setOutputValueClass(Text.class);
 
     // XXX hmmm, no setFloat() in the API ... :(
     job.setLong("CrawlDbReader.topN.min", Math.round(1000000.0 * min));
@@ -399,14 +399,14 @@
     job.addInputPath(tempDir);
     job.setInputFormat(SequenceFileInputFormat.class);
     job.setInputKeyClass(FloatWritable.class);
-    job.setInputValueClass(UTF8.class);
+    job.setInputValueClass(Text.class);
     job.setMapperClass(IdentityMapper.class);
     job.setReducerClass(CrawlDbTopNReducer.class);
 
     job.setOutputPath(outFolder);
     job.setOutputFormat(TextOutputFormat.class);
     job.setOutputKeyClass(FloatWritable.class);
-    job.setOutputValueClass(UTF8.class);
+    job.setOutputValueClass(Text.class);
 
     // XXX *sigh* this apparently doesn't work ... :-((
     job.setNumReduceTasks(1); // create a single file.
Index: src/java/org/apache/nutch/crawl/LinkDb.java
===================================================================
--- src/java/org/apache/nutch/crawl/LinkDb.java	(revision 449088)
+++ src/java/org/apache/nutch/crawl/LinkDb.java	(working copy)
@@ -161,7 +161,7 @@
         anchor = anchor.substring(0, maxAnchorLength);
       }
       inlinks.add(new Inlink(fromUrl, anchor));   // collect inverted link
-      output.collect(new UTF8(toUrl), inlinks);
+      output.collect(new Text(toUrl), inlinks);
     }
   }
 
@@ -256,7 +256,7 @@
     job.setJobName("linkdb " + linkDb);
 
     job.setInputFormat(SequenceFileInputFormat.class);
-    job.setInputKeyClass(UTF8.class);
+    job.setInputKeyClass(Text.class);
     job.setInputValueClass(ParseData.class);
 
     job.setMapperClass(LinkDb.class);
@@ -277,7 +277,7 @@
     job.setOutputPath(newLinkDb);
     job.setOutputFormat(MapFileOutputFormat.class);
     job.setBoolean("mapred.output.compress", true);
-    job.setOutputKeyClass(UTF8.class);
+    job.setOutputKeyClass(Text.class);
     job.setOutputValueClass(Inlinks.class);
 
     return job;
@@ -292,7 +292,7 @@
     job.setJobName("linkdb merge " + linkDb);
 
     job.setInputFormat(SequenceFileInputFormat.class);
-    job.setInputKeyClass(UTF8.class);
+    job.setInputKeyClass(Text.class);
     job.setInputValueClass(Inlinks.class);
 
     job.setMapperClass(LinkDbFilter.class);
@@ -303,7 +303,7 @@
     job.setOutputPath(newLinkDb);
     job.setOutputFormat(MapFileOutputFormat.class);
     job.setBoolean("mapred.output.compress", true);
-    job.setOutputKeyClass(UTF8.class);
+    job.setOutputKeyClass(Text.class);
     job.setOutputValueClass(Inlinks.class);
 
     return job;
Index: src/java/org/apache/nutch/crawl/MapWritable.java
===================================================================
--- src/java/org/apache/nutch/crawl/MapWritable.java	(revision 449084)
+++ src/java/org/apache/nutch/crawl/MapWritable.java	(working copy)
@@ -39,7 +39,7 @@
 import org.apache.hadoop.io.MD5Hash;
 import org.apache.hadoop.io.NullWritable;
 import org.apache.hadoop.io.ObjectWritable;
-import org.apache.hadoop.io.UTF8;
+import org.apache.hadoop.io.Text;
 import org.apache.hadoop.io.Writable;
 import org.apache.hadoop.util.StringUtils;
 
@@ -81,7 +81,7 @@
 
     addToMap(NullWritable.class, new Byte((byte) -127));
     addToMap(LongWritable.class, new Byte((byte) -126));
-    addToMap(UTF8.class, new Byte((byte) -125));
+    addToMap(Text.class, new Byte((byte) -125));
     addToMap(MD5Hash.class, new Byte((byte) -124));
     addToMap(org.apache.nutch.fetcher.FetcherOutput.class,
         new Byte((byte) -123));
@@ -305,7 +305,7 @@
         ClassIdEntry entry = fIdFirst;
         while (entry != null) {
           out.writeByte(entry.fId);
-          UTF8.writeString(out, entry.fclazz.getName());
+          Text.writeString(out, entry.fclazz.getName());
           entry = entry.fNextIdEntry;
         }
       }
@@ -336,7 +336,7 @@
       for (int i = 0; i < fIdCount; i++) {
         try {
           id = in.readByte();
-          clazz = Class.forName(UTF8.readString(in));
+          clazz = Class.forName(Text.readString(in));
           addIdEntry(id, clazz);
         } catch (Exception e) {
           if (LOG.isWarnEnabled()) { 
Index: src/java/org/apache/nutch/crawl/Injector.java
===================================================================
--- src/java/org/apache/nutch/crawl/Injector.java	(revision 449274)
+++ src/java/org/apache/nutch/crawl/Injector.java	(working copy)
@@ -28,13 +28,13 @@
 import org.apache.hadoop.conf.*;
 import org.apache.hadoop.mapred.*;
 import org.apache.hadoop.util.StringUtils;
+import org.apache.hadoop.util.ToolBase;
 
 import org.apache.nutch.net.*;
 import org.apache.nutch.scoring.ScoringFilterException;
 import org.apache.nutch.scoring.ScoringFilters;
 import org.apache.nutch.util.NutchConfiguration;
 import org.apache.nutch.util.NutchJob;
-import org.apache.nutch.util.ToolBase;
 
 /** This class takes a flat file of URLs and adds them to the list of pages to be
  * crawled.  Useful for bootstrapping the system. */
@@ -65,7 +65,7 @@
     public void map(WritableComparable key, Writable val,
                     OutputCollector output, Reporter reporter)
       throws IOException {
-      UTF8 value = (UTF8)val;
+      Text value = (Text)val;
       String url = value.toString();              // value is line of text
       // System.out.println("url: " +url);
       try {
@@ -138,7 +138,7 @@
 
     sortJob.setOutputPath(tempDir);
     sortJob.setOutputFormat(SequenceFileOutputFormat.class);
-    sortJob.setOutputKeyClass(UTF8.class);
+    sortJob.setOutputKeyClass(Text.class);
     sortJob.setOutputValueClass(CrawlDatum.class);
     JobClient.runJob(sortJob);
 
Index: src/java/org/apache/nutch/crawl/CrawlDb.java
===================================================================
--- src/java/org/apache/nutch/crawl/CrawlDb.java	(revision 450799)
+++ src/java/org/apache/nutch/crawl/CrawlDb.java	(working copy)
@@ -97,15 +97,13 @@
       job.addInputPath(current);
     }
     job.setInputFormat(SequenceFileInputFormat.class);
-    job.setInputKeyClass(UTF8.class);
-    job.setInputValueClass(CrawlDatum.class);
 
     job.setMapperClass(CrawlDbFilter.class);
     job.setReducerClass(CrawlDbReducer.class);
 
     job.setOutputPath(newCrawlDb);
     job.setOutputFormat(MapFileOutputFormat.class);
-    job.setOutputKeyClass(UTF8.class);
+    job.setOutputKeyClass(Text.class);
     job.setOutputValueClass(CrawlDatum.class);
 
     return job;
Index: src/java/org/apache/nutch/crawl/CrawlDbMerger.java
===================================================================
--- src/java/org/apache/nutch/crawl/CrawlDbMerger.java	(revision 449084)
+++ src/java/org/apache/nutch/crawl/CrawlDbMerger.java	(working copy)
@@ -27,7 +27,7 @@
 import org.apache.hadoop.conf.Configured;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.UTF8;
+import org.apache.hadoop.io.Text;
 import org.apache.hadoop.io.WritableComparable;
 import org.apache.hadoop.mapred.*;
 import org.apache.hadoop.util.StringUtils;
@@ -119,7 +119,7 @@
     job.setJobName("crawldb merge " + output);
 
     job.setInputFormat(SequenceFileInputFormat.class);
-    job.setInputKeyClass(UTF8.class);
+    job.setInputKeyClass(Text.class);
     job.setInputValueClass(CrawlDatum.class);
 
     job.setMapperClass(CrawlDbFilter.class);
@@ -129,7 +129,7 @@
 
     job.setOutputPath(newCrawlDb);
     job.setOutputFormat(MapFileOutputFormat.class);
-    job.setOutputKeyClass(UTF8.class);
+    job.setOutputKeyClass(Text.class);
     job.setOutputValueClass(CrawlDatum.class);
 
     return job;
Index: src/java/org/apache/nutch/crawl/CrawlDbFilter.java
===================================================================
--- src/java/org/apache/nutch/crawl/CrawlDbFilter.java	(revision 449088)
+++ src/java/org/apache/nutch/crawl/CrawlDbFilter.java	(working copy)
@@ -20,7 +20,7 @@
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
-import org.apache.hadoop.io.UTF8;
+import org.apache.hadoop.io.Text;
 import org.apache.hadoop.io.Writable;
 import org.apache.hadoop.io.WritableComparable;
 import org.apache.hadoop.mapred.JobConf;
@@ -92,7 +92,7 @@
       }
     }
     if (url != null) { // if it passes
-      UTF8 newKey = (UTF8) key;
+      Text newKey = (Text) key;
       newKey.set(url); // collect it
       output.collect(newKey, value);
     }
Index: src/java/org/apache/nutch/crawl/Inlink.java
===================================================================
--- src/java/org/apache/nutch/crawl/Inlink.java	(revision 449084)
+++ src/java/org/apache/nutch/crawl/Inlink.java	(working copy)
@@ -33,19 +33,19 @@
   }
 
   public void readFields(DataInput in) throws IOException {
-    fromUrl = UTF8.readString(in);
-    anchor = UTF8.readString(in);
+    fromUrl = Text.readString(in);
+    anchor = Text.readString(in);
   }
 
   /** Skips over one Inlink in the input. */
   public static void skip(DataInput in) throws IOException {
-    UTF8.skip(in);                                // skip fromUrl
-    UTF8.skip(in);                                // skip anchor
+    Text.skip(in);                                // skip fromUrl
+    Text.skip(in);                                // skip anchor
   }
 
   public void write(DataOutput out) throws IOException {
-    UTF8.writeString(out, fromUrl);
-    UTF8.writeString(out, anchor);
+    Text.writeString(out, fromUrl);
+    Text.writeString(out, anchor);
   }
 
   public static Inlink read(DataInput in) throws IOException {
Index: src/java/org/apache/nutch/crawl/PartitionUrlByHost.java
===================================================================
--- src/java/org/apache/nutch/crawl/PartitionUrlByHost.java	(revision 449088)
+++ src/java/org/apache/nutch/crawl/PartitionUrlByHost.java	(working copy)
@@ -42,7 +42,7 @@
   /** Hash by hostname. */
   public int getPartition(WritableComparable key, Writable value,
                           int numReduceTasks) {
-    String urlString = ((UTF8)key).toString();
+    String urlString = ((Text)key).toString();
     try {
       urlString = normalizers.normalize(urlString, URLNormalizers.SCOPE_PARTITION);
     } catch (Exception e) {
Index: src/java/org/apache/nutch/crawl/Generator.java
===================================================================
--- src/java/org/apache/nutch/crawl/Generator.java	(revision 449088)
+++ src/java/org/apache/nutch/crawl/Generator.java	(working copy)
@@ -46,11 +46,11 @@
   public static final Log LOG = LogFactory.getLog(Generator.class);
   
   public static class SelectorEntry implements Writable {
-    public UTF8 url;
+    public Text url;
     public CrawlDatum datum;
     
     public SelectorEntry() {
-      url = new UTF8();
+      url = new Text();
       datum = new CrawlDatum();
     }
 
@@ -102,7 +102,7 @@
     public void map(WritableComparable key, Writable value,
                     OutputCollector output, Reporter reporter)
       throws IOException {
-      UTF8 url = (UTF8)key;
+      Text url = (Text)key;
       // don't generate URLs that don't pass URLFilters
       try {
         if (filters.filter(url.toString()) == null)
@@ -122,7 +122,7 @@
 
       float sort = 1.0f;
       try {
-        sort = scfilters.generatorSortValue((UTF8)key, crawlDatum, sort);
+        sort = scfilters.generatorSortValue((Text)key, crawlDatum, sort);
       } catch (ScoringFilterException sfe) {
         if (LOG.isWarnEnabled()) {
           LOG.warn("Couldn't filter generatorSortValue for " + key + ": " + sfe);
@@ -131,7 +131,7 @@
       // sort by decreasing score, using DecreasingFloatComparator
       sortValue.set(sort);
       entry.datum = crawlDatum;
-      entry.url = (UTF8)key;
+      entry.url = (Text)key;
       output.collect(sortValue, entry);          // invert for sort by score
     }
 
@@ -150,7 +150,7 @@
       while (values.hasNext() && count < limit) {
 
         SelectorEntry entry = (SelectorEntry)values.next();
-        UTF8 url = entry.url;
+        Text url = entry.url;
 
         if (maxPerHost > 0) {                     // are we counting hosts?
           String host = new URL(url.toString()).getHost();
@@ -236,11 +236,11 @@
   
   /** Sort fetch lists by hash of URL. */
   public static class HashComparator extends WritableComparator {
-    public HashComparator() { super(UTF8.class); }
+    public HashComparator() { super(Text.class); }
 
     public int compare(WritableComparable a, WritableComparable b) {
-      UTF8 url1 = (UTF8)a;
-      UTF8 url2 = (UTF8)b;
+      Text url1 = (Text)a;
+      Text url2 = (Text)b;
       int hash1 = hash(url1.getBytes(), 0, url1.getLength());
       int hash2 = hash(url2.getBytes(), 0, url2.getLength());
       if (hash1 != hash2) {
@@ -252,14 +252,12 @@
 
 
     public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
-      int n1 = readUnsignedShort(b1, s1);
-      int n2 = readUnsignedShort(b2, s2);
-      int hash1 = hash(b1, s1+2, n1);
-      int hash2 = hash(b2, s2+2, n2);
+      int hash1 = hash(b1, s1, l1);
+      int hash2 = hash(b2, s2, l2);
       if (hash1 != hash2) {
         return hash1 - hash2;
       }
-      return compareBytes(b1, s1+2, n1, b2, s2+2, n2);
+      return compareBytes(b1, s1, l1, b2, s2, l2);
     }
 
     private static int hash(byte[] bytes, int start, int length) {
@@ -319,7 +317,7 @@
 
     job.setInputPath(new Path(dbDir, CrawlDatum.DB_DIR_NAME));
     job.setInputFormat(SequenceFileInputFormat.class);
-    job.setInputKeyClass(UTF8.class);
+    job.setInputKeyClass(Text.class);
     job.setInputValueClass(CrawlDatum.class);
 
     job.setMapperClass(Selector.class);
@@ -353,7 +351,7 @@
 
     job.setOutputPath(output);
     job.setOutputFormat(SequenceFileOutputFormat.class);
-    job.setOutputKeyClass(UTF8.class);
+    job.setOutputKeyClass(Text.class);
     job.setOutputValueClass(CrawlDatum.class);
     job.setOutputKeyComparatorClass(HashComparator.class);
     JobClient.runJob(job);
Index: src/java/org/apache/nutch/crawl/LinkDbReader.java
===================================================================
--- src/java/org/apache/nutch/crawl/LinkDbReader.java	(revision 449084)
+++ src/java/org/apache/nutch/crawl/LinkDbReader.java	(working copy)
@@ -59,14 +59,14 @@
     this.directory = directory;
   }
 
-  public String[] getAnchors(UTF8 url) throws IOException {
+  public String[] getAnchors(Text url) throws IOException {
     Inlinks inlinks = getInlinks(url);
     if (inlinks == null)
       return null;
     return inlinks.getAnchors();
   }
 
-  public Inlinks getInlinks(UTF8 url) throws IOException {
+  public Inlinks getInlinks(Text url) throws IOException {
 
     if (readers == null) {
       synchronized(this) {
@@ -100,12 +100,12 @@
 
     job.addInputPath(new Path(linkdb, LinkDb.CURRENT_NAME));
     job.setInputFormat(SequenceFileInputFormat.class);
-    job.setInputKeyClass(UTF8.class);
+    job.setInputKeyClass(Text.class);
     job.setInputValueClass(Inlinks.class);
 
     job.setOutputPath(outFolder);
     job.setOutputFormat(TextOutputFormat.class);
-    job.setOutputKeyClass(UTF8.class);
+    job.setOutputKeyClass(Text.class);
     job.setOutputValueClass(Inlinks.class);
 
     JobClient.runJob(job);
@@ -129,7 +129,7 @@
         return 0;
       } else if (args[1].equals("-url")) {
         init(new Path(args[0]));
-        Inlinks links = getInlinks(new UTF8(args[2]));
+        Inlinks links = getInlinks(new Text(args[2]));
         if (links == null) {
           System.out.println(" - no link information.");
         } else {
Index: src/java/org/apache/nutch/crawl/CrawlDbReducer.java
===================================================================
--- src/java/org/apache/nutch/crawl/CrawlDbReducer.java	(revision 450799)
+++ src/java/org/apache/nutch/crawl/CrawlDbReducer.java	(working copy)
@@ -109,7 +109,7 @@
       } else {
         result.setStatus(CrawlDatum.STATUS_DB_UNFETCHED);
         try {
-          scfilters.initialScore((UTF8)key, result);
+          scfilters.initialScore((Text)key, result);
         } catch (ScoringFilterException e) {
           if (LOG.isWarnEnabled()) {
             LOG.warn("Cannot filter init score for url " + key +
@@ -152,7 +152,7 @@
     }
 
     try {
-      scfilters.updateDbScore((UTF8)key, old, result, linked);
+      scfilters.updateDbScore((Text)key, old, result, linked);
     } catch (Exception e) {
       if (LOG.isWarnEnabled()) {
         LOG.warn("Couldn't update score, key=" + key + ": " + e);
Index: src/java/org/apache/nutch/parse/ParseData.java
===================================================================
--- src/java/org/apache/nutch/parse/ParseData.java	(revision 449084)
+++ src/java/org/apache/nutch/parse/ParseData.java	(working copy)
@@ -119,7 +119,7 @@
       status = ParseStatus.read(in);
     else
       status = ParseStatus.STATUS_SUCCESS;
-    title = UTF8.readString(in);                   // read title
+    title = Text.readString(in);                   // read title
 
     int totalOutlinks = in.readInt();             // read outlinks
     int maxOutlinksPerPage = this.conf.getInt("db.max.outlinks.per.page", 100);
@@ -139,7 +139,7 @@
       int propertyCount = in.readInt();             // read metadata
       contentMeta = new Metadata();
       for (int i = 0; i < propertyCount; i++) {
-        contentMeta.add(UTF8.readString(in), UTF8.readString(in));
+        contentMeta.add(Text.readString(in), Text.readString(in));
       }
     } else {
       contentMeta = new Metadata();
@@ -154,7 +154,7 @@
   public final void write(DataOutput out) throws IOException {
     out.writeByte(VERSION);                       // write version
     status.write(out);                            // write status
-    UTF8.writeString(out, title);                 // write title
+    Text.writeString(out, title);                 // write title
 
     out.writeInt(outlinks.length);                // write outlinks
     for (int i = 0; i < outlinks.length; i++) {
Index: src/java/org/apache/nutch/parse/ParseOutputFormat.java
===================================================================
--- src/java/org/apache/nutch/parse/ParseOutputFormat.java	(revision 449102)
+++ src/java/org/apache/nutch/parse/ParseOutputFormat.java	(working copy)
@@ -66,13 +66,13 @@
       new Path(new Path(job.getOutputPath(), CrawlDatum.PARSE_DIR_NAME), name);
     
     final MapFile.Writer textOut =
-      new MapFile.Writer(fs, text.toString(), UTF8.class, ParseText.class);
+      new MapFile.Writer(fs, text.toString(), Text.class, ParseText.class);
     
     final MapFile.Writer dataOut =
-      new MapFile.Writer(fs, data.toString(), UTF8.class,ParseData.class,true);
+      new MapFile.Writer(fs, data.toString(), Text.class,ParseData.class,true);
     
     final SequenceFile.Writer crawlOut =
-      new SequenceFile.Writer(fs, crawl, UTF8.class, CrawlDatum.class);
+      new SequenceFile.Writer(fs, crawl, Text.class, CrawlDatum.class);
     
     return new RecordWriter() {
 
@@ -141,10 +141,10 @@
               }
             }
             CrawlDatum target = new CrawlDatum(CrawlDatum.STATUS_LINKED, interval);
-            UTF8 targetUrl = new UTF8(toUrls[i]);
+            Text targetUrl = new Text(toUrls[i]);
             adjust = null;
             try {
-              adjust = scfilters.distributeScoreToOutlink((UTF8)key, targetUrl,
+              adjust = scfilters.distributeScoreToOutlink((Text)key, targetUrl,
                       parseData, target, null, links.length, validCount);
             } catch (ScoringFilterException e) {
               if (LOG.isWarnEnabled()) {
Index: src/java/org/apache/nutch/parse/ParserChecker.java
===================================================================
--- src/java/org/apache/nutch/parse/ParserChecker.java	(revision 449084)
+++ src/java/org/apache/nutch/parse/ParserChecker.java	(working copy)
@@ -24,7 +24,7 @@
 import org.apache.nutch.util.NutchConfiguration;
 
 import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.hadoop.io.UTF8;
+import org.apache.hadoop.io.Text;
 import org.apache.nutch.parse.ParseUtil;
 
 import org.apache.nutch.protocol.ProtocolFactory;
@@ -75,7 +75,7 @@
     Configuration conf = NutchConfiguration.create();
     ProtocolFactory factory = new ProtocolFactory(conf);
     Protocol protocol = factory.getProtocol(url);
-    Content content = protocol.getProtocolOutput(new UTF8(url), new CrawlDatum()).getContent();
+    Content content = protocol.getProtocolOutput(new Text(url), new CrawlDatum()).getContent();
 
     if (force) {
       content.setContentType(contentType);
Index: src/java/org/apache/nutch/parse/Outlink.java
===================================================================
--- src/java/org/apache/nutch/parse/Outlink.java	(revision 449088)
+++ src/java/org/apache/nutch/parse/Outlink.java	(working copy)
@@ -37,19 +37,19 @@
   }
 
   public void readFields(DataInput in) throws IOException {
-    toUrl = UTF8.readString(in);
-    anchor = UTF8.readString(in);
+    toUrl = Text.readString(in);
+    anchor = Text.readString(in);
   }
 
   /** Skips over one Outlink in the input. */
   public static void skip(DataInput in) throws IOException {
-    UTF8.skip(in);                                // skip toUrl
-    UTF8.skip(in);                                // skip anchor
+    Text.skip(in);                                // skip toUrl
+    Text.skip(in);                                // skip anchor
   }
 
   public void write(DataOutput out) throws IOException {
-    UTF8.writeString(out, toUrl);
-    UTF8.writeString(out, anchor);
+    Text.writeString(out, toUrl);
+    Text.writeString(out, anchor);
   }
 
   public static Outlink read(DataInput in) throws IOException {
Index: src/java/org/apache/nutch/parse/ParseSegment.java
===================================================================
--- src/java/org/apache/nutch/parse/ParseSegment.java	(revision 449084)
+++ src/java/org/apache/nutch/parse/ParseSegment.java	(working copy)
@@ -75,7 +75,7 @@
     
     if (status.isSuccess()) {
       try {
-        scfilters.passScoreAfterParsing((UTF8)key, content, parse);
+        scfilters.passScoreAfterParsing((Text)key, content, parse);
       } catch (ScoringFilterException e) {
         if (LOG.isWarnEnabled()) {
           e.printStackTrace(LogUtil.getWarnStream(LOG));
@@ -107,14 +107,14 @@
 
     job.setInputPath(new Path(segment, Content.DIR_NAME));
     job.setInputFormat(SequenceFileInputFormat.class);
-    job.setInputKeyClass(UTF8.class);
+    job.setInputKeyClass(Text.class);
     job.setInputValueClass(Content.class);
     job.setMapperClass(ParseSegment.class);
     job.setReducerClass(ParseSegment.class);
     
     job.setOutputPath(segment);
     job.setOutputFormat(ParseOutputFormat.class);
-    job.setOutputKeyClass(UTF8.class);
+    job.setOutputKeyClass(Text.class);
     job.setOutputValueClass(ParseImpl.class);
 
     JobClient.runJob(job);
Index: src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java
===================================================================
--- src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java	(revision 449084)
+++ src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java	(working copy)
@@ -21,7 +21,7 @@
 import org.apache.nutch.crawl.Inlinks;
 import org.apache.nutch.indexer.IndexingFilter;
 import org.apache.nutch.indexer.IndexingException;
-import org.apache.hadoop.io.UTF8;
+import org.apache.hadoop.io.Text;
 import org.apache.nutch.parse.Parse;
 
 // Hadoop imports
@@ -47,7 +47,7 @@
 
 
   // Inherited JavaDoc
-  public Document filter(Document doc, Parse parse, UTF8 url, CrawlDatum datum, Inlinks inlinks)
+  public Document filter(Document doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
     throws IndexingException {
 
     // Check if some Rel-Tags found, possibly put there by RelTagParser
Index: src/plugin/parse-msword/src/test/org/apache/nutch/parse/msword/TestMSWordParser.java
===================================================================
--- src/plugin/parse-msword/src/test/org/apache/nutch/parse/msword/TestMSWordParser.java	(revision 449084)
+++ src/plugin/parse-msword/src/test/org/apache/nutch/parse/msword/TestMSWordParser.java	(working copy)
@@ -27,7 +27,7 @@
 import org.apache.hadoop.conf.Configuration;
 import org.apache.nutch.util.NutchConfiguration;
 
-import org.apache.hadoop.io.UTF8;
+import org.apache.hadoop.io.Text;
 import org.apache.nutch.crawl.CrawlDatum;
 
 import junit.framework.TestCase;
@@ -68,7 +68,7 @@
       urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
 
       protocol = new ProtocolFactory(conf).getProtocol(urlString);
-      content = protocol.getProtocolOutput(new UTF8(urlString), new CrawlDatum()).getContent();
+      content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
       parse = new ParseUtil(conf).parseByExtensionId("parse-msword", content);
 
       assertTrue(parse.getText().startsWith(expectedText));
Index: src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java
===================================================================
--- src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java	(revision 449084)
+++ src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java	(working copy)
@@ -19,6 +19,8 @@
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 
+import org.apache.lucene.document.DateField;
+import org.apache.lucene.document.DateTools;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 
@@ -26,7 +28,7 @@
 
 import org.apache.nutch.indexer.IndexingFilter;
 import org.apache.nutch.indexer.IndexingException;
-import org.apache.hadoop.io.UTF8;
+import org.apache.hadoop.io.Text;
 
 import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.crawl.Inlinks;
@@ -43,7 +45,7 @@
   private int MAX_TITLE_LENGTH;
   private Configuration conf;
 
-  public Document filter(Document doc, Parse parse, UTF8 url, CrawlDatum datum, Inlinks inlinks)
+  public Document filter(Document doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
     throws IndexingException {
     
     String host = null;
@@ -87,6 +89,11 @@
     }
     // add title indexed and stored so that it can be displayed
     doc.add(new Field("title", title, Field.Store.YES, Field.Index.TOKENIZED));
+    
+    // add timestamp when fetched, for deduplication
+    doc.add(new Field("tstamp",
+        DateTools.timeToString(datum.getFetchTime(), DateTools.Resolution.MILLISECOND),
+        Field.Store.YES, Field.Index.NO));
 
     return doc;
   }
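
A side note on the "tstamp" field added above (illustration only, not part of the patch): the value is stored with DateTools at millisecond resolution, so a deduplicator can recover the fetch time from the index roughly as below. The helper name and its arguments are hypothetical; only the field name "tstamp" and the DateTools encoding come from the patch.

    // Hypothetical read-back of the stored timestamp (assumes Lucene's IndexReader and DateTools):
    static long fetchTime(org.apache.lucene.index.IndexReader reader, int docId)
        throws java.io.IOException, java.text.ParseException {
      String tstamp = reader.document(docId).get("tstamp");
      return org.apache.lucene.document.DateTools.stringToTime(tstamp);
    }
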
Index: src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java
===================================================================
--- src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java	(revision 449084)
+++ src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java	(working copy)
@@ -21,7 +21,7 @@
 import org.apache.nutch.crawl.Inlinks;
 import org.apache.nutch.indexer.IndexingFilter;
 import org.apache.nutch.indexer.IndexingException;
-import org.apache.hadoop.io.UTF8;
+import org.apache.hadoop.io.Text;
 import org.apache.nutch.parse.Parse;
 import org.apache.nutch.metadata.Metadata;
 import org.apache.nutch.net.protocols.Response;
@@ -64,7 +64,7 @@
   }
 
   // Inherited JavaDoc
-  public Document filter(Document doc, Parse parse, UTF8 url, CrawlDatum datum, Inlinks inlinks)
+  public Document filter(Document doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
     throws IndexingException {
 
     // check if LANGUAGE found, possibly put there by HTMLLanguageParser
Index: src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java
===================================================================
--- src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java	(revision 449084)
+++ src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java	(working copy)
@@ -37,7 +37,7 @@
 
 // Hadoop imports
 import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.UTF8;
+import org.apache.hadoop.io.Text;
 
 // Nutch imports
 import org.apache.nutch.analysis.lang.NGramProfile.NGramEntry;
@@ -344,7 +344,7 @@
     Protocol protocol;
     try {
       protocol = new ProtocolFactory(conf).getProtocol(url);
-      Content content = protocol.getProtocolOutput(new UTF8(url), new CrawlDatum()).getContent();
+      Content content = protocol.getProtocolOutput(new Text(url), new CrawlDatum()).getContent();
       Parse parse = new ParseUtil(conf).parse(content);
       System.out.println("text:" + parse.getText());
       return parse.getText();
Index: src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java
===================================================================
--- src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java	(revision 449274)
+++ src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java	(working copy)
@@ -25,7 +25,7 @@
 import org.apache.commons.logging.LogFactory;
 
 import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.UTF8;
+import org.apache.hadoop.io.Text;
 import org.apache.lucene.document.Document;
 import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.crawl.Inlinks;
@@ -73,23 +73,23 @@
   }
 
   /** Set to the value defined in config, 1.0f by default. */
-  public void injectedScore(UTF8 url, CrawlDatum datum) throws ScoringFilterException {
+  public void injectedScore(Text url, CrawlDatum datum) throws ScoringFilterException {
     datum.setScore(scoreInjected);
   }
 
   /** Set to 0.0f (unknown value) - inlink contributions will bring it to
    * a correct level. Newly discovered pages have at least one inlink. */
-  public void initialScore(UTF8 url, CrawlDatum datum) throws ScoringFilterException {
+  public void initialScore(Text url, CrawlDatum datum) throws ScoringFilterException {
     datum.setScore(0.0f);
   }
 
   /** Use {@link CrawlDatum#getScore()}. */
-  public float generatorSortValue(UTF8 url, CrawlDatum datum, float initSort) throws ScoringFilterException {
+  public float generatorSortValue(Text url, CrawlDatum datum, float initSort) throws ScoringFilterException {
     return datum.getScore();
   }
 
   /** Increase the score by a sum of inlinked scores. */
-  public void updateDbScore(UTF8 url, CrawlDatum old, CrawlDatum datum, List inlinked) throws ScoringFilterException {
+  public void updateDbScore(Text url, CrawlDatum old, CrawlDatum datum, List inlinked) throws ScoringFilterException {
     float adjust = 0.0f;
     for (int i = 0; i < inlinked.size(); i++) {
       CrawlDatum linked = (CrawlDatum)inlinked.get(i);
@@ -100,17 +100,17 @@
   }
 
   /** Store a float value of CrawlDatum.getScore() under Fetcher.SCORE_KEY. */
-  public void passScoreBeforeParsing(UTF8 url, CrawlDatum datum, Content content) {
+  public void passScoreBeforeParsing(Text url, CrawlDatum datum, Content content) {
     content.getMetadata().set(Fetcher.SCORE_KEY, "" + datum.getScore());
   }
 
   /** Copy the value from Content metadata under Fetcher.SCORE_KEY to parseData. */
-  public void passScoreAfterParsing(UTF8 url, Content content, Parse parse) {
+  public void passScoreAfterParsing(Text url, Content content, Parse parse) {
     parse.getData().getContentMeta().set(Fetcher.SCORE_KEY, content.getMetadata().get(Fetcher.SCORE_KEY));
   }
 
   /** Get a float value from Fetcher.SCORE_KEY, divide it by the number of outlinks and apply. */
-  public CrawlDatum distributeScoreToOutlink(UTF8 fromUrl, UTF8 toUrl, ParseData parseData, CrawlDatum target, CrawlDatum adjust, int allCount, int validCount) throws ScoringFilterException {
+  public CrawlDatum distributeScoreToOutlink(Text fromUrl, Text toUrl, ParseData parseData, CrawlDatum target, CrawlDatum adjust, int allCount, int validCount) throws ScoringFilterException {
     float score = scoreInjected;
     String scoreString = parseData.getContentMeta().get(Fetcher.SCORE_KEY);
     if (scoreString != null) {
@@ -146,7 +146,7 @@
   }
 
   /** Dampen the boost value by scorePower.*/
-  public float indexerScore(UTF8 url, Document doc, CrawlDatum dbDatum, CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore) throws ScoringFilterException {
+  public float indexerScore(Text url, Document doc, CrawlDatum dbDatum, CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore) throws ScoringFilterException {
     return (float)Math.pow(dbDatum.getScore(), scorePower);
   }
 }
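Every ScoringFilter hook in the hunk above now receives its URL key as Text rather than UTF8. A short driver sketch exercising two of the reworked hooks; the class name and URL below are placeholders, and it assumes OPICScoringFilter exposes the usual no-argument constructor and Configurable setConf(), as Nutch plugins generally do.

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.io.Text;
    import org.apache.nutch.crawl.CrawlDatum;
    import org.apache.nutch.scoring.opic.OPICScoringFilter;
    import org.apache.nutch.util.NutchConfiguration;

    public class OpicTextKeyDemo {
      public static void main(String[] args) throws Exception {
        Configuration conf = NutchConfiguration.create();
        OPICScoringFilter filter = new OPICScoringFilter();
        filter.setConf(conf);

        // URL keys are passed as Text now, not UTF8.
        Text url = new Text("http://www.example.com/");
        CrawlDatum datum = new CrawlDatum();

        filter.injectedScore(url, datum);                          // seed the injected score
        float sort = filter.generatorSortValue(url, datum, 1.0f);  // read it back for generation
        System.out.println(url + " sort value: " + sort);
      }
    }
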
Index: src/plugin/parse-swf/src/test/org/apache/nutch/parse/swf/TestSWFParser.java
===================================================================
--- src/plugin/parse-swf/src/test/org/apache/nutch/parse/swf/TestSWFParser.java	(revision 449084)
+++ src/plugin/parse-swf/src/test/org/apache/nutch/parse/swf/TestSWFParser.java	(working copy)
@@ -20,7 +20,7 @@
 import java.io.InputStreamReader;
 
 import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.hadoop.io.UTF8;
+import org.apache.hadoop.io.Text;
 import org.apache.nutch.protocol.ProtocolFactory;
 import org.apache.nutch.protocol.Protocol;
 import org.apache.nutch.protocol.Content;
@@ -85,7 +85,7 @@
       urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
 
       protocol = new ProtocolFactory(conf).getProtocol(urlString);
-      content = protocol.getProtocolOutput(new UTF8(urlString), new CrawlDatum()).getContent();
+      content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
 
       parse = new ParseUtil(conf).parse(content);
 
Index: src/plugin/parse-msexcel/src/test/org/apache/nutch/parse/msexcel/TestMSExcelParser.java
===================================================================
--- src/plugin/parse-msexcel/src/test/org/apache/nutch/parse/msexcel/TestMSExcelParser.java	(revision 449084)
+++ src/plugin/parse-msexcel/src/test/org/apache/nutch/parse/msexcel/TestMSExcelParser.java	(working copy)
@@ -20,7 +20,7 @@
 
 // Hadoop imports
 import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.UTF8;
+import org.apache.hadoop.io.Text;
 
 
 /** 
@@ -61,7 +61,7 @@
       urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
 
       protocol = factory.getProtocol(urlString);
-      content = protocol.getProtocolOutput(new UTF8(urlString),
+      content = protocol.getProtocolOutput(new Text(urlString),
                                            new CrawlDatum()).getContent();
       parse = parser.parseByExtensionId("parse-msexcel", content);
 
Index: src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java
===================================================================
--- src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java	(revision 449084)
+++ src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java	(working copy)
@@ -27,7 +27,7 @@
 import org.apache.hadoop.conf.Configuration;
 import org.apache.nutch.util.NutchConfiguration;
 
-import org.apache.hadoop.io.UTF8;
+import org.apache.hadoop.io.Text;
 import org.apache.nutch.crawl.CrawlDatum;
 
 import junit.framework.TestCase;
@@ -68,7 +68,7 @@
       urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
 
       protocol = new ProtocolFactory(conf).getProtocol(urlString);
-      content = protocol.getProtocolOutput(new UTF8(urlString), new CrawlDatum()).getContent();
+      content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
       parse = new ParseUtil(conf).parseByExtensionId("parse-zip",content);
       assertTrue(parse.getText().equals(expectedText));
     }
Index: src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java
===================================================================
--- src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java	(revision 449084)
+++ src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java	(working copy)
@@ -20,7 +20,7 @@
 import org.apache.commons.logging.LogFactory;
 
 import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.hadoop.io.UTF8;
+import org.apache.hadoop.io.Text;
 import org.apache.nutch.metadata.Metadata;
 import org.apache.nutch.net.protocols.HttpDateFormat;
 import org.apache.nutch.net.protocols.Response;
@@ -67,7 +67,7 @@
   /** Set the point at which content is truncated. */
   public void setMaxContentLength(int length) {maxContentLength = length;}
 
-  public ProtocolOutput getProtocolOutput(UTF8 url, CrawlDatum datum) {
+  public ProtocolOutput getProtocolOutput(Text url, CrawlDatum datum) {
     String urlString = url.toString();
     try {
       URL u = new URL(urlString);
@@ -141,7 +141,7 @@
     // set log level
     //LOG.setLevel(Level.parse((new String(logLevel)).toUpperCase()));
 
-    Content content = file.getProtocolOutput(new UTF8(urlString), new CrawlDatum()).getContent();
+    Content content = file.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
 
     System.err.println("Content-Type: " + content.getContentType());
     System.err.println("Content-Length: " +
Index: src/plugin/parse-rss/src/test/org/apache/nutch/parse/rss/TestRSSParser.java
===================================================================
--- src/plugin/parse-rss/src/test/org/apache/nutch/parse/rss/TestRSSParser.java	(revision 449084)
+++ src/plugin/parse-rss/src/test/org/apache/nutch/parse/rss/TestRSSParser.java	(working copy)
@@ -29,7 +29,7 @@
 import org.apache.hadoop.conf.Configuration;
 import org.apache.nutch.util.NutchConfiguration;
 
-import org.apache.hadoop.io.UTF8;
+import org.apache.hadoop.io.Text;
 import org.apache.nutch.crawl.CrawlDatum;
 
 import junit.framework.TestCase;
@@ -86,7 +86,7 @@
             urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
 
             protocol = new ProtocolFactory(conf).getProtocol(urlString);
-            content = protocol.getProtocolOutput(new UTF8(urlString), new CrawlDatum()).getContent();
+            content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
             parse = new ParseUtil(conf).parseByExtensionId("parse-rss", content);
 
             //check that there are 3 outlinks:
Index: src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/RSSParser.java
===================================================================
--- src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/RSSParser.java	(revision 449084)
+++ src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/RSSParser.java	(working copy)
@@ -27,7 +27,7 @@
 import org.apache.commons.logging.LogFactory;
 
 // Hadoop imports
-import org.apache.hadoop.io.UTF8;
+import org.apache.hadoop.io.Text;
 import org.apache.hadoop.conf.Configuration;
 
 // Nutch imports
@@ -216,7 +216,7 @@
     RSSParser parser = new RSSParser();
     parser.setConf(conf);
     Protocol protocol = new ProtocolFactory(conf).getProtocol(url);
-    Content content = protocol.getProtocolOutput(new UTF8(url), new CrawlDatum()).getContent();
+    Content content = protocol.getProtocolOutput(new Text(url), new CrawlDatum()).getContent();
     Parse parse = parser.getParse(content);
     System.out.println("data: "+ parse.getData());
     System.out.println("text: "+parse.getText());
Index: src/plugin/parse-pdf/src/test/org/apache/nutch/parse/pdf/TestPdfParser.java
===================================================================
--- src/plugin/parse-pdf/src/test/org/apache/nutch/parse/pdf/TestPdfParser.java	(revision 449084)
+++ src/plugin/parse-pdf/src/test/org/apache/nutch/parse/pdf/TestPdfParser.java	(working copy)
@@ -27,7 +27,7 @@
 import org.apache.hadoop.conf.Configuration;
 import org.apache.nutch.util.NutchConfiguration;
 
-import org.apache.hadoop.io.UTF8;
+import org.apache.hadoop.io.Text;
 import org.apache.nutch.crawl.CrawlDatum;
 
 import junit.framework.TestCase;
@@ -68,7 +68,7 @@
 
       Configuration conf = NutchConfiguration.create();
       protocol = new ProtocolFactory(conf).getProtocol(urlString);
-      content = protocol.getProtocolOutput(new UTF8(urlString), new CrawlDatum()).getContent();
+      content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
       parse = new ParseUtil(conf).parseByExtensionId("parse-pdf", content);
 
       int index = parse.getText().indexOf(expectedText);
Index: src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
===================================================================
--- src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java	(revision 449738)
+++ src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java	(working copy)
@@ -40,7 +40,7 @@
 
 // Hadoop imports
 import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.UTF8;
+import org.apache.hadoop.io.Text;
 
 
 /**
@@ -170,7 +170,7 @@
    
   
   
-  public ProtocolOutput getProtocolOutput(UTF8 url, CrawlDatum datum) {
+  public ProtocolOutput getProtocolOutput(Text url, CrawlDatum datum) {
     
     String urlString = url.toString();
     try {
@@ -504,7 +504,7 @@
 //      LOGGER.setLevel(Level.FINE);
 //    }
     
-    ProtocolOutput out = http.getProtocolOutput(new UTF8(url), new CrawlDatum());
+    ProtocolOutput out = http.getProtocolOutput(new Text(url), new CrawlDatum());
     Content content = out.getContent();
     
     System.out.println("Status: " + out.getStatus());
Index: src/plugin/parse-oo/src/test/org/apache/nutch/parse/oo/TestOOParser.java
===================================================================
--- src/plugin/parse-oo/src/test/org/apache/nutch/parse/oo/TestOOParser.java	(revision 449084)
+++ src/plugin/parse-oo/src/test/org/apache/nutch/parse/oo/TestOOParser.java	(working copy)
@@ -21,7 +21,7 @@
 
 import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.UTF8;
+import org.apache.hadoop.io.Text;
 import org.apache.nutch.protocol.*;
 
 import org.apache.nutch.parse.Parse;
@@ -87,7 +87,7 @@
       urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
 
       protocol = factory.getProtocol(urlString);
-      content = protocol.getProtocolOutput(new UTF8(urlString), new CrawlDatum()).getContent();
+      content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
 
       parse = parser.getParse(content);
 
Index: src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java
===================================================================
--- src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java	(revision 449084)
+++ src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java	(working copy)
@@ -22,7 +22,7 @@
 import org.apache.commons.net.ftp.FTPFileEntryParser;
 
 import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.hadoop.io.UTF8;
+import org.apache.hadoop.io.Text;
 import org.apache.nutch.net.protocols.HttpDateFormat;
 import org.apache.nutch.net.protocols.Response;
 
@@ -111,7 +111,7 @@
     this.keepConnection = keepConnection;
   }
 
-  public ProtocolOutput getProtocolOutput(UTF8 url, CrawlDatum datum) {
+  public ProtocolOutput getProtocolOutput(Text url, CrawlDatum datum) {
     String urlString = url.toString();
     try {
       URL u = new URL(urlString);
@@ -207,7 +207,7 @@
     // set log level
     //LOG.setLevel(Level.parse((new String(logLevel)).toUpperCase()));
 
-    Content content = ftp.getProtocolOutput(new UTF8(urlString), new CrawlDatum()).getContent();
+    Content content = ftp.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
 
     System.err.println("Content-Type: " + content.getContentType());
     System.err.println("Content-Length: " +
Index: src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/LocalNutchInputComponent.java
===================================================================
--- src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/LocalNutchInputComponent.java	(revision 449084)
+++ src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/LocalNutchInputComponent.java	(working copy)
@@ -1,123 +1,123 @@
-/**
- * Copyright 2005 The Apache Software Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.clustering.carrot2;
-
-import java.io.StringReader;
-import java.util.Arrays;
-import java.util.HashSet;
-import java.util.Set;
-import java.util.Map;
-
-import org.apache.xerces.parsers.AbstractSAXParser;
-import org.cyberneko.html.HTMLConfiguration;
-import org.xml.sax.InputSource;
-import org.xml.sax.SAXException;
-import org.xml.sax.helpers.DefaultHandler;
-
-import org.apache.nutch.searcher.HitDetails;
-
-import com.dawidweiss.carrot.core.local.LocalInputComponentBase;
-import com.dawidweiss.carrot.core.local.ProcessingException;
-import com.dawidweiss.carrot.core.local.RequestContext;
-import com.dawidweiss.carrot.core.local.clustering.*;
-
-/**
- * A local input component that ignores the query passed from the
- * controller and instead looks for data stored in the request context.
- * This enables us to reuse the same physical component implementation
- * for data that has already been acquired from Nutch.    
- *
- * @author Dawid Weiss
- * @version $Id: LocalNutchInputComponent.java,v 1.1 2004/08/09 23:23:53 johnnx Exp $
- */
-public class LocalNutchInputComponent extends LocalInputComponentBase {
-  public final static String NUTCH_INPUT_HIT_DETAILS_ARRAY
-    = "NUTCH_INPUT_HIT_DETAILS_ARRAY";
-
-  public final static String NUTCH_INPUT_SUMMARIES_ARRAY 
-    = "NUTCH_INPUT_SUMMARIES_ARRAY";
-
-  /** Capabilities required from the next component in the chain */
-  private final static Set SUCCESSOR_CAPABILITIES 
-    = new HashSet(Arrays.asList(new Object [] { RawDocumentsConsumer.class }));
-
-  /** This component's capabilities */
-  private final static Set COMPONENT_CAPABILITIES 
-    = new HashSet(Arrays.asList(new Object [] { RawDocumentsProducer.class }));
-
-  /**
-   * Default language code for hits that don't have their own.
-   */
-  private String defaultLanguage;
-
-  /**
-   * Creates an input component with the given default language code.
-   */
-  public LocalNutchInputComponent(String defaultLanguage) {
-    this.defaultLanguage = defaultLanguage;
-  }
-
-  /*
-   * @see com.dawidweiss.carrot.core.local.LocalInputComponent#setQuery(java.lang.String)
-   */
-  public void setQuery(String query) {
-      // ignore the query; data will be provided from the request context.
-  }
-
-  /**
-   * A callback hook that starts the processing.
-   */
-  public void startProcessing(RequestContext context) throws ProcessingException {
-    // let successor components know that the processing has started.
-    super.startProcessing(context);
-    
-    // get the information about documents from the context.
-    final Map params = context.getRequestParameters();
-    final HitDetails [] details = (HitDetails[]) params.get(NUTCH_INPUT_HIT_DETAILS_ARRAY);
-    final String [] summaries = (String[]) params.get(NUTCH_INPUT_SUMMARIES_ARRAY);
-    
-    if (details == null)
-      throw new ProcessingException("Details array must not be null.");
-
-    if (summaries == null)
-      throw new ProcessingException("Summaries array must not be null.");
-    
-    if (summaries.length != details.length)
-      throw new ProcessingException("Summaries and details must be of the same length.");
-    
-    // produce 'documents' for successor components.
-    final RawDocumentsConsumer consumer = (RawDocumentsConsumer) next;
-    for (int i=0;i<summaries.length;i++) {
-      consumer.addDocument(new NutchDocument(i, details[i], summaries[i], defaultLanguage));
-    }
-  }
-
-  /**
-   * Returns the capabilities provided by this component.
-   */
-  public Set getComponentCapabilities() {
-    return COMPONENT_CAPABILITIES;
-  }
-    
-  /**
-   * Returns the capabilities required from the successor component.
-   */
-  public Set getRequiredSuccessorCapabilities() {
-    return SUCCESSOR_CAPABILITIES;
-  }
-
-}
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.clustering.carrot2;
+
+import java.io.StringReader;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Set;
+import java.util.Map;
+
+import org.apache.xerces.parsers.AbstractSAXParser;
+import org.cyberneko.html.HTMLConfiguration;
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+import org.apache.nutch.searcher.HitDetails;
+
+import com.dawidweiss.carrot.core.local.LocalInputComponentBase;
+import com.dawidweiss.carrot.core.local.ProcessingException;
+import com.dawidweiss.carrot.core.local.RequestContext;
+import com.dawidweiss.carrot.core.local.clustering.*;
+
+/**
+ * A local input component that ignores the query passed from the
+ * controller and instead looks for data stored in the request context.
+ * This enables us to reuse the same physical component implementation
+ * for data that has already been acquired from Nutch.    
+ *
+ * @author Dawid Weiss
+ * @version $Id: LocalNutchInputComponent.java,v 1.1 2004/08/09 23:23:53 johnnx Exp $
+ */
+public class LocalNutchInputComponent extends LocalInputComponentBase {
+  public final static String NUTCH_INPUT_HIT_DETAILS_ARRAY
+    = "NUTCH_INPUT_HIT_DETAILS_ARRAY";
+
+  public final static String NUTCH_INPUT_SUMMARIES_ARRAY 
+    = "NUTCH_INPUT_SUMMARIES_ARRAY";
+
+  /** Capabilities required from the next component in the chain */
+  private final static Set SUCCESSOR_CAPABILITIES 
+    = new HashSet(Arrays.asList(new Object [] { RawDocumentsConsumer.class }));
+
+  /** This component's capabilities */
+  private final static Set COMPONENT_CAPABILITIES 
+    = new HashSet(Arrays.asList(new Object [] { RawDocumentsProducer.class }));
+
+  /**
+   * Default language code for hits that don't have their own.
+   */
+  private String defaultLanguage;
+
+  /**
+   * Creates an input component with the given default language code.
+   */
+  public LocalNutchInputComponent(String defaultLanguage) {
+    this.defaultLanguage = defaultLanguage;
+  }
+
+  /*
+   * @see com.dawidweiss.carrot.core.local.LocalInputComponent#setQuery(java.lang.String)
+   */
+  public void setQuery(String query) {
+      // ignore the query; data will be provided from the request context.
+  }
+
+  /**
+   * A callback hook that starts the processing.
+   */
+  public void startProcessing(RequestContext context) throws ProcessingException {
+    // let successor components know that the processing has started.
+    super.startProcessing(context);
+    
+    // get the information about documents from the context.
+    final Map params = context.getRequestParameters();
+    final HitDetails [] details = (HitDetails[]) params.get(NUTCH_INPUT_HIT_DETAILS_ARRAY);
+    final String [] summaries = (String[]) params.get(NUTCH_INPUT_SUMMARIES_ARRAY);
+    
+    if (details == null)
+      throw new ProcessingException("Details array must not be null.");
+
+    if (summaries == null)
+      throw new ProcessingException("Summaries array must not be null.");
+    
+    if (summaries.length != details.length)
+      throw new ProcessingException("Summaries and details must be of the same length.");
+    
+    // produce 'documents' for successor components.
+    final RawDocumentsConsumer consumer = (RawDocumentsConsumer) next;
+    for (int i=0;i<summaries.length;i++) {
+      consumer.addDocument(new NutchDocument(i, details[i], summaries[i], defaultLanguage));
+    }
+  }
+
+  /**
+   * Returns the capabilities provided by this component.
+   */
+  public Set getComponentCapabilities() {
+    return COMPONENT_CAPABILITIES;
+  }
+    
+  /**
+   * Returns the capabilities required from the successor component.
+   */
+  public Set getRequiredSuccessorCapabilities() {
+    return SUCCESSOR_CAPABILITIES;
+  }
+
+}
Index: src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/NutchDocument.java
===================================================================
--- src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/NutchDocument.java	(revision 449084)
+++ src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/NutchDocument.java	(working copy)
@@ -1,72 +1,72 @@
-/**
- * Copyright 2005 The Apache Software Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.clustering.carrot2;
-
-import org.apache.nutch.searcher.HitDetails;
-
-import com.dawidweiss.carrot.core.local.clustering.RawDocument;
-import com.dawidweiss.carrot.core.local.clustering.RawDocumentBase;
-
-/**
- * An adapter class that implements {@link RawDocument} required for Carrot2.  
- *
- * @author Dawid Weiss
- * @version $Id: NutchDocument.java,v 1.2 2004/08/10 00:18:43 johnnx Exp $
- */
-public class NutchDocument extends RawDocumentBase {
-  /**
-   * Integer identifier of this document. We need a subclass of 
-   * {@link java.lang.Object}, so this should do.
-   */
-  private final Integer id;
-  
-  /**
-   * Creates a new document with the given id, <code>summary</code> and wrapping
-   * a <code>details</code> hit details.
-   */
-  public NutchDocument(int id, HitDetails details, String summary, String defaultLanguage) {
-    super.setProperty(RawDocument.PROPERTY_URL, details.getValue("url"));
-    super.setProperty(RawDocument.PROPERTY_SNIPPET, summary);
-
-    final String title = details.getValue("title");
-    if (title != null && !"".equals(title)) {
-      super.setProperty(RawDocument.PROPERTY_TITLE, title);
-    }
-    
-    String lang = details.getValue("lang");
-    if (lang == null) {
-      // No default language. Take the default from the configuration file.
-      lang = defaultLanguage;
-    }
-    // Use this language for the snippet. Truncate longer ISO codes
-    // to only include two-letter language code.
-    if (lang.length() > 2) {
-      lang = lang.substring(0, 2);
-    }
-    lang = lang.toLowerCase();
-    super.setProperty(RawDocument.PROPERTY_LANGUAGE, lang);
-
-    this.id = new Integer(id);
-  }
-
-  /*
-   * @see com.dawidweiss.carrot.core.local.clustering.RawDocument#getId()
-   */
-  public Object getId() {
-    return id;
-  }
-}
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.clustering.carrot2;
+
+import org.apache.nutch.searcher.HitDetails;
+
+import com.dawidweiss.carrot.core.local.clustering.RawDocument;
+import com.dawidweiss.carrot.core.local.clustering.RawDocumentBase;
+
+/**
+ * An adapter class that implements {@link RawDocument} required for Carrot2.  
+ *
+ * @author Dawid Weiss
+ * @version $Id: NutchDocument.java,v 1.2 2004/08/10 00:18:43 johnnx Exp $
+ */
+public class NutchDocument extends RawDocumentBase {
+  /**
+   * Integer identifier of this document. We need a subclass of 
+   * {@link java.lang.Object}, so this should do.
+   */
+  private final Integer id;
+  
+  /**
+   * Creates a new document with the given id, <code>summary</code> and wrapping
+   * a <code>details</code> hit details.
+   */
+  public NutchDocument(int id, HitDetails details, String summary, String defaultLanguage) {
+    super.setProperty(RawDocument.PROPERTY_URL, details.getValue("url"));
+    super.setProperty(RawDocument.PROPERTY_SNIPPET, summary);
+
+    final String title = details.getValue("title");
+    if (title != null && !"".equals(title)) {
+      super.setProperty(RawDocument.PROPERTY_TITLE, title);
+    }
+    
+    String lang = details.getValue("lang");
+    if (lang == null) {
+      // No default language. Take the default from the configuration file.
+      lang = defaultLanguage;
+    }
+    // Use this language for the snippet. Truncate longer ISO codes
+    // to only include two-letter language code.
+    if (lang.length() > 2) {
+      lang = lang.substring(0, 2);
+    }
+    lang = lang.toLowerCase();
+    super.setProperty(RawDocument.PROPERTY_LANGUAGE, lang);
+
+    this.id = new Integer(id);
+  }
+
+  /*
+   * @see com.dawidweiss.carrot.core.local.clustering.RawDocument#getId()
+   */
+  public Object getId() {
+    return id;
+  }
+}
Index: src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/Clusterer.java
===================================================================
--- src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/Clusterer.java	(revision 449084)
+++ src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/Clusterer.java	(working copy)
@@ -1,245 +1,245 @@
-/**
- * Copyright 2005 The Apache Software Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.clustering.carrot2;
-
-import java.util.*;
-
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-
-import org.apache.hadoop.conf.Configurable;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.clustering.HitsCluster;
-import org.apache.nutch.clustering.OnlineClusterer;
-import org.apache.nutch.searcher.HitDetails;
-
-import com.dawidweiss.carrot.core.local.*;
-import com.dawidweiss.carrot.core.local.clustering.RawCluster;
-import com.dawidweiss.carrot.core.local.impl.ClustersConsumerOutputComponent;
-import com.dawidweiss.carrot.core.local.linguistic.Language;
-import com.dawidweiss.carrot.util.tokenizer.languages.AllKnownLanguages;
-import com.stachoodev.carrot.filter.lingo.local.LingoLocalFilterComponent;
-
-
-/**
- * An plugin providing an implementation of {@link OnlineClusterer} 
- * extension using clustering components of the Carrot2 project
- * (<a href="http://carrot2.sourceforge.net">http://carrot2.sourceforge.net</a>).
- * 
- * We hardcode the following Carrot2 process:
- * <pre><![CDATA[
- * <local-process id="yahoo-lingo">
- *   <name>Yahoo Search API -- Lingo Classic Clusterer</name>
- * 
- *   <input  component-key="input-localnutch" />
- *   <filter component-key="filter-lingo" />
- *   <output component-key="output-clustersConsumer" />
- * </local-process>
- * ]]></pre>
- *
- * @author Dawid Weiss
- * @version $Id: Clusterer.java,v 1.1 2004/08/09 23:23:53 johnnx Exp $
- */
-public class Clusterer implements OnlineClusterer, Configurable {
-  /** Default language property name. */
-  private final static String CONF_PROP_DEFAULT_LANGUAGE =
-    "extension.clustering.carrot2.defaultLanguage";
-
-  /** Recognizable languages property name. */
-  private final static String CONF_PROP_LANGUAGES =
-    "extension.clustering.carrot2.languages";
-
-  /** Internal clustering process ID in Carrot2 LocalController */
-  private final static String PROCESS_ID = "nutch-lingo";
-  
-  public static final Log logger = LogFactory.getLog(Clusterer.class);  
-
-  /** The LocalController instance used for clustering */
-  private LocalController controller;
-
-  /** Nutch configuration. */
-  private Configuration conf;
-
-  /** 
-   * Default language for hits. English by default, but may be changed
-   * via a property in Nutch configuration. 
-   */
-  private String defaultLanguage = "en";
-
-  /** 
-   * A list of recognizable languages..
-   * English only by default, but configurable via Nutch configuration.
-   */
-  private String [] languages = new String [] {defaultLanguage};
-
-  /**
-   * An empty public constructor for making new instances
-   * of the clusterer.
-   */
-  public Clusterer() {
-    initialize();
-  }
-
-  private synchronized void initialize() {
-    controller = new LocalControllerBase();
-    addComponentFactories();
-    addProcesses();
-  }
-
-  /** Adds the required component factories to a local Carrot2 controller. */
-  private void addComponentFactories() {
-    //  *   <input  component-key="input-localnutch" />
-    LocalComponentFactory nutchInputFactory = new LocalComponentFactoryBase() {
-      public LocalComponent getInstance() {
-        return new LocalNutchInputComponent(defaultLanguage);
-      }
-    };
-    controller.addLocalComponentFactory("input-localnutch", nutchInputFactory);
-
-    // *   <filter component-key="filter-lingo" />
-    LocalComponentFactory lingoFactory = new LocalComponentFactoryBase() {
-      public LocalComponent getInstance() {
-        HashMap defaults = new HashMap();
-
-        // These are adjustments settings for the clustering algorithm.
-        // If you try the live WebStart demo of Carrot2 you can see how they affect
-        // the final clustering: http://www.carrot2.org/webstart 
-        defaults.put("lsi.threshold.clusterAssignment", "0.150");
-        defaults.put("lsi.threshold.candidateCluster",  "0.775");
-
-        // Initialize a new Lingo clustering component.
-        ArrayList languageList = new ArrayList(languages.length);
-        for (int i = 0; i < languages.length; i++) {
-          final String lcode = languages[i];
-          try {
-            Language lang = AllKnownLanguages.getLanguageForIsoCode(lcode);
-            if (lang == null) {
-              if (logger.isWarnEnabled()) {
-                logger.warn("Language not supported in Carrot2: " + lcode);
-              }
-            } else {
-              languageList.add(lang);
-              if (logger.isDebugEnabled()) {
-                logger.debug("Language loaded: " + lcode);
-              }
-            }
-          } catch (Throwable t) {
-            if (logger.isWarnEnabled()) {
-              logger.warn("Language could not be loaded: " + lcode, t);
-            }
-          }
-        }
-        return new LingoLocalFilterComponent(
-          (Language []) languageList.toArray(new Language [languageList.size()]), defaults);
-      }
-    };
-    controller.addLocalComponentFactory("filter-lingo", lingoFactory);
-
-    // *   <output component-key="output-clustersConsumer" />
-    LocalComponentFactory clusterConsumerOutputFactory = new LocalComponentFactoryBase() {
-      public LocalComponent getInstance() {
-        return new ClustersConsumerOutputComponent();
-      }
-    };
-    controller.addLocalComponentFactory("output-clustersConsumer", 
-      clusterConsumerOutputFactory);
-  }
-
-  /** 
-   * Adds a hardcoded clustering process to the local controller.
-   */  
-  private void addProcesses() {
-    LocalProcessBase process = new LocalProcessBase(
-        "input-localnutch",                                   // input
-        "output-clustersConsumer",                            // output
-        new String [] {"filter-lingo"},                       // filters
-        "The Lingo clustering algorithm (www.carrot2.org).",
-        "");
-
-    try {
-      controller.addProcess(PROCESS_ID, process);
-    } catch (Exception e) {
-      throw new RuntimeException("Could not assemble clustering process.", e);
-    }
-  }
-  
-  /**
-   * See {@link OnlineClusterer} for documentation.
-   */
-  public HitsCluster [] clusterHits(HitDetails [] hitDetails, String [] descriptions) {
-    Map requestParams = new HashMap();
-    requestParams.put(LocalNutchInputComponent.NUTCH_INPUT_HIT_DETAILS_ARRAY,
-      hitDetails);
-    requestParams.put(LocalNutchInputComponent.NUTCH_INPUT_SUMMARIES_ARRAY,
-      descriptions);
-
-    try {
-      // The input component takes Nutch's results so we don't need the query argument.
-      final ProcessingResult result = 
-        controller.query(PROCESS_ID, "no-query", requestParams);
-
-      final ClustersConsumerOutputComponent.Result output =
-        (ClustersConsumerOutputComponent.Result) result.getQueryResult();
-
-      final List outputClusters = output.clusters;
-      final HitsCluster [] clusters = new HitsCluster[ outputClusters.size() ];
-
-      int j = 0;
-      for (Iterator i = outputClusters.iterator(); i.hasNext(); j++) {
-        RawCluster rcluster = (RawCluster) i.next();
-        clusters[j] = new HitsClusterAdapter(rcluster, hitDetails);
-      }
-
-      // invoke Carrot2 process here.
-      return clusters;
-    } catch (MissingProcessException e) {
-      throw new RuntimeException("Missing clustering process.", e);
-    } catch (Exception e) {
-      throw new RuntimeException("Unidentified problems with the clustering.", e);
-    }
-  }
-
-  /**
-   * Implementation of {@link Configurable}
-   */
-  public void setConf(Configuration conf) {
-    this.conf = conf;
-    
-    // Configure default language and other component settings.
-    if (conf.get(CONF_PROP_DEFAULT_LANGUAGE) != null) {
-      // Change the default language.
-      this.defaultLanguage = conf.get(CONF_PROP_DEFAULT_LANGUAGE);
-    } 
-    if (conf.getStrings(CONF_PROP_LANGUAGES) != null) {
-      this.languages = conf.getStrings(CONF_PROP_LANGUAGES);
-    }
-
-    if (logger.isInfoEnabled()) {
-      logger.info("Default language: " + defaultLanguage);
-      logger.info("Enabled languages: " + Arrays.asList(languages));
-    }
-
-    initialize();
-  }
-
-  /**
-   * Implementation of {@link Configurable}
-   */
-  public Configuration getConf() {
-    return conf;
-  }
-}
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.clustering.carrot2;
+
+import java.util.*;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+import org.apache.hadoop.conf.Configurable;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.clustering.HitsCluster;
+import org.apache.nutch.clustering.OnlineClusterer;
+import org.apache.nutch.searcher.HitDetails;
+
+import com.dawidweiss.carrot.core.local.*;
+import com.dawidweiss.carrot.core.local.clustering.RawCluster;
+import com.dawidweiss.carrot.core.local.impl.ClustersConsumerOutputComponent;
+import com.dawidweiss.carrot.core.local.linguistic.Language;
+import com.dawidweiss.carrot.util.tokenizer.languages.AllKnownLanguages;
+import com.stachoodev.carrot.filter.lingo.local.LingoLocalFilterComponent;
+
+
+/**
+ * An plugin providing an implementation of {@link OnlineClusterer} 
+ * extension using clustering components of the Carrot2 project
+ * (<a href="http://carrot2.sourceforge.net">http://carrot2.sourceforge.net</a>).
+ * 
+ * We hardcode the following Carrot2 process:
+ * <pre><![CDATA[
+ * <local-process id="yahoo-lingo">
+ *   <name>Yahoo Search API -- Lingo Classic Clusterer</name>
+ * 
+ *   <input  component-key="input-localnutch" />
+ *   <filter component-key="filter-lingo" />
+ *   <output component-key="output-clustersConsumer" />
+ * </local-process>
+ * ]]></pre>
+ *
+ * @author Dawid Weiss
+ * @version $Id: Clusterer.java,v 1.1 2004/08/09 23:23:53 johnnx Exp $
+ */
+public class Clusterer implements OnlineClusterer, Configurable {
+  /** Default language property name. */
+  private final static String CONF_PROP_DEFAULT_LANGUAGE =
+    "extension.clustering.carrot2.defaultLanguage";
+
+  /** Recognizable languages property name. */
+  private final static String CONF_PROP_LANGUAGES =
+    "extension.clustering.carrot2.languages";
+
+  /** Internal clustering process ID in Carrot2 LocalController */
+  private final static String PROCESS_ID = "nutch-lingo";
+  
+  public static final Log logger = LogFactory.getLog(Clusterer.class);  
+
+  /** The LocalController instance used for clustering */
+  private LocalController controller;
+
+  /** Nutch configuration. */
+  private Configuration conf;
+
+  /** 
+   * Default language for hits. English by default, but may be changed
+   * via a property in Nutch configuration. 
+   */
+  private String defaultLanguage = "en";
+
+  /** 
+   * A list of recognizable languages..
+   * English only by default, but configurable via Nutch configuration.
+   */
+  private String [] languages = new String [] {defaultLanguage};
+
+  /**
+   * An empty public constructor for making new instances
+   * of the clusterer.
+   */
+  public Clusterer() {
+    initialize();
+  }
+
+  private synchronized void initialize() {
+    controller = new LocalControllerBase();
+    addComponentFactories();
+    addProcesses();
+  }
+
+  /** Adds the required component factories to a local Carrot2 controller. */
+  private void addComponentFactories() {
+    //  *   <input  component-key="input-localnutch" />
+    LocalComponentFactory nutchInputFactory = new LocalComponentFactoryBase() {
+      public LocalComponent getInstance() {
+        return new LocalNutchInputComponent(defaultLanguage);
+      }
+    };
+    controller.addLocalComponentFactory("input-localnutch", nutchInputFactory);
+
+    // *   <filter component-key="filter-lingo" />
+    LocalComponentFactory lingoFactory = new LocalComponentFactoryBase() {
+      public LocalComponent getInstance() {
+        HashMap defaults = new HashMap();
+
+        // These are adjustments settings for the clustering algorithm.
+        // If you try the live WebStart demo of Carrot2 you can see how they affect
+        // the final clustering: http://www.carrot2.org/webstart 
+        defaults.put("lsi.threshold.clusterAssignment", "0.150");
+        defaults.put("lsi.threshold.candidateCluster",  "0.775");
+
+        // Initialize a new Lingo clustering component.
+        ArrayList languageList = new ArrayList(languages.length);
+        for (int i = 0; i < languages.length; i++) {
+          final String lcode = languages[i];
+          try {
+            Language lang = AllKnownLanguages.getLanguageForIsoCode(lcode);
+            if (lang == null) {
+              if (logger.isWarnEnabled()) {
+                logger.warn("Language not supported in Carrot2: " + lcode);
+              }
+            } else {
+              languageList.add(lang);
+              if (logger.isDebugEnabled()) {
+                logger.debug("Language loaded: " + lcode);
+              }
+            }
+          } catch (Throwable t) {
+            if (logger.isWarnEnabled()) {
+              logger.warn("Language could not be loaded: " + lcode, t);
+            }
+          }
+        }
+        return new LingoLocalFilterComponent(
+          (Language []) languageList.toArray(new Language [languageList.size()]), defaults);
+      }
+    };
+    controller.addLocalComponentFactory("filter-lingo", lingoFactory);
+
+    // *   <output component-key="output-clustersConsumer" />
+    LocalComponentFactory clusterConsumerOutputFactory = new LocalComponentFactoryBase() {
+      public LocalComponent getInstance() {
+        return new ClustersConsumerOutputComponent();
+      }
+    };
+    controller.addLocalComponentFactory("output-clustersConsumer", 
+      clusterConsumerOutputFactory);
+  }
+
+  /** 
+   * Adds a hardcoded clustering process to the local controller.
+   */  
+  private void addProcesses() {
+    LocalProcessBase process = new LocalProcessBase(
+        "input-localnutch",                                   // input
+        "output-clustersConsumer",                            // output
+        new String [] {"filter-lingo"},                       // filters
+        "The Lingo clustering algorithm (www.carrot2.org).",
+        "");
+
+    try {
+      controller.addProcess(PROCESS_ID, process);
+    } catch (Exception e) {
+      throw new RuntimeException("Could not assemble clustering process.", e);
+    }
+  }
+  
+  /**
+   * See {@link OnlineClusterer} for documentation.
+   */
+  public HitsCluster [] clusterHits(HitDetails [] hitDetails, String [] descriptions) {
+    Map requestParams = new HashMap();
+    requestParams.put(LocalNutchInputComponent.NUTCH_INPUT_HIT_DETAILS_ARRAY,
+      hitDetails);
+    requestParams.put(LocalNutchInputComponent.NUTCH_INPUT_SUMMARIES_ARRAY,
+      descriptions);
+
+    try {
+      // The input component takes Nutch's results so we don't need the query argument.
+      final ProcessingResult result = 
+        controller.query(PROCESS_ID, "no-query", requestParams);
+
+      final ClustersConsumerOutputComponent.Result output =
+        (ClustersConsumerOutputComponent.Result) result.getQueryResult();
+
+      final List outputClusters = output.clusters;
+      final HitsCluster [] clusters = new HitsCluster[ outputClusters.size() ];
+
+      int j = 0;
+      for (Iterator i = outputClusters.iterator(); i.hasNext(); j++) {
+        RawCluster rcluster = (RawCluster) i.next();
+        clusters[j] = new HitsClusterAdapter(rcluster, hitDetails);
+      }
+
+      // invoke Carrot2 process here.
+      return clusters;
+    } catch (MissingProcessException e) {
+      throw new RuntimeException("Missing clustering process.", e);
+    } catch (Exception e) {
+      throw new RuntimeException("Unidentified problems with the clustering.", e);
+    }
+  }
+
+  /**
+   * Implementation of {@link Configurable}
+   */
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+    
+    // Configure default language and other component settings.
+    if (conf.get(CONF_PROP_DEFAULT_LANGUAGE) != null) {
+      // Change the default language.
+      this.defaultLanguage = conf.get(CONF_PROP_DEFAULT_LANGUAGE);
+    } 
+    if (conf.getStrings(CONF_PROP_LANGUAGES) != null) {
+      this.languages = conf.getStrings(CONF_PROP_LANGUAGES);
+    }
+
+    if (logger.isInfoEnabled()) {
+      logger.info("Default language: " + defaultLanguage);
+      logger.info("Enabled languages: " + Arrays.asList(languages));
+    }
+
+    initialize();
+  }
+
+  /**
+   * Implementation of {@link Configurable}
+   */
+  public Configuration getConf() {
+    return conf;
+  }
+}
Index: src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/HitsClusterAdapter.java
===================================================================
--- src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/HitsClusterAdapter.java	(revision 449084)
+++ src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/HitsClusterAdapter.java	(working copy)
@@ -1,112 +1,112 @@
-/**
- * Copyright 2005 The Apache Software Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.clustering.carrot2;
-
-import java.util.Iterator;
-import java.util.List;
-
-import com.dawidweiss.carrot.core.local.clustering.RawCluster;
-import com.dawidweiss.carrot.core.local.clustering.RawDocument;
-
-import org.apache.nutch.clustering.HitsCluster;
-import org.apache.nutch.searcher.HitDetails;
-
-/**
- * An adapter of Carrot2's {@link RawCluster} interface to
- * {@link HitsCluster} interface. 
- *
- * @author Dawid Weiss
- * @version $Id: HitsClusterAdapter.java,v 1.1 2004/08/09 23:23:53 johnnx Exp $
- */
-public class HitsClusterAdapter implements HitsCluster {
-  private RawCluster rawCluster;
-  private HitDetails [] hits;
-
-  /**
-   * Lazily initialized subclusters array.
-   */
-  private HitsCluster [] subclusters;
-  
-  /**
-   * Lazily initialized documents array.
-   */
-  private HitDetails [] documents;
-  
-  /**
-   * Creates a new adapter.
-   */
-  public HitsClusterAdapter(RawCluster rawCluster, HitDetails [] hits) {
-    this.rawCluster = rawCluster;
-    this.hits = hits;
-  }
-
-  /*
-   * @see org.apache.nutch.clustering.HitsCluster#getSubclusters()
-   */
-  public HitsCluster[] getSubclusters() {
-    if (this.subclusters == null) {
-      List rawSubclusters = rawCluster.getSubclusters();
-      if (rawSubclusters == null || rawSubclusters.size() == 0) {
-        subclusters = null;
-      } else {
-        subclusters = new HitsCluster[rawSubclusters.size()];
-        int j = 0;
-        for (Iterator i = rawSubclusters.iterator(); i.hasNext(); j++) {
-          RawCluster c = (RawCluster) i.next();
-          subclusters[j] = new HitsClusterAdapter(c, hits);
-        }
-      }
-    }
-
-    return subclusters;
-  }
-
-  /*
-   * @see org.apache.nutch.clustering.HitsCluster#getHits()
-   */
-  public HitDetails[] getHits() {
-    if (documents == null) {
-      List rawDocuments = this.rawCluster.getDocuments();
-      documents = new HitDetails[ rawDocuments.size() ];
-      
-      int j = 0;
-      for (Iterator i = rawDocuments.iterator(); i.hasNext(); j++) {
-        RawDocument doc = (RawDocument) i.next();
-        Integer offset = (Integer) doc.getId();
-        documents[j] = this.hits[offset.intValue()];
-      }
-    }
-
-    return documents;
-  }
-
-  /*
-   * @see org.apache.nutch.clustering.HitsCluster#getDescriptionLabels()
-   */
-  public String[] getDescriptionLabels() {
-    List phrases = this.rawCluster.getClusterDescription();
-    return (String []) phrases.toArray( new String [ phrases.size() ]);
-  }
-
-  /*
-   * @see org.apache.nutch.clustering.HitsCluster#isJunkCluster()
-   */
-  public boolean isJunkCluster() {
-    return rawCluster.getProperty(RawCluster.PROPERTY_JUNK_CLUSTER) != null;
-  }
-}
-
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.clustering.carrot2;
+
+import java.util.Iterator;
+import java.util.List;
+
+import com.dawidweiss.carrot.core.local.clustering.RawCluster;
+import com.dawidweiss.carrot.core.local.clustering.RawDocument;
+
+import org.apache.nutch.clustering.HitsCluster;
+import org.apache.nutch.searcher.HitDetails;
+
+/**
+ * An adapter of Carrot2's {@link RawCluster} interface to
+ * {@link HitsCluster} interface. 
+ *
+ * @author Dawid Weiss
+ * @version $Id: HitsClusterAdapter.java,v 1.1 2004/08/09 23:23:53 johnnx Exp $
+ */
+public class HitsClusterAdapter implements HitsCluster {
+  private RawCluster rawCluster;
+  private HitDetails [] hits;
+
+  /**
+   * Lazily initialized subclusters array.
+   */
+  private HitsCluster [] subclusters;
+  
+  /**
+   * Lazily initialized documents array.
+   */
+  private HitDetails [] documents;
+  
+  /**
+   * Creates a new adapter.
+   */
+  public HitsClusterAdapter(RawCluster rawCluster, HitDetails [] hits) {
+    this.rawCluster = rawCluster;
+    this.hits = hits;
+  }
+
+  /*
+   * @see org.apache.nutch.clustering.HitsCluster#getSubclusters()
+   */
+  public HitsCluster[] getSubclusters() {
+    if (this.subclusters == null) {
+      List rawSubclusters = rawCluster.getSubclusters();
+      if (rawSubclusters == null || rawSubclusters.size() == 0) {
+        subclusters = null;
+      } else {
+        subclusters = new HitsCluster[rawSubclusters.size()];
+        int j = 0;
+        for (Iterator i = rawSubclusters.iterator(); i.hasNext(); j++) {
+          RawCluster c = (RawCluster) i.next();
+          subclusters[j] = new HitsClusterAdapter(c, hits);
+        }
+      }
+    }
+
+    return subclusters;
+  }
+
+  /*
+   * @see org.apache.nutch.clustering.HitsCluster#getHits()
+   */
+  public HitDetails[] getHits() {
+    if (documents == null) {
+      List rawDocuments = this.rawCluster.getDocuments();
+      documents = new HitDetails[ rawDocuments.size() ];
+      
+      int j = 0;
+      for (Iterator i = rawDocuments.iterator(); i.hasNext(); j++) {
+        RawDocument doc = (RawDocument) i.next();
+        Integer offset = (Integer) doc.getId();
+        documents[j] = this.hits[offset.intValue()];
+      }
+    }
+
+    return documents;
+  }
+
+  /*
+   * @see org.apache.nutch.clustering.HitsCluster#getDescriptionLabels()
+   */
+  public String[] getDescriptionLabels() {
+    List phrases = this.rawCluster.getClusterDescription();
+    return (String []) phrases.toArray( new String [ phrases.size() ]);
+  }
+
+  /*
+   * @see org.apache.nutch.clustering.HitsCluster#isJunkCluster()
+   */
+  public boolean isJunkCluster() {
+    return rawCluster.getProperty(RawCluster.PROPERTY_JUNK_CLUSTER) != null;
+  }
+}
+
Index: src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java
===================================================================
--- src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java	(revision 449084)
+++ src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java	(working copy)
@@ -24,7 +24,7 @@
 
 import org.apache.nutch.indexer.IndexingFilter;
 import org.apache.nutch.indexer.IndexingException;
-import org.apache.hadoop.io.UTF8;
+import org.apache.hadoop.io.Text;
 
 import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.crawl.Inlinks;
@@ -49,7 +49,7 @@
 
   private Configuration conf;
 
-  public Document filter(Document doc, Parse parse, UTF8 url, CrawlDatum datum, Inlinks inlinks)
+  public Document filter(Document doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
     throws IndexingException {
     
     Metadata metadata = parse.getData().getParseMeta();
Index: src/plugin/parse-rtf/src/test/org/apache/nutch/parse/rtf/TestRTFParser.java
===================================================================
--- src/plugin/parse-rtf/src/test/org/apache/nutch/parse/rtf/TestRTFParser.java	(revision 449084)
+++ src/plugin/parse-rtf/src/test/org/apache/nutch/parse/rtf/TestRTFParser.java	(working copy)
@@ -34,7 +34,7 @@
 
 // Hadoop imports
 import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.UTF8;
+import org.apache.hadoop.io.Text;
 
 
 /**
@@ -72,7 +72,7 @@
     Configuration conf = NutchConfiguration.create();
     urlString = "file:" + sampleDir + fileSeparator + rtfFile;
     protocol = new ProtocolFactory(conf).getProtocol(urlString);
-    content = protocol.getProtocolOutput(new UTF8(urlString), new CrawlDatum())
+    content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum())
                       .getContent();
     parse = new ParseUtil(conf).parseByExtensionId("parse-rtf", content);
     String text = parse.getText();
Index: src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java
===================================================================
--- src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java	(revision 449084)
+++ src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java	(working copy)
@@ -17,7 +17,7 @@
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.conf.Configured;
-import org.apache.hadoop.io.UTF8;
+import org.apache.hadoop.io.Text;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 
@@ -66,7 +66,7 @@
     doc.add(new Field(FIELD_NAME, collname, Field.Store.YES, Field.Index.TOKENIZED));
   }
 
-  public Document filter(Document doc, Parse parse, UTF8 url, CrawlDatum datum, Inlinks inlinks) throws IndexingException {
+  public Document filter(Document doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks) throws IndexingException {
     String sUrl = url.toString();
     addSubCollectionField(doc, sUrl);
     return doc;
Index: src/plugin/parse-mp3/src/test/org/apache/nutch/parse/mp3/TestMP3Parser.java
===================================================================
--- src/plugin/parse-mp3/src/test/org/apache/nutch/parse/mp3/TestMP3Parser.java	(revision 449084)
+++ src/plugin/parse-mp3/src/test/org/apache/nutch/parse/mp3/TestMP3Parser.java	(working copy)
@@ -17,7 +17,7 @@
 package org.apache.nutch.parse.mp3;
 
 import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.UTF8;
+import org.apache.hadoop.io.Text;
 import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.metadata.Metadata;
 import org.apache.nutch.parse.Parse;
@@ -68,7 +68,7 @@
     Configuration conf = NutchConfiguration.create();
     urlString = "file:" + sampleDir + fileSeparator + id3v2;
     protocol = new ProtocolFactory(conf).getProtocol(urlString);
-    content = protocol.getProtocolOutput(new UTF8(urlString), new CrawlDatum())
+    content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum())
                       .getContent();
     parse = new ParseUtil(conf).parseByExtensionId("parse-mp3", content);
     Metadata metadata = parse.getData().getParseMeta();
@@ -100,7 +100,7 @@
     Configuration conf = NutchConfiguration.create();
     urlString = "file:" + sampleDir + fileSeparator + id3v1;
     protocol = new ProtocolFactory(conf).getProtocol(urlString);
-    content = protocol.getProtocolOutput(new UTF8(urlString), new CrawlDatum())
+    content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum())
                       .getContent();
     parse = new ParseUtil(conf).parseByExtensionId("parse-mp3", content);
 
@@ -127,7 +127,7 @@
     Configuration conf = NutchConfiguration.create();
     urlString = "file:" + sampleDir + fileSeparator + none;
     protocol = new ProtocolFactory(conf).getProtocol(urlString);
-    content = protocol.getProtocolOutput(new UTF8(urlString), new CrawlDatum())
+    content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum())
                       .getContent();
     parse = new ParseUtil(conf).parseByExtensionId("parse-mp3", content);
 //    Metadata metadata = parse.getData().getParseMeta();
Index: src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
===================================================================
--- src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java	(revision 449084)
+++ src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java	(working copy)
@@ -48,7 +48,7 @@
 import org.apache.nutch.util.mime.MimeTypeException;
 
 import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.UTF8;
+import org.apache.hadoop.io.Text;
 
 import java.text.ParseException;
 import java.text.SimpleDateFormat;
@@ -81,7 +81,7 @@
   /** Get the MimeTypes resolver instance. */
   private MimeTypes MIME; 
   
-  public Document filter(Document doc, Parse parse, UTF8 url, CrawlDatum datum, Inlinks inlinks)
+  public Document filter(Document doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
     throws IndexingException {
 
     String url_s = url.toString();
Index: src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java
===================================================================
--- src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java	(revision 449084)
+++ src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java	(working copy)
@@ -27,7 +27,7 @@
 import org.apache.hadoop.conf.Configuration;
 import org.apache.nutch.util.NutchConfiguration;
 
-import org.apache.hadoop.io.UTF8;
+import org.apache.hadoop.io.Text;
 import org.apache.nutch.crawl.CrawlDatum;
 
 import junit.framework.TestCase;
@@ -82,7 +82,7 @@
 
     // get nutch content
     Protocol protocol = new ProtocolFactory(NutchConfiguration.create()).getProtocol(urlString);
-    content = protocol.getProtocolOutput(new UTF8(urlString), new CrawlDatum()).getContent();
+    content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
     protocol = null;
   }
 
Index: src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/TestMSPowerPointParser.java
===================================================================
--- src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/TestMSPowerPointParser.java	(revision 449084)
+++ src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/TestMSPowerPointParser.java	(working copy)
@@ -37,7 +37,7 @@
 import org.apache.nutch.protocol.ProtocolFactory;
 import org.apache.nutch.util.NutchConfiguration;
 
-import org.apache.hadoop.io.UTF8;
+import org.apache.hadoop.io.Text;
 import org.apache.nutch.crawl.CrawlDatum;
 
 /**
@@ -107,7 +107,7 @@
 
     System.out.println("Testing file: " + this.urlString + "...");
     this.protocol =new ProtocolFactory(NutchConfiguration.create()).getProtocol(this.urlString);
-    this.content = this.protocol.getProtocolOutput(new UTF8(this.urlString), new CrawlDatum()).getContent();
+    this.content = this.protocol.getProtocolOutput(new Text(this.urlString), new CrawlDatum()).getContent();
   }
 
   /**
