Index: conf/common-terms.utf8
===================================================================
--- conf/common-terms.utf8	(revision 959954)
+++ conf/common-terms.utf8	(working copy)
@@ -1,28 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements.  See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License.  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# Common terms and phrases which will be indexed in n-grams
-# in order to optimize search.
-content:a
-content:and
-content:for
-content:in
-content:of
-content:the
-content:to
-url:com
-url:http
-url:http-www
-url:www
Index: conf/log4j.properties
===================================================================
--- conf/log4j.properties	(revision 959954)
+++ conf/log4j.properties	(working copy)
@@ -22,9 +22,6 @@
 log4j.logger.org.apache.nutch.crawl.CrawlDb=INFO,cmdstdout
 log4j.logger.org.apache.nutch.crawl.LinkDb=INFO,cmdstdout
 log4j.logger.org.apache.nutch.crawl.LinkDbMerger=INFO,cmdstdout
-log4j.logger.org.apache.nutch.indexer.Indexer=INFO,cmdstdout
-log4j.logger.org.apache.nutch.indexer.DeleteDuplicates=INFO,cmdstdout
-log4j.logger.org.apache.nutch.indexer.IndexMerger=INFO,cmdstdout
 
 log4j.logger.org.apache.nutch=INFO
 log4j.logger.org.apache.hadoop=WARN
Index: conf/nutch-default.xml
===================================================================
--- conf/nutch-default.xml	(revision 959954)
+++ conf/nutch-default.xml	(working copy)
@@ -769,129 +769,6 @@
   </description>
 </property>
 
-
-<!-- analysis properties -->
-
-<property>
-  <name>analysis.common.terms.file</name>
-  <value>common-terms.utf8</value>
-  <description>The name of a file containing a list of common terms
-  that should be indexed in n-grams.</description>
-</property>
-
-<!-- searcher properties -->
-
-<property>
-  <name>searcher.dir</name>
-  <value>crawl</value>
-  <description>
-  Path to root of crawl.  This directory is searched (in
-  order) for either the file search-servers.txt, containing a list of
-  distributed search servers, or the directory "index" containing
-  merged indexes, or the directory "segments" containing segment
-  indexes.
-  </description>
-</property>
-
-<property>
-  <name>searcher.filter.cache.size</name>
-  <value>16</value>
-  <description>
-  Maximum number of filters to cache.  Filters can accelerate certain
-  field-based queries, like language, document format, etc.  Each
-  filter requires one bit of RAM per page.  So, with a 10 million page
-  index, a cache size of 16 consumes two bytes per page, or 20MB.
-  </description>
-</property>
-
-<property>
-  <name>searcher.filter.cache.threshold</name>
-  <value>0.05</value>
-  <description>
-  Filters are cached when their term is matched by more than this
-  fraction of pages.  For example, with a threshold of 0.05, and 10
-  million pages, the term must match more than 1/20, or 50,000 pages.
-  So, if out of 10 million pages, 50% of pages are in English, and 2%
-  are in Finnish, then, with a threshold of 0.05, searches for
-  "lang:en" will use a cached filter, while searches for "lang:fi"
-  will score all 20,000 finnish documents.
-  </description>
-</property>
-
-<property>
-  <name>searcher.hostgrouping.rawhits.factor</name>
-  <value>2.0</value>
-  <description>
-  A factor that is used to determine the number of raw hits
-  initially fetched, before host grouping is done.
-  </description>
-</property>
-
-<property>
-  <name>searcher.summary.context</name>
-  <value>5</value>
-  <description>
-  The number of context terms to display preceding and following
-  matching terms in a hit summary.
-  </description>
-</property>
-
-<property>
-  <name>searcher.summary.length</name>
-  <value>20</value>
-  <description>
-  The total number of terms to display in a hit summary.
-  </description>
-</property>
-
-<property>
-  <name>searcher.max.hits</name>
-  <value>-1</value>
-  <description>If positive, search stops after this many hits are
-  found.  Setting this to small, positive values (e.g., 1000) can make
-  searches much faster.  With a sorted index, the quality of the hits
-  suffers little.</description>
-</property>
-
-<property>
-  <name>searcher.max.time.tick_count</name>
-  <value>-1</value>
-  <description>If positive value is defined here, limit search time for
-  every request to this number of elapsed ticks (see the tick_length
-  property below). The total maximum time for any search request will be
-  then limited to tick_count * tick_length milliseconds. When search time
-  is exceeded, partial results will be returned, and the total number of
-  hits will be estimated.
-  </description>
-</property>
-
-<property>
-  <name>searcher.max.time.tick_length</name>
-  <value>200</value>
-  <description>The number of milliseconds between ticks. Larger values
-  reduce the timer granularity (precision). Smaller values bring more
-  overhead.
-  </description>
-</property>
-
-<property>
-  <name>searcher.num.handlers</name>
-  <value>10</value>
-  <description>The number of handlers for the distributed search server.
-  </description>
-</property>
-
-<property>
-  <name>searcher.max.hits.per.page</name>
-  <value>1000</value>
-  <description> The maximum number of hits to show per page. -1 if
-    unlimited. If the number of hits requested by the user (via
-    hitsPerPage parameter in the query string) is more than the value
-    specified in this property, then this value is assumed as the number
-    of hits per page.
-  </description>
-</property>
-
 <!-- URL normalizer properties -->
 
 <property>
@@ -956,7 +833,7 @@
 
 <property>
   <name>plugin.includes</name>
-  <value>protocol-http|urlfilter-regex|parse-tika|index-(basic|anchor)|query-(basic|site|url)|response-(json|xml)|summary-basic|scoring-opic|urlnormalizer-(pass|regex|basic)</value>
+  <value>protocol-http|urlfilter-regex|parse-tika|index-(basic|anchor)|scoring-opic|urlnormalizer-(pass|regex|basic)</value>
   <description>Regular expression naming plugin directory names to
   include.  Any plugin not matching this expression is excluded.
   In any case you need at least include the nutch-extensionpoints plugin. By
@@ -1115,137 +992,6 @@
   </description>
 </property>
 
-<!-- clustering extension properties -->
-
-<property>
-  <name>extension.clustering.hits-to-cluster</name>
-  <value>100</value>
-  <description>Number of snippets retrieved for the clustering extension
-  if clustering extension is available and user requested results
-  to be clustered.</description>
-</property>
-
-<property>
-  <name>extension.clustering.extension-name</name>
-  <value></value>
-  <description>Use the specified online clustering extension. If empty,
-  the first available extension will be used. The "name" here refers to an 'id'
-  attribute of the 'implementation' element in the plugin descriptor XML
-  file.</description>
-</property>
-
-<!-- ontology extension properties -->
-
-<property>
-  <name>extension.ontology.extension-name</name>
-  <value></value>
-  <description>Use the specified online ontology extension. If empty,
-  the first available extension will be used. The "name" here refers to an 'id'
-  attribute of the 'implementation' element in the plugin descriptor XML
-  file.</description>
-</property>
-
-<property>
-  <name>extension.ontology.urls</name>
-  <value>
-  </value>
-  <description>Urls of owl files, separated by spaces, such as
-  http://www.example.com/ontology/time.owl
-  http://www.example.com/ontology/space.owl
-  http://www.example.com/ontology/wine.owl
-  Or
-  file:/ontology/time.owl
-  file:/ontology/space.owl
-  file:/ontology/wine.owl
-  You have to make sure each url is valid.
-  By default, there is no owl file, so query refinement based on ontology
-  is silently ignored.
-  </description>
-</property>
-
-<!-- query-basic plugin properties -->
-
-<property>
-  <name>query.url.boost</name>
-  <value>4.0</value>
-  <description> Used as a boost for url field in Lucene query.
-  </description>
-</property>
-
-<property>
-  <name>query.anchor.boost</name>
-  <value>2.0</value>
-  <description> Used as a boost for anchor field in Lucene query.
-  </description>
-</property>
-
-<property>
-  <name>query.title.boost</name>
-  <value>1.5</value>
-  <description> Used as a boost for title field in Lucene query.
-  </description>
-</property>
-
-<property>
-  <name>query.host.boost</name>
-  <value>2.0</value>
-  <description> Used as a boost for host field in Lucene query.
-  </description>
-</property>
-
-<property>
-  <name>query.phrase.boost</name>
-  <value>1.0</value>
-  <description> Used as a boost for phrase in Lucene query.
-  Multiplied by boost for field phrase is matched in.
-  </description>
-</property>
-
-<!--
-<property>
-  <name>query.basic.description.boost</name>
-  <value>1.0</value>
-  <description> Declares a custom field and its boost to be added to the default fields of the Lucene query.
-  </description>
-</property>
--->
-
-<!-- creative-commons plugin properties -->
-
-<property>
-  <name>query.cc.boost</name>
-  <value>0.0</value>
-  <description> Used as a boost for cc field in Lucene query.
-  </description>
-</property>
-
-<!-- query-more plugin properties -->
-
-<property>
-  <name>query.type.boost</name>
-  <value>0.0</value>
-  <description> Used as a boost for type field in Lucene query.
-  </description>
-</property>
-
-<!-- query-site plugin properties -->
-
-<property>
-  <name>query.site.boost</name>
-  <value>0.0</value>
-  <description> Used as a boost for site field in Lucene query.
-  </description>
-</property>
-
-<!-- microformats-reltag plugin properties -->
-
-<property>
-  <name>query.tag.boost</name>
-  <value>1.0</value>
-  <description> Used as a boost for tag field in Lucene query.
-  </description>
-</property>
-
 <!-- language-identifier plugin properties -->
 
 <property>
@@ -1280,13 +1026,6 @@
   </description>
 </property>
 
-<property>
-  <name>query.lang.boost</name>
-  <value>0.0</value>
-  <description> Used as a boost for lang field in Lucene query.
-  </description>
-</property>
-
 <!-- Temporary Hadoop 0.17.x workaround. -->
 
 <property>
@@ -1300,65 +1039,6 @@
   </description>
 </property>
 
-<!-- response writer properties -->
-
-<property>
-  <name>search.response.default.type</name>
-  <value>xml</value>
-  <description>
-  The default response type returned if none is specified.
-  </description>
-</property>
-
-<property>
-  <name>search.response.default.lang</name>
-  <value>en</value>
-  <description>
-  The default response language if none is specified.
-  </description>
-</property>
-
-<property>
-  <name>search.response.default.numrows</name>
-  <value>10</value>
-  <description>
-  The default number of rows to return if none is specified.
-  </description>
-</property>
-
-<property>
-  <name>search.response.default.dedupfield</name>
-  <value>site</value>
-  <description>
-  The default dedup field if none is specified.
-  </description>
-</property>
-
-<property>
-  <name>search.response.default.numdupes</name>
-  <value>1</value>
-  <description>
-  The default number of duplicates returned if none is specified.
-  </description>
-</property>
-
-<property>
-  <name>searcher.response.maxage</name>
-  <value>86400</value>
-  <description>
-  The maxage of a response in seconds. Used in caching headers.
-  </description>
-</property>
-
-<property>
-  <name>searcher.response.prettyprint</name>
-  <value>true</value>
-  <description>
-  Should the response output be pretty printed.  Setting to true enables better
-  debugging, false removes unneeded spaces and gives better throughput.
-  </description>
-</property>
-
 <!-- solr index properties -->
 <property>
   <name>solrindex.mapping.file</name>
Index: conf/custom-fields.xml
===================================================================
--- conf/custom-fields.xml	(revision 959954)
+++ conf/custom-fields.xml	(working copy)
@@ -1,10 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!DOCTYPE properties SYSTEM "http://java.sun.com/dtd/properties.dtd">
-<properties>
-  <!--<entry key="field.name">lang</entry>
-  <entry key="field.indexed">yes</entry>
-  <entry key="field.stored">yes</entry>
-  <entry key="field.tokenized">no</entry>
-  <entry key="field.boost">1.0</entry>
-  <entry key="field.multi">false</entry>-->
-</properties>
\ No newline at end of file
Index: src/test/org/apache/nutch/clustering/TestOnlineClustererFactory.java
===================================================================
--- src/test/org/apache/nutch/clustering/TestOnlineClustererFactory.java	(revision 959954)
+++ src/test/org/apache/nutch/clustering/TestOnlineClustererFactory.java	(working copy)
@@ -1,50 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.clustering;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.plugin.PluginRuntimeException;
-import org.apache.nutch.util.NutchConfiguration;
-
-import junit.framework.TestCase;
-
-public class TestOnlineClustererFactory extends TestCase {
-
-  private Configuration conf;
-  
-  protected void setUp() throws Exception {
-      conf = NutchConfiguration.create();
-      conf.set("plugin.includes", ".*");
-  }
-  
-  public void testFacotyr(){
-    OnlineClustererFactory factory = new OnlineClustererFactory(conf);
-    
-    try{
-      OnlineClusterer clusterer1=factory.getOnlineClusterer();
-      OnlineClusterer clusterer2=factory.getOnlineClusterer();
-      assertNotNull(clusterer1);
-      assertNotNull(clusterer2);
-      
-      //Current implementation creates new object instance in every call
-      //TODO: check if this is required  
-      assertNotSame(clusterer1, clusterer2);
-    } catch (PluginRuntimeException pre) {
-      fail("Should not throw Exception:" + pre);
-    }
-  }
-}
Index: src/test/org/apache/nutch/analysis/TestQueryParser.java
===================================================================
--- src/test/org/apache/nutch/analysis/TestQueryParser.java	(revision 959954)
+++ src/test/org/apache/nutch/analysis/TestQueryParser.java	(working copy)
@@ -1,95 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.analysis;
-
-import org.apache.nutch.searcher.Query;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.util.NutchConfiguration;
-
-import junit.framework.TestCase;
-
-/**
- * JUnit tests for query parser
- *  
- */
-public class TestQueryParser extends TestCase {
-
-  private static Configuration conf = NutchConfiguration.create();
-  public void assertQueryEquals(String query, String result) throws Exception {
-    try {
-      Query q = NutchAnalysis.parseQuery(query, conf);
-      String s = q.toString();
-      if (!s.equals(result)) {
-        fail("Query /" + query + "/ yielded /" + s + "/, expecting /" + result
-            + "/");
-      }
-    } catch (Exception e) {
-      throw new Exception("error: While parsing query:" + query, e);
-    }
-  }
-
-  /**
-   * Test query parser
-   * 
-   * @throws Exception
-   */
-  public void testParseQuery() throws Exception {
-    //simple tests
-    assertQueryEquals("x", "x");
-    assertQueryEquals("X", "x");
-    assertQueryEquals("+x", "x");
-    assertQueryEquals("-x", "-x");
-    assertQueryEquals("x y", "x y");
-    assertQueryEquals(" x  y ", "x y");
-    assertQueryEquals("test +", "test");
-
-    // missing fourth double quote
-    assertQueryEquals("\" abc def \" \" def ghi ", "\"abc def\" \"def ghi\"");
-
-    //empty query
-    assertQueryEquals("\"", "");
-
-    //fields
-    assertQueryEquals("field:x -another:y", "field:x -another:y");
-    assertQueryEquals("the:x", "the:x");
-
-    //ACRONYM
-    assertQueryEquals("w.s.o.p.", "wsop");
-
-    //STOPWORD
-    assertQueryEquals("the", "");
-    assertQueryEquals("field:the -y", "field:the -y");
-    assertQueryEquals("\"the y\"", "\"the y\"");
-    assertQueryEquals("+the -y", "the -y");
-
-    //PHRASE
-    assertQueryEquals("\"hello world\"", "\"hello world\"");
-    assertQueryEquals("\"phrase a.b.c. phrase\"", "\"phrase abc phrase\"");
-    assertQueryEquals("\"the end\"", "\"the end\"");
-    assertQueryEquals("term\"the end\"", "term \"the end\"");
-    //unbalanced
-    assertQueryEquals("term\"the end", "term \"the end\"");
-
-    //SIGRAM
-    assertQueryEquals("\u3040\u3041\u3042", "\u3040 \u3041 \u3042");
-
-    //COMPOUND
-    assertQueryEquals("term some.email@adress.here",
-        "term \"some email adress here\"");
-  }
-}
Index: src/test/org/apache/nutch/analysis/TestAnalyzerFactory.java
===================================================================
--- src/test/org/apache/nutch/analysis/TestAnalyzerFactory.java	(revision 959954)
+++ src/test/org/apache/nutch/analysis/TestAnalyzerFactory.java	(working copy)
@@ -1,59 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.analysis;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.util.NutchConfiguration;
-
-import junit.framework.TestCase;
-
-/**
- * Simple test case to verify AnalyzerFactory functionality
- */
-public class TestAnalyzerFactory extends TestCase {
-
-  private Configuration conf;
-  private AnalyzerFactory factory;
-  
-  protected void setUp() throws Exception {
-      conf = NutchConfiguration.create();
-      conf.set("plugin.includes", ".*");
-      factory=AnalyzerFactory.get(conf);
-  }
-  
-  public void testGetNull() {
-    NutchAnalyzer analyzer=factory.get((String)null);
-    assertSame(analyzer, factory.getDefault());
-  }
-
-  public void testGetExisting() {
-    NutchAnalyzer analyzer=factory.get("en");
-    assertNotNull(analyzer);
-  }
-
-  public void testGetNonExisting() {
-    NutchAnalyzer analyzer=factory.get("imaginary-non-existing-language");
-    assertSame(analyzer, factory.getDefault());
-  }
-
-  public void testCaching() {
-    NutchAnalyzer analyzer1=factory.get("en");
-    NutchAnalyzer analyzer2=factory.get("en");
-    assertEquals(analyzer1, analyzer2);
-  }
-
-}
Index: src/test/org/apache/nutch/searcher/TestQuery.java
===================================================================
--- src/test/org/apache/nutch/searcher/TestQuery.java	(revision 959954)
+++ src/test/org/apache/nutch/searcher/TestQuery.java	(working copy)
@@ -1,113 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.searcher;
-
-import java.io.*;
-import junit.framework.TestCase;
-import java.util.Arrays;
-import org.apache.nutch.analysis.NutchAnalysis;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.util.NutchConfiguration;
-
-public class TestQuery extends TestCase {
-    
-  private static Configuration conf = NutchConfiguration.create();
-    
-  public TestQuery(String name) { super(name); }
-
-  public void testRequiredTerm() throws Exception {
-    Query query = new Query(conf);
-    query.addRequiredTerm("bobo");
-    testQuery(query, "bobo");
-  }
-
-  public void testProhibitedTerm() throws Exception {
-    Query query = new Query(conf);
-    query.addProhibitedTerm("bobo");
-    testQuery(query, "-bobo");
-  }
-
-  public void testRequiredPhrase() throws Exception {
-    Query query = new Query(conf);
-    query.addRequiredPhrase(new String[] {"bobo", "bogo"});
-    testQuery(query, "\"bobo bogo\"");
-  }
-
-  public void testProhibitedPhrase() throws Exception {
-    Query query = new Query(conf);
-    query.addProhibitedPhrase(new String[] {"bobo", "bogo"});
-    testQuery(query, "-\"bobo bogo\"");
-  }
-
-  public void testComplex() throws Exception {
-    Query query = new Query(conf);
-    query.addRequiredTerm("bobo");
-    query.addProhibitedTerm("bono");
-    query.addRequiredPhrase(new String[] {"bobo", "bogo"});
-    query.addProhibitedPhrase(new String[] {"bogo", "bobo"});
-    testQuery(query, "bobo -bono \"bobo bogo\" -\"bogo bobo\"");
-  }
-
-  public static void testQuery(Query query, String string) throws Exception {
-    testQueryToString(query, string);
-    testQueryParser(query, string);
-    testQueryIO(query, string);
-  }
-
-  public static void testQueryToString(Query query, String string) {
-    assertEquals(query.toString(), string);
-  }
-
-  public static void testQueryParser(Query query, String string)
-    throws Exception {
-    Query after = NutchAnalysis.parseQuery(string, conf);
-    assertEquals(after, query);
-    assertEquals(after.toString(), string);
-  }
-
-  public static void testQueryIO(Query query, String string) throws Exception {
-    ByteArrayOutputStream oBuf = new ByteArrayOutputStream();
-    DataOutputStream out = new DataOutputStream(oBuf);
-    query.write(out);
-
-    ByteArrayInputStream iBuf = new ByteArrayInputStream(oBuf.toByteArray());
-    DataInputStream in = new DataInputStream(iBuf);
-
-    Query after = Query.read(in, conf);
-
-    assertEquals(after, query);
-  }
-
-  public void testQueryTerms() throws Exception {
-    testQueryTerms("foo bar", new String[] {"foo", "bar"});
-    testQueryTerms("\"foo bar\"", new String[] {"foo", "bar"});
-    testQueryTerms("\"foo bar\" baz", new String[] {"foo", "bar", "baz"});
-  }
-
-  public static void testQueryTerms(String query, String[] terms)
-    throws Exception {
-    assertTrue(Arrays.equals(NutchAnalysis.parseQuery(query, conf).getTerms(),
-                             terms));
-  }
-
-  public static void main(String[] args) throws Exception {
-    TestQuery test = new TestQuery("test");
-    test.testComplex();
-  }
-
-}
Index: src/test/org/apache/nutch/searcher/TestSummarizerFactory.java
===================================================================
--- src/test/org/apache/nutch/searcher/TestSummarizerFactory.java	(revision 959954)
+++ src/test/org/apache/nutch/searcher/TestSummarizerFactory.java	(working copy)
@@ -1,41 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.searcher;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.util.NutchConfiguration;
-
-import junit.framework.TestCase;
-
-public class TestSummarizerFactory extends TestCase {
-  
-  private Configuration conf;
-  
-  protected void setUp() throws Exception {
-      conf = NutchConfiguration.create();
-      conf.set("plugin.includes", ".*");
-  }
-
-  public void testGetSummarizer(){
-    SummarizerFactory factory=new SummarizerFactory(conf);
-    Summarizer summarizer1=factory.getSummarizer();
-    Summarizer summarizer2=factory.getSummarizer();
-    
-    assertNotNull(summarizer1);
-    assertEquals(summarizer1, summarizer2);
-  }
-}
Index: src/test/org/apache/nutch/searcher/QueryParamsTest.java
===================================================================
--- src/test/org/apache/nutch/searcher/QueryParamsTest.java	(revision 959954)
+++ src/test/org/apache/nutch/searcher/QueryParamsTest.java	(working copy)
@@ -1,31 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.searcher;
-
-import junit.framework.TestCase;
-
-import org.apache.nutch.util.WritableTestUtils;
-
-public class QueryParamsTest extends TestCase {
-
-  public void testWritable() throws Exception {
-    QueryParams context = new QueryParams(10, 2, "site", null, false);
-    context.put("cat", "dog");
-    WritableTestUtils.testWritable(context);
-  }
-
-}
Index: src/test/org/apache/nutch/searcher/TestHitDetails.java
===================================================================
--- src/test/org/apache/nutch/searcher/TestHitDetails.java	(revision 959954)
+++ src/test/org/apache/nutch/searcher/TestHitDetails.java	(working copy)
@@ -1,58 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.searcher;
-
-import org.apache.hadoop.io.*;
-import junit.framework.TestCase;
-
-public class TestHitDetails extends TestCase {
-  public TestHitDetails(String name) { super(name); }
-
-  public void testHitDetails() throws Exception {
-    final int length = 4;
-    final String[] fields = new String[] {"a", "b", "c", "a" };
-    final String[] values = new String[] { "foo1", "bar", "baz", "foo2" };
-
-    HitDetails before = new HitDetails(fields, values);
-
-    DataOutputBuffer dob = new DataOutputBuffer();
-    before.write(dob);
-
-    DataInputBuffer dib = new DataInputBuffer();
-    dib.reset(dob.getData(), dob.getLength());
-
-    HitDetails after = HitDetails.read(dib);
-
-    assertEquals(length, after.getLength());
-    for (int i = 0; i < 3; i++) {
-      assertEquals(fields[i], after.getField(i));
-      assertEquals(values[i], after.getValue(i));
-      assertEquals(values[i], after.getValue(fields[i]));
-    }
-    String[] vals = after.getValues("a");
-    assertEquals(2, vals.length);
-    assertEquals("foo1", vals[0]);
-    assertEquals("foo2", vals[1]);
-    vals = after.getValues("b");
-    assertEquals(1, vals.length);
-    assertEquals("bar", vals[0]);
-    vals = after.getValues("c");
-    assertEquals(1, vals.length);
-    assertEquals("baz", vals[0]);
-  }
-}
Index: src/test/org/apache/nutch/searcher/TestOpenSearchServlet.java
===================================================================
--- src/test/org/apache/nutch/searcher/TestOpenSearchServlet.java	(revision 959954)
+++ src/test/org/apache/nutch/searcher/TestOpenSearchServlet.java	(working copy)
@@ -1,33 +0,0 @@
-/*
-* Licensed to the Apache Software Foundation (ASF) under one or more
-* contributor license agreements.  See the NOTICE file distributed with
-* this work for additional information regarding copyright ownership.
-* The ASF licenses this file to You under the Apache License, Version 2.0
-* (the "License"); you may not use this file except in compliance with
-* the License.  You may obtain a copy of the License at
-*
-*     http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-package org.apache.nutch.searcher;
-
-import junit.framework.TestCase;
-
-public class TestOpenSearchServlet extends TestCase {
-
-  /**
-   * Test removing of illegal xml chars from string
-   */
-  public void testGetLegalXml(){
-    assertEquals("hello",OpenSearchServlet.getLegalXml("hello"));
-    assertEquals("hello",OpenSearchServlet.getLegalXml("he\u0000llo"));
-    assertEquals("hello",OpenSearchServlet.getLegalXml("\u0000he\u0000llo"));
-    assertEquals("hello",OpenSearchServlet.getLegalXml("\u0000he\u0000llo\u0000"));
-  }
-  
-}
Index: src/test/org/apache/nutch/searcher/TestSummary.java
===================================================================
--- src/test/org/apache/nutch/searcher/TestSummary.java	(revision 959954)
+++ src/test/org/apache/nutch/searcher/TestSummary.java	(working copy)
@@ -1,171 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.searcher;
-
-// JUnit imports
-import junit.framework.Test;
-import junit.framework.TestCase;
-import junit.framework.TestSuite;
-
-// Nutch imports
-import org.apache.nutch.searcher.Summary.Ellipsis;
-import org.apache.nutch.searcher.Summary.Fragment;
-import org.apache.nutch.searcher.Summary.Highlight;
-import org.apache.nutch.util.WritableTestUtils;
-
-
-/**
- * JUnit based test of class {@link Summary}.
- *
- * @author J&eacute;r&ocirc;me Charron
- */
-public class TestSummary extends TestCase {
-  
-  public TestSummary(String testName) {
-    super(testName);
-  }
-
-  public static Test suite() {
-    return new TestSuite(TestSummary.class);
-  }
-  
-
-  /** Test of <code>Fragment</code> inner class */
-  public void testFragment() {
-    Fragment fragment = new Fragment("fragment text");
-    assertEquals("fragment text", fragment.getText());
-    assertEquals("fragment text", fragment.toString());
-    assertFalse(fragment.isEllipsis());
-    assertFalse(fragment.isHighlight());
-    assertTrue(fragment.equals(new Fragment("fragment text")));
-    assertFalse(fragment.equals(new Fragment("some text")));
-    assertFalse(fragment.equals(new Ellipsis()));
-    assertFalse(fragment.equals(new Highlight("fragment text")));
-  }
-
-  /** Test of <code>Ellipsis</code> inner class */
-  public void testEllipsis() {
-    Fragment fragment = new Ellipsis();
-    assertEquals(" ... ", fragment.getText());
-    assertEquals(" ... ", fragment.toString());
-    assertTrue(fragment.isEllipsis());
-    assertFalse(fragment.isHighlight());
-    assertFalse(fragment.equals(new Fragment("fragment text")));
-    assertTrue(fragment.equals(new Ellipsis()));
-    assertFalse(fragment.equals(new Highlight("fragment text")));
-  }
-
-  /** Test of <code>Highlight</code> inner class */
-  public void testHighlight() {
-    Fragment fragment = new Highlight("highlight text");
-    assertEquals("highlight text", fragment.getText());
-    assertEquals("highlight text", fragment.toString());
-    assertFalse(fragment.isEllipsis());
-    assertTrue(fragment.isHighlight());
-    assertFalse(fragment.equals(new Fragment("fragment text")));
-    assertFalse(fragment.equals(new Ellipsis()));
-    assertFalse(fragment.equals(new Highlight("fragment text")));
-    assertTrue(fragment.equals(new Highlight("highlight text")));
-  }
-
-  /** Test of <code>add</code> / <code>get</code> methods */
-  public void testAdd() {
-    Fragment[] fragments = null;
-    Summary summary = new Summary();
-    summary.add(new Fragment("fragment1"));
-    fragments = summary.getFragments();
-    assertEquals(1, fragments.length);
-    assertEquals("fragment1", fragments[0].toString());
-    summary.add(new Fragment("fragment2"));
-    fragments = summary.getFragments();
-    assertEquals(2, fragments.length);
-    assertEquals("fragment1", fragments[0].toString());
-    assertEquals("fragment2", fragments[1].toString());
-    summary.add(new Fragment("fragment3"));
-    fragments = summary.getFragments();
-    assertEquals(3, fragments.length);
-    assertEquals("fragment1", fragments[0].toString());
-    assertEquals("fragment2", fragments[1].toString());
-    assertEquals("fragment3", fragments[2].toString());
-  }
-
-  /** Test of <code>toString</code> method. */
-  public void testToString() {
-    Summary summary = new Summary();
-    assertEquals("", summary.toString());
-    summary.add(new Fragment("fragment1"));
-    assertEquals("fragment1", summary.toString());
-    summary.add(new Ellipsis());
-    assertEquals("fragment1 ... ", summary.toString());
-    summary.add(new Highlight("highlight"));
-    assertEquals("fragment1 ... highlight", summary.toString());
-    summary.add(new Fragment("fragment2"));
-    assertEquals("fragment1 ... highlightfragment2", summary.toString());    
-  }
-
-  /** Test of <code>toStrings</code>. */
-  public void testToStrings() {
-    Summary[] summaries = { new Summary(), new Summary() };
-    summaries[0].add(new Fragment("fragment1.1"));
-    summaries[0].add(new Ellipsis());
-    summaries[0].add(new Highlight("highlight1"));
-    summaries[0].add(new Fragment("fragment1.2"));
-    summaries[1].add(new Fragment("fragment2.1"));
-    summaries[1].add(new Ellipsis());
-    summaries[1].add(new Highlight("highlight2"));
-    summaries[1].add(new Fragment("fragment2.2"));
-    String[] strings = Summary.toStrings(summaries);
-    assertEquals(2, strings.length);
-    assertEquals("fragment1.1 ... highlight1fragment1.2", strings[0]);
-    assertEquals("fragment2.1 ... highlight2fragment2.2", strings[1]);
-  }
-
-  /** Test of <code>equals</code> method. */
-  public void testEquals() {
-    Summary summary1 = new Summary();
-    Summary summary2 = new Summary();
-    assertFalse(summary1.equals(null));
-    assertFalse(summary1.equals(""));
-    assertTrue(summary1.equals(summary2));
-    summary1.add(new Fragment("text fragment"));
-    assertFalse(summary1.equals(summary2));
-    summary2.add(new Fragment("text fragment"));
-    assertTrue(summary1.equals(summary2));
-    summary1.add(new Ellipsis());
-    assertFalse(summary1.equals(summary2));
-    summary2.add(new Ellipsis());
-    assertTrue(summary1.equals(summary2));
-    summary1.add(new Highlight("highlight"));
-    assertFalse(summary1.equals(summary2));
-    summary2.add(new Highlight("highlight"));
-    assertTrue(summary1.equals(summary2));
-    summary1.add(new Fragment("text fragment"));
-    summary2.add(new Fragment("fragment text"));
-    assertFalse(summary1.equals(summary2));
-  }
-  
-  /** Test of <code>writable</code> implementation. */
-  public void testWritable() throws Exception {
-    Summary summary = new Summary();
-    summary.add(new Fragment("fragment1.1"));
-    summary.add(new Ellipsis());
-    summary.add(new Highlight("highlight1"));
-    summary.add(new Fragment("fragment1.2"));
-    WritableTestUtils.testWritable(summary);
-  }
-  
-}
Index: src/test/org/apache/nutch/searcher/response/TestRequestUtils.java
===================================================================
--- src/test/org/apache/nutch/searcher/response/TestRequestUtils.java	(revision 959954)
+++ src/test/org/apache/nutch/searcher/response/TestRequestUtils.java	(working copy)
@@ -1,140 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.searcher.response;
-
-import java.lang.reflect.InvocationHandler;
-import java.lang.reflect.Method;
-import java.lang.reflect.Proxy;
-import java.util.HashMap;
-import java.util.Map;
-
-import javax.servlet.http.HttpServletRequest;
-
-import junit.framework.TestCase;
-
-public class TestRequestUtils extends TestCase {
-
-  public TestRequestUtils(String name) {
-    super(name);
-  }
-
-  /**
-   * Test getBooleanParameter() - no default
-   */
-  public void testGetBooleanParameterNoDefault() {
-    String param = "foo";
-    Map parameters = new HashMap();
-    HttpServletRequest request = createMockHttpServletRequest(parameters);
-
-    assertFalse("No param", RequestUtils.getBooleanParameter(request, param));
-
-    parameters.put(param, "0");
-    assertFalse("Foo=0",    RequestUtils.getBooleanParameter(request, param));
-
-    parameters.put(param, "no");
-    assertFalse("Foo=no",  RequestUtils.getBooleanParameter(request, param));
-
-    parameters.put(param, "false");
-    assertFalse("Foo=false", RequestUtils.getBooleanParameter(request, param));
-
-    parameters.put(param, "abcdef");
-    assertFalse("Foo=abcdef", RequestUtils.getBooleanParameter(request, param));
-
-    parameters.put(param, "1");
-    assertTrue("Foo=1",    RequestUtils.getBooleanParameter(request, param));
-
-    parameters.put(param, "yes");
-    assertTrue("Foo=yes",  RequestUtils.getBooleanParameter(request, param));
-
-    parameters.put(param, "YES");
-    assertTrue("Foo=YES",  RequestUtils.getBooleanParameter(request, param));
-
-    parameters.put(param, "true");
-    assertTrue("Foo=true", RequestUtils.getBooleanParameter(request, param));
-
-    parameters.put(param, "TRUE");
-    assertTrue("Foo=TRUE", RequestUtils.getBooleanParameter(request, param));
-  }
-
-  /**
-   * Test getBooleanParameter() - with default
-   */
-  public void testGetBooleanParameterWithoDefault() {
-    String param = "foo";
-    Map parameters = new HashMap();
-    HttpServletRequest request = createMockHttpServletRequest(parameters);
-
-    assertTrue("No param - def true", RequestUtils.getBooleanParameter(request, param, true));
-    assertFalse("No param - def false", RequestUtils.getBooleanParameter(request, param, false));
-
-    parameters.put(param, "0");
-    assertFalse("Foo=0",    RequestUtils.getBooleanParameter(request, param, true));
-
-    parameters.put(param, "no");
-    assertFalse("Foo=no",  RequestUtils.getBooleanParameter(request, param, true));
-
-    parameters.put(param, "false");
-    assertFalse("Foo=false", RequestUtils.getBooleanParameter(request, param, true));
-
-    parameters.put(param, "abcdef");
-    assertFalse("Foo=abcdef", RequestUtils.getBooleanParameter(request, param, true));
-
-    parameters.put(param, "1");
-    assertTrue("Foo=1",    RequestUtils.getBooleanParameter(request, param, false));
-
-    parameters.put(param, "yes");
-    assertTrue("Foo=yes",  RequestUtils.getBooleanParameter(request, param, false));
-
-    parameters.put(param, "YES");
-    assertTrue("Foo=YES",  RequestUtils.getBooleanParameter(request, param, false));
-
-    parameters.put(param, "true");
-    assertTrue("Foo=true", RequestUtils.getBooleanParameter(request, param, false));
-
-    parameters.put(param, "TRUE");
-    assertTrue("Foo=TRUE", RequestUtils.getBooleanParameter(request, param, false));
-  }
-
-  /**
-   * Create a mock HttpServletRequest.
-   */
-  private HttpServletRequest createMockHttpServletRequest(Map parameters) {
-    MockHttpServletRequestInvocationHandler handler = new MockHttpServletRequestInvocationHandler();
-    handler.setParameterMap(parameters);
-    ClassLoader cl = getClass().getClassLoader();
-    Class[] interfaces = new Class[] {HttpServletRequest.class};
-    return (HttpServletRequest)Proxy.newProxyInstance(cl, interfaces, handler);
-  }
-
-  /**
-   * InvocationHandler for mock HttpServletRequest proxy.
-   */
-  private static class MockHttpServletRequestInvocationHandler implements InvocationHandler {
-    private Map parameters = new HashMap();
-    public Object invoke(Object proxy, Method method, Object[] args)
-            throws Throwable {
-      if (method.getName().equals("getParameter")) {
-        return parameters.get((String)args[0]);
-      }
-      return null;
-    }
-    public void setParameterMap(Map parameters) {
-      this.parameters = parameters;
-    }
-  }
-}
\ No newline at end of file
Index: src/test/org/apache/nutch/indexer/TestDeleteDuplicates.java
===================================================================
--- src/test/org/apache/nutch/indexer/TestDeleteDuplicates.java	(revision 959954)
+++ src/test/org/apache/nutch/indexer/TestDeleteDuplicates.java	(working copy)
@@ -1,204 +0,0 @@
-/*
-* Licensed to the Apache Software Foundation (ASF) under one or more
-* contributor license agreements.  See the NOTICE file distributed with
-* this work for additional information regarding copyright ownership.
-* The ASF licenses this file to You under the Apache License, Version 2.0
-* (the "License"); you may not use this file except in compliance with
-* the License.  You may obtain a copy of the License at
-*
-*     http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-package org.apache.nutch.indexer;
-
-import java.io.File;
-import java.util.Random;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.MD5Hash;
-import org.apache.lucene.document.DateTools;
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
-import org.apache.lucene.document.DateTools.Resolution;
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.IndexWriter;
-import org.apache.lucene.index.IndexWriter.MaxFieldLength;
-import org.apache.lucene.store.Directory;
-import org.apache.lucene.store.FSDirectory;
-import org.apache.nutch.analysis.NutchDocumentAnalyzer;
-import org.apache.nutch.util.NutchConfiguration;
-
-import junit.framework.TestCase;
-
-public class TestDeleteDuplicates extends TestCase {
-  Configuration conf;
-  FileSystem fs;
-  Path root;
-  Path index1;
-  Path index2;
-  Path index3;
-  Path index4;
-  Path index5;
-  
-  public void setUp() throws Exception {
-    conf = NutchConfiguration.create();
-    conf.set("fs.default.name", "file:///");
-    fs = FileSystem.get(conf);
-    root = new Path("build/test/dedup2-test-" + new Random().nextInt());
-    // create test indexes
-    index1 = createIndex("index1", true, 1.0f, 10L, false);
-    index2 = createIndex("index2", false, 2.0f, 20L, true);
-    index3 = createIndex("index3", true, 1.0f, 10L, true);
-    index4 = createSingleDocIndex("index4", 1.0f, 10L);
-    index5 = createSingleDocIndex("index5", 1.0f, 20L);
-  }
-  
-  private Path createIndex(String name, boolean hashDup, float inc, long time, boolean incFirst) throws Exception {
-    Path idx = new Path(root, name);
-    Path sub = new Path(idx, "part-0000");
-    Directory dir = FSDirectory.open(new File(sub.toString()));
-    IndexWriter writer = new IndexWriter(dir, new NutchDocumentAnalyzer(conf), true, 
-      MaxFieldLength.UNLIMITED);
-    Document doc = makeDoc(name,
-        MD5Hash.digest("1").toString(),
-        "http://www.example.com/1",
-        1.0f + (incFirst ? inc : 0.0f), time);
-    writer.addDocument(doc);
-    if (hashDup) {
-      doc = makeDoc(name,
-          MD5Hash.digest("1").toString(),
-          "http://www.example.com/2",
-          1.0f + (!incFirst ? inc : 0.0f), time + 1);
-    } else {
-      doc = makeDoc(name,
-          MD5Hash.digest("2").toString(),
-          "http://www.example.com/1",
-          1.0f + (!incFirst ? inc : 0.0f), time + 1);
-    }
-    writer.addDocument(doc);
-    writer.close();
-    return idx;
-  }
-  
-  private Path createSingleDocIndex(String name, float inc, long time) throws Exception {
-    Path idx = new Path(root, name);
-    Path sub = new Path(idx, "part-0000");
-    Directory dir = FSDirectory.open(new File(sub.toString()));
-    IndexWriter writer = new IndexWriter(dir, new NutchDocumentAnalyzer(conf), true, 
-      MaxFieldLength.UNLIMITED);
-    Document doc = makeDoc(name,
-        MD5Hash.digest("1").toString(),
-        "http://www.example.com/1",
-       1.0f + inc, time + 1);
-    writer.addDocument(doc);
-    writer.close();
-    return idx;
-  }
-  
-  private Document makeDoc(String segment, String digest, String url, float boost, long time) {
-    Document doc = new Document();
-    doc.add(new Field("segment", segment, Field.Store.YES, Field.Index.NO));
-    doc.add(new Field("digest", digest, Field.Store.YES, Field.Index.NO));
-    doc.add(new Field("url", url, Field.Store.YES, Field.Index.ANALYZED));
-    doc.setBoost(boost);
-    doc.add(new Field("boost", "" + boost, Field.Store.YES, Field.Index.NO));
-    doc.add(new Field("tstamp", DateTools.timeToString(time, Resolution.MILLISECOND), Field.Store.YES, Field.Index.NO));
-    return doc;
-  }
-  
-  public void tearDown() throws Exception {
-    fs.delete(root, true);
-  }
-
-  private void hashDuplicatesHelper(Path index, String url) throws Exception {
-    DeleteDuplicates dedup = new DeleteDuplicates(conf);
-    dedup.dedup(new Path[]{index});
-    FsDirectory dir = new FsDirectory(fs, new Path(index, "part-0000"), false, conf);
-    IndexReader reader = IndexReader.open(dir);
-    assertEquals("only one doc left", reader.numDocs(), 1);
-    for (int i = 0; i < reader.maxDoc(); i++) {
-      if (reader.isDeleted(i)) {
-        System.out.println("-doc " + i + " deleted");
-        continue;
-      }
-      Document doc = reader.document(i);
-      // make sure we got the right one
-      assertEquals("check url", url, doc.get("url"));
-      System.out.println(doc);
-    }
-    reader.close();
-  }
-  
-  public void testHashDuplicates() throws Exception {
-    hashDuplicatesHelper(index1, "http://www.example.com/2");
-    hashDuplicatesHelper(index3, "http://www.example.com/1");
-  }
-  
-  public void testUrlDuplicates() throws Exception {
-    DeleteDuplicates dedup = new DeleteDuplicates(conf);
-    dedup.dedup(new Path[]{index2});
-    FsDirectory dir = new FsDirectory(fs, new Path(index2, "part-0000"), false, conf);
-    IndexReader reader = IndexReader.open(dir);
-    assertEquals("only one doc left", reader.numDocs(), 1);
-    MD5Hash hash = MD5Hash.digest("2");
-    for (int i = 0; i < reader.maxDoc(); i++) {
-      if (reader.isDeleted(i)) {
-        System.out.println("-doc " + i + " deleted");
-        continue;
-      }
-      Document doc = reader.document(i);
-      // make sure we got the right one
-      assertEquals("check hash", hash.toString(), doc.get("digest"));
-      System.out.println(doc);
-    }
-    reader.close();
-  }
-  
-  public void testMixedDuplicates() throws Exception {
-    DeleteDuplicates dedup = new DeleteDuplicates(conf);
-    dedup.dedup(new Path[]{index1, index2});
-    FsDirectory dir = new FsDirectory(fs, new Path(index1, "part-0000"), false, conf);
-    IndexReader reader = IndexReader.open(dir);
-    assertEquals("only one doc left", reader.numDocs(), 1);
-    for (int i = 0; i < reader.maxDoc(); i++) {
-      if (reader.isDeleted(i)) {
-        System.out.println("-doc " + i + " deleted");
-        continue;
-      }
-      Document doc = reader.document(i);
-      // make sure we got the right one
-      assertEquals("check url", "http://www.example.com/2", doc.get("url"));
-      System.out.println(doc);
-    }
-    reader.close();
-    dir = new FsDirectory(fs, new Path(index2, "part-0000"), false, conf);
-    reader = IndexReader.open(dir);
-    assertEquals("only one doc left", reader.numDocs(), 1);
-    MD5Hash hash = MD5Hash.digest("2");
-    for (int i = 0; i < reader.maxDoc(); i++) {
-      if (reader.isDeleted(i)) {
-        System.out.println("-doc " + i + " deleted");
-        continue;
-      }
-      Document doc = reader.document(i);
-      // make sure we got the right one
-      assertEquals("check hash", hash.toString(), doc.get("digest"));
-      System.out.println(doc);
-    }
-    reader.close();
-  }
-  
-  public void testRededuplicate() throws Exception {
-    DeleteDuplicates dedup = new DeleteDuplicates(conf);
-    dedup.dedup(new Path[]{index4, index5});
-    dedup.dedup(new Path[]{index4, index5});
-  }
-  
-}
Index: src/test/org/apache/nutch/indexer/TestIndexSorter.java
===================================================================
--- src/test/org/apache/nutch/indexer/TestIndexSorter.java	(revision 959954)
+++ src/test/org/apache/nutch/indexer/TestIndexSorter.java	(working copy)
@@ -1,148 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.indexer;
-
-import java.io.File;
-
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileUtil;
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
-import org.apache.lucene.document.Field.Index;
-import org.apache.lucene.document.Field.Store;
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.IndexWriter;
-import org.apache.lucene.index.IndexWriter.MaxFieldLength;
-import org.apache.lucene.search.Similarity;
-import org.apache.lucene.store.Directory;
-import org.apache.lucene.store.FSDirectory;
-import org.apache.nutch.analysis.NutchDocumentAnalyzer;
-import org.apache.nutch.util.NutchConfiguration;
-
-import junit.framework.TestCase;
-
-public class TestIndexSorter extends TestCase {
-  private static final Log LOG = LogFactory.getLog(TestIndexSorter.class);
-  
-  private static final String INDEX_PLAIN = "index";
-  private static final String INDEX_SORTED = "index-sorted";
-  private static final int NUM_DOCS = 254;
-  private String[] fieldNames = new String[] {
-      "id",
-      "url",
-      "site",
-      "content",
-      "host",
-      "anchor",
-      "boost"
-  };
-  
-  Configuration conf = null;
-  File testDir = null;
-  Directory dir = null;
-  
-  
-  protected void setUp() throws Exception {
-    if (conf == null) conf = NutchConfiguration.create();
-    // create test index
-    testDir = new File("indexSorter-test-" + System.currentTimeMillis());
-    if (!testDir.mkdirs()) {
-      throw new Exception("Can't create test dir " + testDir.toString());
-    }
-    LOG.info("Creating test index: " + testDir.getAbsolutePath());
-    File plain = new File(testDir, INDEX_PLAIN);
-    Directory dir = FSDirectory.open(plain);
-    IndexWriter writer = new IndexWriter(dir, new NutchDocumentAnalyzer(conf), true,
-    		MaxFieldLength.UNLIMITED);
-    // create test documents
-    for (int i = 0; i < NUM_DOCS; i++) {
-      Document doc = new Document();
-      for (int k = 0; k < fieldNames.length; k++) {
-        Field f;
-        Store s;
-        Index ix;
-        String val = null;
-        if (fieldNames[k].equals("id")) {
-          s = Store.YES;
-          ix = Index.NOT_ANALYZED;
-          val = String.valueOf(i);
-        } else if (fieldNames[k].equals("host")) {
-          s = Store.YES;
-          ix = Index.NOT_ANALYZED;
-          val = "www.example" + i + ".com";
-        } else if (fieldNames[k].equals("site")) {
-          s = Store.NO;
-          ix = Index.NOT_ANALYZED;
-          val = "www.example" + i + ".com";
-        } else if (fieldNames[k].equals("content")) {
-          s = Store.NO;
-          ix = Index.ANALYZED;
-          val = "This is the content of the " + i + "-th document.";
-        } else if (fieldNames[k].equals("boost")) {
-          s = Store.YES;
-          ix = Index.NO;
-          // XXX note that this way we ensure different values of encoded boost
-          // XXX note also that for this reason we can't reliably test more than
-          // XXX 255 documents.
-          float boost = Similarity.decodeNorm((byte)(i + 1));
-          val = String.valueOf(boost);
-          doc.setBoost(boost);
-        } else {
-          s = Store.YES;
-          ix = Index.ANALYZED;
-          if (fieldNames[k].equals("anchor")) {
-            val = "anchors to " + i + "-th page.";
-          } else if (fieldNames[k].equals("url")) {
-            val = "http://www.example" + i + ".com/" + i + ".html";
-          }
-        }
-        f = new Field(fieldNames[k], val, s, ix);
-        doc.add(f);
-      }
-      writer.addDocument(doc);
-    }
-    writer.optimize();
-    writer.close();
-  }
-  
-  protected void tearDown() throws Exception {
-    FileUtil.fullyDelete(testDir);
-  }
-  
-  public void testSorting() throws Exception {
-    IndexSorter sorter = new IndexSorter(conf);
-    sorter.sort(testDir);
-    
-    // read back documents
-    IndexReader reader = IndexReader.open(FSDirectory.open(new File(testDir, INDEX_SORTED)));
-    assertEquals(reader.numDocs(), NUM_DOCS);
-    for (int i = 0; i < reader.maxDoc(); i++) {
-      Document doc = reader.document(i);
-      Field f = doc.getField("content");
-      assertNull(f);
-      f = doc.getField("boost");
-      float boost = Similarity.decodeNorm((byte)(NUM_DOCS - i));
-      String cmp = String.valueOf(boost);
-      assertEquals(cmp, f.stringValue());
-    }
-    reader.close();
-  }
-
-}
Index: src/test/org/apache/nutch/ontology/TestOntologyFactory.java
===================================================================
--- src/test/org/apache/nutch/ontology/TestOntologyFactory.java	(revision 959954)
+++ src/test/org/apache/nutch/ontology/TestOntologyFactory.java	(working copy)
@@ -1,49 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.ontology;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.plugin.PluginRuntimeException;
-import org.apache.nutch.util.NutchConfiguration;
-
-import junit.framework.TestCase;
-
-public class TestOntologyFactory extends TestCase {
-
-  private Configuration conf;
-  
-  protected void setUp() throws Exception {
-      conf = NutchConfiguration.create();
-      conf.set("plugin.includes", ".*");
-  }
-
-  public void testGetOntology() {
-    OntologyFactory factory=new OntologyFactory(conf);
-    
-    try {
-      Ontology ontology1=factory.getOntology();
-      Ontology ontology2=factory.getOntology();
-      assertNotNull(ontology1);
-      assertNotNull(ontology2);
-      //Current implementation creates new object instance in every call
-      //TODO: check if this is required  
-      assertNotSame(ontology1, ontology2);
-    } catch (PluginRuntimeException e) {
-      fail("should not trow:" + e);
-    }
-  }
-}
Index: src/java/org/apache/nutch/clustering/OnlineClustererFactory.java
===================================================================
--- src/java/org/apache/nutch/clustering/OnlineClustererFactory.java	(revision 959954)
+++ src/java/org/apache/nutch/clustering/OnlineClustererFactory.java	(working copy)
@@ -1,116 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.clustering;
-
-// Commons Logging imports
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-
-import org.apache.nutch.plugin.*;
-import org.apache.hadoop.conf.Configuration;
-
-/**
- * A factory for retrieving {@link OnlineClusterer} extensions.
- *
- * @author Dawid Weiss
- * @version $Id$
- */
-public class OnlineClustererFactory {
-  public static final Log LOG = LogFactory.getLog(OnlineClustererFactory.class);
-  
-  /**
-   * Nutch configuration key specifying a particular clustering extension
-   * to use. 
-   */
-  private final static String CONFIG_FIELD_NAME = "extension.clustering.extension-name";
-
-  /**
-   * An {@link ExtensionPoint} pointing to {@link OnlineClusterer}. 
-   */
-  private ExtensionPoint extensionPoint;
-  
-  /**
-   * Default clustering extension implementation retrieved from the
-   * configuration file or <code>null</code> if the default (first encountered extension)
-   * is to be used.
-   */
-  private String extensionName;
-
-  /**
-   * Create an instance of the clustering factory bound to
-   * a given configuration.
-   */
-  public OnlineClustererFactory(Configuration conf) {
-      this.extensionPoint = PluginRepository.get(conf).getExtensionPoint(OnlineClusterer.X_POINT_ID);
-      this.extensionName = conf.get(CONFIG_FIELD_NAME);
-  }
-
-  /**
-  * @return Returns the online clustering extension specified
-  * in nutch configuration (key name is <code>extension.clustering.extension-name</code>). 
-  * If the name is empty (no preference), the first available clustering extension is
-  * returned.
-  */
-  public OnlineClusterer getOnlineClusterer()
-    throws PluginRuntimeException {
-
-    if (this.extensionPoint == null) {
-      // not even an extension point defined.
-      return null;
-    }
-    
-    if (extensionName != null) {
-      Extension extension = findExtension(extensionName);
-      if (extension != null) {
-        if (LOG.isInfoEnabled()) {
-          LOG.info("Using clustering extension: " + extensionName);
-        }
-        return (OnlineClusterer) extension.getExtensionInstance();
-      }
-      if (LOG.isWarnEnabled()) {
-        LOG.warn("Clustering extension not found: '" + extensionName +
-                 "', trying the default");
-      }
-      // not found, fallback to the default, if available.
-    }
-
-    final Extension[] extensions = this.extensionPoint.getExtensions();
-    if (extensions.length > 0) {
-      if (LOG.isInfoEnabled()) {
-        LOG.info("Using the first clustering extension found: " +
-                 extensions[0].getId());
-      }
-      return (OnlineClusterer) extensions[0].getExtensionInstance();
-    } else {
-      return null;
-    }
-  }
-
-  private Extension findExtension(String name)
-    throws PluginRuntimeException {
-
-    final Extension[] extensions = this.extensionPoint.getExtensions();
-    for (int i = 0; i < extensions.length; i++) {
-      final Extension extension = extensions[i];
-      if (name.equals(extension.getId()))
-        return extension;
-    }
-    return null;
-  }
-
-} 
Index: src/java/org/apache/nutch/clustering/HitsCluster.java
===================================================================
--- src/java/org/apache/nutch/clustering/HitsCluster.java	(revision 959954)
+++ src/java/org/apache/nutch/clustering/HitsCluster.java	(working copy)
@@ -1,66 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.clustering;
-
-import org.apache.nutch.searcher.HitDetails;
-
-/**
- * An interface representing a group (cluster) of related hits.
- * 
- * <p>If {@link #isJunkCluster()} method returns <code>true</code>
- * then this cluster contains documents that are grouped together,
- * but no clear semantic relation has been detected. Such clusters may
- * be hidden in the user interface layer, unless someone wishes to
- * see an explicit group of documents that didn't belong anywhere else.</p>
- *
- * @author Dawid Weiss
- * @version $Id$
- */
-public interface HitsCluster {
-  /**
-   * @return Returns an array of {@link HitsCluster} objects
-   * that are sub-groups of the current group, or <code>null</code>
-   * if this cluster has no sub-groups.
-   */
-  public HitsCluster [] getSubclusters();
-  
-  /**
-   * @return Returns a relevance-ordered array of the hits belonging
-   * to this cluster or <code>null</code> if this cluster
-   * has no associated documents (it may have subclusters only).
-   */
-  public HitDetails[] getHits();
-  
-  /**
-   * @return Returns an array of labels for this cluster. The labels should
-   * be sorted according to their relevance to the cluster's content. Not
-   * all of the labels must be displayed - the application is free to
-   * set a cutoff threshold and display only the topmost labels. 
-   */
-  public String[] getDescriptionLabels();
-
-  /**
-   * Returns <code>true</code> if this cluster constains documents
-   * that did not fit anywhere else (presentation layer may
-   * discard such clusters). 
-   * 
-   * <p>Subclusters of this cluster are also junk clusters, even if
-   * they don't have this property set to <code>true</code></p>
-   */
-  public boolean isJunkCluster();
-}
Index: src/java/org/apache/nutch/clustering/OnlineClusterer.java
===================================================================
--- src/java/org/apache/nutch/clustering/OnlineClusterer.java	(revision 959954)
+++ src/java/org/apache/nutch/clustering/OnlineClusterer.java	(working copy)
@@ -1,59 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.clustering;
-
-// Nutch imports
-import org.apache.nutch.plugin.Pluggable;
-import org.apache.nutch.searcher.HitDetails;
-
-
-/**
- * An extension point interface for online search results clustering
- * algorithms.
- *
- * <p>By the term <b>online</b> search results clustering we will understand
- * a clusterer that works on a set of {@link HitDetails} retrieved for a
- * query and able to produce a set of {@link HitsCluster} that can be displayed
- * to help the user gain more insight in the topics found in the result.</p>
- *
- * <p>Other clustering options include predefined categories and off-line
- * preclustered groups, but I do not investigate those any further here.</p>
- *
- * @author Dawid Weiss
- * @version $Id$
- */
-public interface OnlineClusterer extends Pluggable {
-  /** The name of the extension point. */
-  public final static String X_POINT_ID = OnlineClusterer.class.getName();
-
-  /**
-   * Clusters an array of hits ({@link HitDetails} objects) and
-   * their previously extracted summaries (<code>String</code>s).
-   * 
-   * <p>Arguments to this method may seem to be very low-level, but
-   * in fact they are side products of a regular search process, 
-   * so we simply reuse them instead of duplicating part of the usual
-   * Nutch functionality. Other ideas are welcome.</p>
-   * 
-   * <p>This method must be thread-safe (many threads may invoke
-   * it concurrently on the same instance of a clusterer).</p>
-   * 
-   * @return A set of {@link HitsCluster} objects.
-   */
-  public HitsCluster [] clusterHits(HitDetails [] hitDetails, String [] descriptions);
-}
Index: src/java/org/apache/nutch/analysis/TokenMgrError.java
===================================================================
--- src/java/org/apache/nutch/analysis/TokenMgrError.java	(revision 959954)
+++ src/java/org/apache/nutch/analysis/TokenMgrError.java	(working copy)
@@ -1,134 +0,0 @@
-/* Generated By:JavaCC: Do not edit this line. TokenMgrError.java Version 3.0 */
-package org.apache.nutch.analysis;
-
-@SuppressWarnings("serial")
-class TokenMgrError extends Error
-{
-   /*
-    * Ordinals for various reasons why an Error of this type can be thrown.
-    */
-
-   /**
-    * Lexical error occured.
-    */
-   static final int LEXICAL_ERROR = 0;
-
-   /**
-    * An attempt wass made to create a second instance of a static token manager.
-    */
-   static final int STATIC_LEXER_ERROR = 1;
-
-   /**
-    * Tried to change to an invalid lexical state.
-    */
-   static final int INVALID_LEXICAL_STATE = 2;
-
-   /**
-    * Detected (and bailed out of) an infinite loop in the token manager.
-    */
-   static final int LOOP_DETECTED = 3;
-
-   /**
-    * Indicates the reason why the exception is thrown. It will have
-    * one of the above 4 values.
-    */
-   int errorCode;
-
-   /**
-    * Replaces unprintable characters by their espaced (or unicode escaped)
-    * equivalents in the given string
-    */
-   protected static final String addEscapes(String str) {
-      StringBuffer retval = new StringBuffer();
-      char ch;
-      for (int i = 0; i < str.length(); i++) {
-        switch (str.charAt(i))
-        {
-           case 0 :
-              continue;
-           case '\b':
-              retval.append("\\b");
-              continue;
-           case '\t':
-              retval.append("\\t");
-              continue;
-           case '\n':
-              retval.append("\\n");
-              continue;
-           case '\f':
-              retval.append("\\f");
-              continue;
-           case '\r':
-              retval.append("\\r");
-              continue;
-           case '\"':
-              retval.append("\\\"");
-              continue;
-           case '\'':
-              retval.append("\\\'");
-              continue;
-           case '\\':
-              retval.append("\\\\");
-              continue;
-           default:
-              if ((ch = str.charAt(i)) < 0x20 || ch > 0x7e) {
-                 String s = "0000" + Integer.toString(ch, 16);
-                 retval.append("\\u" + s.substring(s.length() - 4, s.length()));
-              } else {
-                 retval.append(ch);
-              }
-              continue;
-        }
-      }
-      return retval.toString();
-   }
-
-   /**
-    * Returns a detailed message for the Error when it is thrown by the
-    * token manager to indicate a lexical error.
-    * Parameters : 
-    *    EOFSeen     : indicates if EOF caused the lexicl error
-    *    curLexState : lexical state in which this error occured
-    *    errorLine   : line number when the error occured
-    *    errorColumn : column number when the error occured
-    *    errorAfter  : prefix that was seen before this error occured
-    *    curchar     : the offending character
-    * Note: You can customize the lexical error message by modifying this method.
-    */
-   protected static String LexicalError(boolean EOFSeen, int lexState, int errorLine, int errorColumn, String errorAfter, char curChar) {
-      return("Lexical error at line " +
-           errorLine + ", column " +
-           errorColumn + ".  Encountered: " +
-           (EOFSeen ? "<EOF> " : ("\"" + addEscapes(String.valueOf(curChar)) + "\"") + " (" + (int)curChar + "), ") +
-           "after : \"" + addEscapes(errorAfter) + "\"");
-   }
-
-   /**
-    * You can also modify the body of this method to customize your error messages.
-    * For example, cases like LOOP_DETECTED and INVALID_LEXICAL_STATE are not
-    * of end-users concern, so you can return something like : 
-    *
-    *     "Internal Error : Please file a bug report .... "
-    *
-    * from this method for such cases in the release version of your parser.
-    */
-   public String getMessage() {
-      return super.getMessage();
-   }
-
-   /*
-    * Constructors of various flavors follow.
-    */
-
-   public TokenMgrError() {
-   }
-
-   public TokenMgrError(String message, int reason) {
-      super(message);
-      errorCode = reason;
-   }
-
-   public TokenMgrError(boolean EOFSeen, int lexState, int errorLine, int errorColumn, String errorAfter, char curChar, int reason) {
-      this(LexicalError(EOFSeen, lexState, errorLine, errorColumn, errorAfter, curChar), reason);
-   }
-}
Index: src/java/org/apache/nutch/analysis/NutchAnalysis.jj
===================================================================
--- src/java/org/apache/nutch/analysis/NutchAnalysis.jj	(revision 959954)
+++ src/java/org/apache/nutch/analysis/NutchAnalysis.jj	(working copy)
@@ -1,382 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/** JavaCC code for the Nutch lexical analyzer. */
-
-options {
-  STATIC = false;
-  USER_CHAR_STREAM = true;
-  OPTIMIZE_TOKEN_MANAGER = true;
-  UNICODE_INPUT = true;
-//DEBUG_TOKEN_MANAGER = true;
-}
-
-PARSER_BEGIN(NutchAnalysis)
-
-package org.apache.nutch.analysis;
-
-import java.io.StringReader;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.searcher.Query;
-import org.apache.nutch.searcher.QueryFilters;
-import org.apache.nutch.searcher.Query.Clause;
-import org.apache.nutch.util.NutchConfiguration;
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.StopFilter;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
-
-import java.io.*;
-import java.util.*;
-
-/** The JavaCC-generated Nutch lexical analyzer and query parser. */
-public class NutchAnalysis {
-
-  private static final String[] STOP_WORDS = {
-    "a", "and", "are", "as", "at", "be", "but", "by",
-    "for", "if", "in", "into", "is", "it",
-    "no", "not", "of", "on", "or", "s", "such",
-    "t", "that", "the", "their", "then", "there", "these",
-    "they", "this", "to", "was", "will", "with"
-  };
-
-  private static final Set STOP_SET = StopFilter.makeStopSet(STOP_WORDS);
-
-  private Analyzer analyzer = null;
-  private String queryString;
-  private QueryFilters queryFilters;
-  
-
-  /** Constructs a nutch analysis. */
-  public NutchAnalysis(String query, Analyzer analyzer) {
-    this(new FastCharStream(new StringReader(query)));
-    this.analyzer = analyzer;
-  }
-
-  /** True iff word is a stop word.  Stop words are only removed from queries.
-   * Every word is indexed.  */
-  public static boolean isStopWord(String word) {
-    return STOP_SET.contains(word);
-  }
-
-  /** Construct a query parser for the text in a reader. */
-  public static Query parseQuery(String queryString, Configuration conf) throws IOException {
-    return parseQuery(queryString, null, conf);
-  }
-
-  /** Construct a query parser for the text in a reader. */
-  public static Query parseQuery(String queryString, Analyzer analyzer, Configuration conf)
-    throws IOException {
-    NutchAnalysis parser = new NutchAnalysis(
-          queryString, (analyzer != null) ? analyzer : new NutchDocumentAnalyzer(conf));
-    parser.queryString = queryString;
-    parser.queryFilters = new QueryFilters(conf);
-    return parser.parse(conf);
-  }
-
-  /** For debugging. */
-  public static void main(String[] args) throws Exception {
-    BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
-    while (true) {
-      System.out.print("Query: ");
-      String line = in.readLine();
-      System.out.println(parseQuery(line, NutchConfiguration.create()));
-    }
-  }
-
-}
-
-PARSER_END(NutchAnalysis)
-
-TOKEN_MGR_DECLS : {
-
-  /** Constructs a token manager for the provided Reader. */
-  public NutchAnalysisTokenManager(Reader reader) {
-    this(new FastCharStream(reader));
-  }
-
-}
-
-TOKEN : {					  // token regular expressions
-
-  // basic word -- lowercase it
-<WORD: ((<LETTER>|<DIGIT>|<WORD_PUNCT>)+ | <IRREGULAR_WORD>)>
-  { matchedToken.image = matchedToken.image.toLowerCase(); }
-
-  // special handling for acronyms: U.S.A., I.B.M., etc: dots are removed
-| <ACRONYM: <LETTER> "." (<LETTER> ".")+ > 
-    {                                             // remove dots
-      for (int i = 0; i < image.length(); i++) {
-	if (image.charAt(i) == '.')
-	  image.deleteCharAt(i--);
-      }
-      matchedToken.image = image.toString().toLowerCase();
-    }
-
-  // chinese, japanese and korean characters
-| <SIGRAM: <CJK> >
-
-   // irregular words
-| <#IRREGULAR_WORD: (<C_PLUS_PLUS>|<C_SHARP>)>
-| <#C_PLUS_PLUS: ("C"|"c") "++" >
-| <#C_SHARP: ("C"|"c") "#" >
-
-  // query syntax characters
-| <PLUS: "+" >
-| <MINUS: "-" >
-| <QUOTE: "\"" >
-| <COLON: ":" >
-| <SLASH: "/" >
-| <DOT: "." >
-| <ATSIGN: "@" >
-| <APOSTROPHE: "'" >
-
-| <WHITE: ~[] >                                   // treat unrecognized chars
-                                                  // as whitespace
-// primitive, non-token patterns
-
-| <#WORD_PUNCT: ("_"|"&")>                        // allowed anywhere in words
-
-| < #LETTER:					  // alphabets
-    [
-        "\u0041"-"\u005a",
-        "\u0061"-"\u007a",
-        "\u00c0"-"\u00d6",
-        "\u00d8"-"\u00f6",
-        "\u00f8"-"\u00ff",
-        "\u0100"-"\u1fff"
-    ]
-    >
-
-|  <#CJK:                                        // non-alphabets
-      [
-       "\u3040"-"\u318f",
-       "\u3300"-"\u337f",
-       "\u3400"-"\u3d2d",
-       "\u4e00"-"\u9fff",
-       "\uf900"-"\ufaff"
-      ]
-    >    
-
-| < #DIGIT:					  // unicode digits
-      [
-       "\u0030"-"\u0039",
-       "\u0660"-"\u0669",
-       "\u06f0"-"\u06f9",
-       "\u0966"-"\u096f",
-       "\u09e6"-"\u09ef",
-       "\u0a66"-"\u0a6f",
-       "\u0ae6"-"\u0aef",
-       "\u0b66"-"\u0b6f",
-       "\u0be7"-"\u0bef",
-       "\u0c66"-"\u0c6f",
-       "\u0ce6"-"\u0cef",
-       "\u0d66"-"\u0d6f",
-       "\u0e50"-"\u0e59",
-       "\u0ed0"-"\u0ed9",
-       "\u1040"-"\u1049"
-      ]
-  >
-
-}
-
-
-/** Parse a query. */
-Query parse(Configuration conf) :
-{
-  Query query = new Query(conf);
-  ArrayList terms;
-  Token token;
-  String field;
-  boolean stop;
-  boolean prohibited;
-
-}
-{
-  nonOpOrTerm()                                   // skip noise
-  (
-    { stop=true; prohibited=false; field = Clause.DEFAULT_FIELD; }
-
-                                                  // optional + or - operator
-    ( <PLUS> {stop=false;} | (<MINUS> { stop=false;prohibited=true; } ))?
-
-                                                  // optional field spec.
-    ( LOOKAHEAD(<WORD><COLON>(phrase(field)|compound(field)))
-      token=<WORD> <COLON> { field = token.image; } )?
-
-    ( terms=phrase(field) {stop=false;} |         // quoted terms or
-      terms=compound(field))                      // single or compound term
-
-    nonOpOrTerm()                                 // skip noise
-
-    {
-      String[] array = (String[])terms.toArray(new String[terms.size()]);
-
-      if (stop
-          && field == Clause.DEFAULT_FIELD
-          && terms.size()==1
-          && isStopWord(array[0])) {
-        // ignore stop words only when single, unadorned terms in default field
-      } else {
-        if (prohibited)
-          query.addProhibitedPhrase(array, field);
-        else
-          query.addRequiredPhrase(array, field);
-      }
-    }
-  )*
-  
-  { return query; }
-
-}
-
-/** Parse an explcitly quoted phrase query.  Note that this may return a single
- * term, a trivial phrase.*/
-ArrayList phrase(String field) :
-{
-  int start;
-  int end;
-  ArrayList result = new ArrayList();
-  String term;
-}
-{
-  <QUOTE>
-
-  { start = token.endColumn; }
-  
-  (nonTerm())*                                    // skip noise
-  ( term = term() { result.add(term); }           // parse a term
-    (nonTerm())*)*                                // skip noise
-
-  { end = token.endColumn; }
-
-  (<QUOTE>|<EOF>)
-    
-  {
-    if (this.queryFilters.isRawField(field)) {
-      result.clear();
-      result.add(queryString.substring(start, end));
-    }
-    return result;
-  }
-
-}
-
-/** Parse a compound term that is interpreted as an implicit phrase query.
- * Compounds are a sequence of terms separated by infix characters.  Note that
- * this may return a single term, a trivial compound. */
-ArrayList compound(String field) :
-{
-  int start;
-  ArrayList result = new ArrayList();
-  String term;
-  StringBuffer terms = new StringBuffer();
-}
-{
-  { start = token.endColumn; }
-
-  term = term() {
-    terms.append(term).append(" ");
-    //result.add(term);
-  }
-  ( LOOKAHEAD( (infix())+ term() )
-    (infix())+
-    term = term() {
-      terms.append(term).append(" ");
-      //result.add(term);
-    })*
-
-  {
-    if (this.queryFilters.isRawField(field)) {
-//      result.clear();
-      result.add(queryString.substring(start, token.endColumn));
-
-    } else {
-      TokenStream tokens = analyzer.tokenStream(
-                              field, new StringReader(terms.toString()));
-
-      TermAttribute ta = tokens.getAttribute(TermAttribute.class);
-      try
-      {
-        String termText;
-        while (tokens.incrementToken())
-        {
-          if ((termText = ta.term()) == null)
-            break;
-          result.add(termText);
-        }
-      } catch (IOException e) {
-        // ignore (?)
-      }
-//
-      try {
-        tokens.close();
-      } catch (IOException e) {
-        // ignore
-      }
-    }
-    return result;
-  }
-
-}
-
-/** Parse a single term. */
-String term() :
-{
-  Token token;
-}
-{
-  ( token=<WORD> | token=<ACRONYM> | token=<SIGRAM>)
-
-  { return token.image; }
-}
-
-
-/** Parse anything but a term or a quote. */
-void nonTerm() :
-{}
-{
-  <WHITE> | infix()
-}
-
-void nonTermOrEOF() :
-{}
-{
-  nonTerm() | <EOF>
-}
-
-/** Parse anything but a term or an operator (plur or minus or quote). */
-void nonOpOrTerm() :
-{}
-{
-  (LOOKAHEAD(2) (<WHITE> | nonOpInfix() | ((<PLUS>|<MINUS>) nonTermOrEOF())))*
-}
-
-/** Characters which can be used to form compound terms. */
-void infix() :
-{}
-{
-  <PLUS> | <MINUS> | nonOpInfix()
-}
-
-/** Parse infix characters except plus and minus. */
-void nonOpInfix() :
-{}
-{
-  <COLON>|<SLASH>|<DOT>|<ATSIGN>|<APOSTROPHE>
-}
-
Index: src/java/org/apache/nutch/analysis/FastCharStream.java
===================================================================
--- src/java/org/apache/nutch/analysis/FastCharStream.java	(revision 959954)
+++ src/java/org/apache/nutch/analysis/FastCharStream.java	(working copy)
@@ -1,121 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.analysis;
-
-import java.io.*;
-
-/** An efficient implementation of JavaCC's CharStream interface.  <p>Note that
- * this does not do line-number counting, but instead keeps track of the
- * character position of the token in the input, as required by Lucene's {@link
- * org.apache.lucene.analysis.Token} API. */
-final class FastCharStream implements CharStream {
-  char[] buffer = null;
-
-  int bufferLength = 0;				  // end of valid chars
-  int bufferPosition = 0;			  // next char to read
-  
-  int tokenStart = 0;				  // offset in buffer
-  int bufferStart = 0;				  // position in file of buffer
-
-  Reader input;					  // source of chars
-
-  /** Constructs from a Reader. */
-  public FastCharStream(Reader r) {
-    input = r;
-  }
-
-  public final char readChar() throws IOException {
-    if (bufferPosition >= bufferLength)
-      refill();
-    return buffer[bufferPosition++];
-  }
-
-  private final void refill() throws IOException {
-    int newPosition = bufferLength - tokenStart;
-
-    if (tokenStart == 0) {			  // token won't fit in buffer
-      if (buffer == null) {			  // first time: alloc buffer
-	buffer = new char[2048];		  
-      } else if (bufferLength == buffer.length) { // grow buffer
-	char[] newBuffer = new char[buffer.length*2];
-	System.arraycopy(buffer, 0, newBuffer, 0, bufferLength);
-	buffer = newBuffer;
-      }
-    } else {					  // shift token to front
-      System.arraycopy(buffer, tokenStart, buffer, 0, newPosition);
-    }
-
-    bufferLength = newPosition;			  // update state
-    bufferPosition = newPosition;
-    bufferStart += tokenStart;
-    tokenStart = 0;
-
-    int charsRead =				  // fill space in buffer
-      input.read(buffer, newPosition, buffer.length-newPosition);
-    if (charsRead == -1)
-      throw new IOException("read past eof");
-    else
-      bufferLength += charsRead;
-  }
-
-  public final char BeginToken() throws IOException {
-    tokenStart = bufferPosition;
-    return readChar();
-  }
-
-  public final void backup(int amount) {
-    bufferPosition -= amount;
-  }
-
-  public final String GetImage() {
-    return new String(buffer, tokenStart, bufferPosition - tokenStart);
-  }
-
-  public final char[] GetSuffix(int len) {
-    char[] value = new char[len];
-    System.arraycopy(buffer, bufferPosition - len, value, 0, len);
-    return value;
-  }
-
-  public final void Done() {
-    try {
-      input.close();
-    } catch (IOException e) {
-      System.err.println("Caught: " + e + "; ignoring.");
-    }
-  }
-
-  public final int getColumn() {
-    return bufferStart + bufferPosition;
-  }
-  public final int getLine() {
-    return 1;
-  }
-  public final int getEndColumn() {
-    return bufferStart + bufferPosition;
-  }
-  public final int getEndLine() {
-    return 1;
-  }
-  public final int getBeginColumn() {
-    return bufferStart + tokenStart;
-  }
-  public final int getBeginLine() {
-    return 1;
-  }
-}
Index: src/java/org/apache/nutch/analysis/TokenManager.java
===================================================================
--- src/java/org/apache/nutch/analysis/TokenManager.java	(revision 959954)
+++ src/java/org/apache/nutch/analysis/TokenManager.java	(working copy)
@@ -1,17 +0,0 @@
-/* Generated By:JavaCC: Do not edit this line. TokenManager.java Version 2.1 */
-package org.apache.nutch.analysis;
-
-/**
- * An implementation for this interface is generated by
- * JavaCCParser.  The user is free to use any implementation
- * of their choice.
- */
-
-interface TokenManager {
-
-  /** This gets the next token from the input stream.
-   *  A token of kind 0 (<EOF>) should be returned on EOF.
-   */
-  public Token getNextToken();
-
-}
Index: src/java/org/apache/nutch/analysis/NutchDocumentTokenizer.java
===================================================================
--- src/java/org/apache/nutch/analysis/NutchDocumentTokenizer.java	(revision 959954)
+++ src/java/org/apache/nutch/analysis/NutchDocumentTokenizer.java	(working copy)
@@ -1,110 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.analysis;
-
-import java.io.*;
-
-import org.apache.lucene.analysis.Token;
-import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.tokenattributes.*;
-
-/** The tokenizer used for Nutch document text.  Implemented in terms of our
- * JavaCC-generated lexical analyzer, {@link NutchAnalysisTokenManager}, shared
- * with the query parser.
- */
-public final class NutchDocumentTokenizer extends Tokenizer
-  implements NutchAnalysisConstants {
-
-  private final NutchAnalysisTokenManager tokenManager;
-
-  private final TermAttribute termAtt;
-  private final PositionIncrementAttribute posIncrAtt;
-  private final TypeAttribute typeAtt;
-  private final OffsetAttribute offsetAtt;
-
-  /** Construct a tokenizer for the text in a Reader. */
-  public NutchDocumentTokenizer(Reader reader) {
-    super(reader);
-
-    tokenManager = new NutchAnalysisTokenManager(reader); 
-    this.termAtt = addAttribute(TermAttribute.class);
-    this.offsetAtt = addAttribute(OffsetAttribute.class);
-    this.posIncrAtt = addAttribute(PositionIncrementAttribute.class);
-    this.typeAtt = addAttribute(TypeAttribute.class);
-  }
-
-  /** Returns the next token in the stream, or null at EOF. */
-  private final Token next() throws IOException {
-    org.apache.nutch.analysis.Token t;
-
-    try {
-      loop: {
-        while (true) {
-          t = tokenManager.getNextToken();
-          switch (t.kind) {                       // skip query syntax tokens
-          case EOF: case WORD: case ACRONYM: case SIGRAM:
-            break loop;
-          default:
-          }
-        }
-      }
-    } catch (TokenMgrError e) {                   // translate exceptions
-      throw new IOException("Tokenizer error:" + e);
-    }
-
-    if (t.kind == EOF)                            // translate tokens
-      return null;
-    else {
-      return new Token(t.image,t.beginColumn,t.endColumn,tokenImage[t.kind]);
-    }
-  }
-
-  /** Lucene 3.0 API. */
-  public boolean incrementToken() throws IOException
-  {
-    clearAttributes();
-
-    final Token t = next();
-    if (t != null) {
-      termAtt.setTermBuffer(t.termBuffer(), 0, t.termLength());
-      offsetAtt.setOffset(t.startOffset(), t.endOffset());
-      posIncrAtt.setPositionIncrement(t.getPositionIncrement());
-      typeAtt.setType(t.type());
-      return true;
-    } else {
-      return false;
-    }
-  }
-
-  /** For debugging. */
-  public static void main(String[] args) throws Exception {
-    BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
-    while (true) {
-      System.out.print("Text: ");
-      String line = in.readLine();
-      Tokenizer tokenizer = new NutchDocumentTokenizer(new StringReader(line));
-      TermAttribute termAtt = tokenizer.getAttribute(TermAttribute.class);
-      System.out.print("Tokens: ");
-      while (tokenizer.incrementToken()) {
-        System.out.print(termAtt.term());
-        System.out.print(" ");
-      }
-      System.out.println();
-    }
-  }
-}
Index: src/java/org/apache/nutch/analysis/AnalyzerFactory.java
===================================================================
--- src/java/org/apache/nutch/analysis/AnalyzerFactory.java	(revision 959954)
+++ src/java/org/apache/nutch/analysis/AnalyzerFactory.java	(working copy)
@@ -1,124 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.analysis;
-
-// Commons Logging imports
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-
-// Nutch imports
-import org.apache.nutch.plugin.Extension;
-import org.apache.nutch.plugin.ExtensionPoint;
-import org.apache.nutch.plugin.PluginRuntimeException;
-import org.apache.nutch.plugin.PluginRepository;
-import org.apache.nutch.util.ObjectCache;
-import org.apache.hadoop.conf.Configuration;
-
-
-/**
- * Creates and caches {@link NutchAnalyzer} plugins.
- *
- * @author J&eacute;r&ocirc;me Charron
- */
-public class AnalyzerFactory {
-
-  private final static String KEY = AnalyzerFactory.class.getName();
-  
-  public final static Log LOG = LogFactory.getLog(KEY);
-
-  
-  private NutchAnalyzer DEFAULT_ANALYZER;
-  
-  private ExtensionPoint extensionPoint;
-  private Configuration conf;
-
-  public AnalyzerFactory (Configuration conf) {
-      DEFAULT_ANALYZER = new NutchDocumentAnalyzer(conf);
-      this.conf = conf;
-      this.extensionPoint = PluginRepository.get(conf).getExtensionPoint(NutchAnalyzer.X_POINT_ID);
-      if(this.extensionPoint == null) {
-          throw new RuntimeException("x point " + NutchAnalyzer.X_POINT_ID +
-          " not found.");
-      }
-  }
-
-  public static AnalyzerFactory get(Configuration conf) {
-    ObjectCache objectCache = ObjectCache.get(conf);
-    AnalyzerFactory factory = (AnalyzerFactory) objectCache.getObject(KEY);
-    if (factory == null) {
-      factory = new AnalyzerFactory(conf);
-      objectCache.setObject(KEY, factory);
-    }
-    return factory;
-  }
-  
-  /**
-   * Returns the appropriate {@link NutchAnalyzer analyzer} implementation
-   * given a language code.
-   *
-   * <p>NutchAnalyzer extensions should define the attribute "lang". The first
-   * plugin found whose "lang" attribute equals the specified lang parameter is
-   * used. If none match, then the {@link NutchDocumentAnalyzer} is used.
-   */
-  public NutchAnalyzer get(String lang) {
-
-    NutchAnalyzer analyzer = DEFAULT_ANALYZER;
-    Extension extension = getExtension(lang);
-    if (extension != null) {
-        try {
-            analyzer = (NutchAnalyzer) extension.getExtensionInstance();
-        } catch (PluginRuntimeException pre) {
-            analyzer = DEFAULT_ANALYZER;
-        }
-    }
-    return analyzer;
-  }
-
-  private Extension getExtension(String lang) {
-    ObjectCache objectCache = ObjectCache.get(conf);
-    if (lang == null) { return null; }
-    Extension extension = (Extension) objectCache.getObject(lang);
-    if (extension == null) {
-      extension = findExtension(lang);
-      if (extension != null) {
-        objectCache.setObject(lang, extension);
-      }
-    }
-    return extension;
-  }
-
-  private Extension findExtension(String lang) {
-
-    if (lang != null) {
-      Extension[] extensions = this.extensionPoint.getExtensions();
-      for (int i=0; i<extensions.length; i++) {
-        if (lang.equals(extensions[i].getAttribute("lang"))) {
-          return extensions[i];
-        }
-      }
-    }
-    return null;
-  }
-
-  /** 
-   * Method used by unit test
-   */
-  protected NutchAnalyzer getDefault() {
-    return DEFAULT_ANALYZER;
-  }
-
-}
Index: src/java/org/apache/nutch/analysis/package.html
===================================================================
--- src/java/org/apache/nutch/analysis/package.html	(revision 959954)
+++ src/java/org/apache/nutch/analysis/package.html	(working copy)
@@ -1,5 +0,0 @@
-<html>
-<body>
-Tokenizer for documents and query parser.
-</body>
-</html>
Index: src/java/org/apache/nutch/analysis/Token.java
===================================================================
--- src/java/org/apache/nutch/analysis/Token.java	(revision 959954)
+++ src/java/org/apache/nutch/analysis/Token.java	(working copy)
@@ -1,123 +0,0 @@
-/* Generated By:JavaCC: Do not edit this line. Token.java Version 4.1 */
-package org.apache.nutch.analysis;
-
-/**
- * Describes the input token stream.
- */
-
-class Token {
-
-  /**
-   * An integer that describes the kind of this token.  This numbering
-   * system is determined by JavaCCParser, and a table of these numbers is
-   * stored in the file ...Constants.java.
-   */
-  public int kind;
-
-  /** The line number of the first character of this Token. */
-  public int beginLine;
-  /** The column number of the first character of this Token. */
-  public int beginColumn;
-  /** The line number of the last character of this Token. */
-  public int endLine;
-  /** The column number of the last character of this Token. */
-  public int endColumn;
-
-  /**
-   * The string image of the token.
-   */
-  public String image;
-
-  /**
-   * A reference to the next regular (non-special) token from the input
-   * stream.  If this is the last token from the input stream, or if the
-   * token manager has not read tokens beyond this one, this field is
-   * set to null.  This is true only if this token is also a regular
-   * token.  Otherwise, see below for a description of the contents of
-   * this field.
-   */
-  public Token next;
-
-  /**
-   * This field is used to access special tokens that occur prior to this
-   * token, but after the immediately preceding regular (non-special) token.
-   * If there are no such special tokens, this field is set to null.
-   * When there are more than one such special token, this field refers
-   * to the last of these special tokens, which in turn refers to the next
-   * previous special token through its specialToken field, and so on
-   * until the first special token (whose specialToken field is null).
-   * The next fields of special tokens refer to other special tokens that
-   * immediately follow it (without an intervening regular token).  If there
-   * is no such token, this field is null.
-   */
-  public Token specialToken;
-
-  /**
-   * An optional attribute value of the Token.
-   * Tokens which are not used as syntactic sugar will often contain
-   * meaningful values that will be used later on by the compiler or
-   * interpreter. This attribute value is often different from the image.
-   * Any subclass of Token that actually wants to return a non-null value can
-   * override this method as appropriate.
-   */
-  public Object getValue() {
-    return null;
-  }
-
-  /**
-   * No-argument constructor
-   */
-  public Token() {}
-
-  /**
-   * Constructs a new token for the specified Image.
-   */
-  public Token(int kind)
-  {
-     this(kind, null);
-  }
-
-  /**
-   * Constructs a new token for the specified Image and Kind.
-   */
-  public Token(int kind, String image)
-  {
-     this.kind = kind;
-     this.image = image;
-  }
-
-  /**
-   * Returns the image.
-   */
-  public String toString()
-  {
-     return image;
-  }
-
-  /**
-   * Returns a new Token object, by default. However, if you want, you
-   * can create and return subclass objects based on the value of ofKind.
-   * Simply add the cases to the switch for all those special cases.
-   * For example, if you have a subclass of Token called IDToken that
-   * you want to create if ofKind is ID, simply add something like :
-   *
-   *    case MyParserConstants.ID : return new IDToken(ofKind, image);
-   *
-   * to the following switch statement. Then you can cast matchedToken
-   * variable to the appropriate type and use sit in your lexical actions.
-   */
-  public static Token newToken(int ofKind, String image)
-  {
-     switch(ofKind)
-     {
-       default : return new Token(ofKind, image);
-     }
-  }
-
-  public static Token newToken(int ofKind)
-  {
-     return newToken(ofKind, null);
-  }
-
-}
-/* JavaCC - OriginalChecksum=6925860b4b6a41d42c759eab47d0d3a3 (do not edit this line) */
Index: src/java/org/apache/nutch/analysis/NutchAnalysisTokenManager.java
===================================================================
--- src/java/org/apache/nutch/analysis/NutchAnalysisTokenManager.java	(revision 959954)
+++ src/java/org/apache/nutch/analysis/NutchAnalysisTokenManager.java	(working copy)
@@ -1,455 +0,0 @@
-/* Generated By:JavaCC: Do not edit this line. NutchAnalysisTokenManager.java */
-package org.apache.nutch.analysis;
-import java.io.*;
-
-/** Token Manager. */
-public class NutchAnalysisTokenManager implements NutchAnalysisConstants
-{
-  /** Constructs a token manager for the provided Reader. */
-  public NutchAnalysisTokenManager(Reader reader) {
-    this(new FastCharStream(reader));
-  }
-
-  /** Debug output. */
-  public  java.io.PrintStream debugStream = System.out;
-  /** Set debug output. */
-  public  void setDebugStream(java.io.PrintStream ds) { debugStream = ds; }
-private final int jjStopStringLiteralDfa_0(int pos, long active0)
-{
-   switch (pos)
-   {
-      default :
-         return -1;
-   }
-}
-private final int jjStartNfa_0(int pos, long active0)
-{
-   return jjMoveNfa_0(jjStopStringLiteralDfa_0(pos, active0), pos + 1);
-}
-private int jjStopAtPos(int pos, int kind)
-{
-   jjmatchedKind = kind;
-   jjmatchedPos = pos;
-   return pos + 1;
-}
-private int jjMoveStringLiteralDfa0_0()
-{
-   switch(curChar)
-   {
-      case 34:
-         return jjStopAtPos(0, 9);
-      case 39:
-         return jjStopAtPos(0, 14);
-      case 43:
-         return jjStopAtPos(0, 7);
-      case 45:
-         return jjStopAtPos(0, 8);
-      case 46:
-         return jjStopAtPos(0, 12);
-      case 47:
-         return jjStopAtPos(0, 11);
-      case 58:
-         return jjStopAtPos(0, 10);
-      case 64:
-         return jjStopAtPos(0, 13);
-      default :
-         return jjMoveNfa_0(1, 0);
-   }
-}
-static final long[] jjbitVec0 = {
-   0xfffffffeL, 0x0L, 0x0L, 0x0L
-};
-static final long[] jjbitVec2 = {
-   0x0L, 0x0L, 0x0L, 0xff7fffffff7fffffL
-};
-static final long[] jjbitVec3 = {
-   0x1ff0000000000000L, 0xffffffffffffc000L, 0xffffffffL, 0x600000000000000L
-};
-static final long[] jjbitVec4 = {
-   0x0L, 0xffffffffffffffffL, 0xffffffffffffffffL, 0xffffffffffffffffL
-};
-static final long[] jjbitVec5 = {
-   0xffffffffffffffffL, 0xffffffffffffffffL, 0xffffL, 0x0L
-};
-static final long[] jjbitVec6 = {
-   0xffffffffffffffffL, 0xffffffffffffffffL, 0x0L, 0x0L
-};
-static final long[] jjbitVec7 = {
-   0x3fffffffffffL, 0x0L, 0x0L, 0x0L
-};
-private int jjMoveNfa_0(int startState, int curPos)
-{
-   int startsAt = 0;
-   jjnewStateCnt = 10;
-   int i = 1;
-   jjstateSet[0] = startState;
-   int kind = 0x7fffffff;
-   for (;;)
-   {
-      if (++jjround == 0x7fffffff)
-         ReInitRounds();
-      if (curChar < 64)
-      {
-         long l = 1L << curChar;
-         do
-         {
-            switch(jjstateSet[--i])
-            {
-               case 1:
-               case 0:
-                  if ((0x3ff004000000000L & l) == 0L)
-                     break;
-                  kind = 1;
-                  jjCheckNAdd(0);
-                  break;
-               case 2:
-                  if (curChar == 46)
-                     jjCheckNAdd(3);
-                  break;
-               case 4:
-                  if (curChar != 46)
-                     break;
-                  if (kind > 2)
-                     kind = 2;
-                  jjCheckNAdd(3);
-                  break;
-               case 7:
-                  if (curChar == 35)
-                     kind = 1;
-                  break;
-               case 8:
-                  if (curChar == 43 && kind > 1)
-                     kind = 1;
-                  break;
-               case 9:
-                  if (curChar == 43)
-                     jjstateSet[jjnewStateCnt++] = 8;
-                  break;
-               default : break;
-            }
-         } while(i != startsAt);
-      }
-      else if (curChar < 128)
-      {
-         long l = 1L << (curChar & 077);
-         do
-         {
-            switch(jjstateSet[--i])
-            {
-               case 1:
-                  if ((0x7fffffe87fffffeL & l) != 0L)
-                  {
-                     if (kind > 1)
-                        kind = 1;
-                     jjCheckNAdd(0);
-                  }
-                  if ((0x7fffffe07fffffeL & l) != 0L)
-                     jjstateSet[jjnewStateCnt++] = 2;
-                  if ((0x800000008L & l) != 0L)
-                     jjAddStates(0, 1);
-                  break;
-               case 0:
-                  if ((0x7fffffe87fffffeL & l) == 0L)
-                     break;
-                  if (kind > 1)
-                     kind = 1;
-                  jjCheckNAdd(0);
-                  break;
-               case 3:
-                  if ((0x7fffffe07fffffeL & l) != 0L)
-                     jjstateSet[jjnewStateCnt++] = 4;
-                  break;
-               case 6:
-                  if ((0x800000008L & l) != 0L)
-                     jjAddStates(0, 1);
-                  break;
-               default : break;
-            }
-         } while(i != startsAt);
-      }
-      else
-      {
-         int hiByte = (int)(curChar >> 8);
-         int i1 = hiByte >> 6;
-         long l1 = 1L << (hiByte & 077);
-         int i2 = (curChar & 0xff) >> 6;
-         long l2 = 1L << (curChar & 077);
-         do
-         {
-            switch(jjstateSet[--i])
-            {
-               case 1:
-                  if (jjCanMove_0(hiByte, i1, i2, l1, l2))
-                  {
-                     if (kind > 1)
-                        kind = 1;
-                     jjCheckNAdd(0);
-                  }
-                  if (jjCanMove_0(hiByte, i1, i2, l1, l2))
-                     jjstateSet[jjnewStateCnt++] = 2;
-                  if (jjCanMove_1(hiByte, i1, i2, l1, l2))
-                  {
-                     if (kind > 3)
-                        kind = 3;
-                  }
-                  break;
-               case 0:
-                  if (!jjCanMove_0(hiByte, i1, i2, l1, l2))
-                     break;
-                  if (kind > 1)
-                     kind = 1;
-                  jjCheckNAdd(0);
-                  break;
-               case 3:
-                  if (jjCanMove_0(hiByte, i1, i2, l1, l2))
-                     jjstateSet[jjnewStateCnt++] = 4;
-                  break;
-               case 5:
-                  if (jjCanMove_1(hiByte, i1, i2, l1, l2) && kind > 3)
-                     kind = 3;
-                  break;
-               default : break;
-            }
-         } while(i != startsAt);
-      }
-      if (kind != 0x7fffffff)
-      {
-         jjmatchedKind = kind;
-         jjmatchedPos = curPos;
-         kind = 0x7fffffff;
-      }
-      ++curPos;
-      if ((i = jjnewStateCnt) == (startsAt = 10 - (jjnewStateCnt = startsAt)))
-         return curPos;
-      try { curChar = input_stream.readChar(); }
-      catch(java.io.IOException e) { return curPos; }
-   }
-}
-static final int[] jjnextStates = {
-   7, 9, 
-};
-private static final boolean jjCanMove_0(int hiByte, int i1, int i2, long l1, long l2)
-{
-   switch(hiByte)
-   {
-      case 0:
-         return ((jjbitVec2[i2] & l2) != 0L);
-      default : 
-         if ((jjbitVec0[i1] & l1) != 0L)
-            return true;
-         return false;
-   }
-}
-private static final boolean jjCanMove_1(int hiByte, int i1, int i2, long l1, long l2)
-{
-   switch(hiByte)
-   {
-      case 48:
-         return ((jjbitVec4[i2] & l2) != 0L);
-      case 49:
-         return ((jjbitVec5[i2] & l2) != 0L);
-      case 51:
-         return ((jjbitVec6[i2] & l2) != 0L);
-      case 61:
-         return ((jjbitVec7[i2] & l2) != 0L);
-      default : 
-         if ((jjbitVec3[i1] & l1) != 0L)
-            return true;
-         return false;
-   }
-}
-
-/** Token literal values. */
-public static final String[] jjstrLiteralImages = {
-"", null, null, null, null, null, null, "\53", "\55", "\42", "\72", "\57", 
-"\56", "\100", "\47", null, null, null, null, null, };
-
-/** Lexer state names. */
-public static final String[] lexStateNames = {
-   "DEFAULT", 
-};
-protected CharStream input_stream;
-private final int[] jjrounds = new int[10];
-private final int[] jjstateSet = new int[20];
-private final StringBuffer jjimage = new StringBuffer();
-private StringBuffer image = jjimage;
-private int jjimageLen;
-private int lengthOfMatch;
-protected char curChar;
-/** Constructor. */
-public NutchAnalysisTokenManager(CharStream stream){
-   input_stream = stream;
-}
-
-/** Constructor. */
-public NutchAnalysisTokenManager(CharStream stream, int lexState){
-   this(stream);
-   SwitchTo(lexState);
-}
-
-/** Reinitialise parser. */
-public void ReInit(CharStream stream)
-{
-   jjmatchedPos = jjnewStateCnt = 0;
-   curLexState = defaultLexState;
-   input_stream = stream;
-   ReInitRounds();
-}
-private void ReInitRounds()
-{
-   int i;
-   jjround = 0x80000001;
-   for (i = 10; i-- > 0;)
-      jjrounds[i] = 0x80000000;
-}
-
-/** Reinitialise parser. */
-public void ReInit(CharStream stream, int lexState)
-{
-   ReInit(stream);
-   SwitchTo(lexState);
-}
-
-/** Switch to specified lex state. */
-public void SwitchTo(int lexState)
-{
-   if (lexState >= 1 || lexState < 0)
-      throw new TokenMgrError("Error: Ignoring invalid lexical state : " + lexState + ". State unchanged.", TokenMgrError.INVALID_LEXICAL_STATE);
-   else
-      curLexState = lexState;
-}
-
-protected Token jjFillToken()
-{
-   final Token t;
-   final String curTokenImage;
-   final int beginLine;
-   final int endLine;
-   final int beginColumn;
-   final int endColumn;
-   String im = jjstrLiteralImages[jjmatchedKind];
-   curTokenImage = (im == null) ? input_stream.GetImage() : im;
-   beginLine = input_stream.getBeginLine();
-   beginColumn = input_stream.getBeginColumn();
-   endLine = input_stream.getEndLine();
-   endColumn = input_stream.getEndColumn();
-   t = Token.newToken(jjmatchedKind, curTokenImage);
-
-   t.beginLine = beginLine;
-   t.endLine = endLine;
-   t.beginColumn = beginColumn;
-   t.endColumn = endColumn;
-
-   return t;
-}
-
-int curLexState = 0;
-int defaultLexState = 0;
-int jjnewStateCnt;
-int jjround;
-int jjmatchedPos;
-int jjmatchedKind;
-
-/** Get the next Token. */
-public Token getNextToken() 
-{
-  Token matchedToken;
-  int curPos = 0;
-
-  EOFLoop :
-  for (;;)
-  {   
-   try   
-   {     
-      curChar = input_stream.BeginToken();
-   }     
-   catch(java.io.IOException e)
-   {        
-      jjmatchedKind = 0;
-      matchedToken = jjFillToken();
-      return matchedToken;
-   }
-   image = jjimage;
-   image.setLength(0);
-   jjimageLen = 0;
-
-   jjmatchedKind = 0x7fffffff;
-   jjmatchedPos = 0;
-   curPos = jjMoveStringLiteralDfa0_0();
-   if (jjmatchedPos == 0 && jjmatchedKind > 15)
-   {
-      jjmatchedKind = 15;
-   }
-   if (jjmatchedKind != 0x7fffffff)
-   {
-      if (jjmatchedPos + 1 < curPos)
-         input_stream.backup(curPos - jjmatchedPos - 1);
-         matchedToken = jjFillToken();
-         TokenLexicalActions(matchedToken);
-         return matchedToken;
-   }
-   int error_line = input_stream.getEndLine();
-   int error_column = input_stream.getEndColumn();
-   String error_after = null;
-   boolean EOFSeen = false;
-   try { input_stream.readChar(); input_stream.backup(1); }
-   catch (java.io.IOException e1) {
-      EOFSeen = true;
-      error_after = curPos <= 1 ? "" : input_stream.GetImage();
-      if (curChar == '\n' || curChar == '\r') {
-         error_line++;
-         error_column = 0;
-      }
-      else
-         error_column++;
-   }
-   if (!EOFSeen) {
-      input_stream.backup(1);
-      error_after = curPos <= 1 ? "" : input_stream.GetImage();
-   }
-   throw new TokenMgrError(EOFSeen, curLexState, error_line, error_column, error_after, curChar, TokenMgrError.LEXICAL_ERROR);
-  }
-}
-
-void TokenLexicalActions(Token matchedToken)
-{
-   switch(jjmatchedKind)
-   {
-      case 1 :
-        image.append(input_stream.GetSuffix(jjimageLen + (lengthOfMatch = jjmatchedPos + 1)));
-    matchedToken.image = matchedToken.image.toLowerCase();
-         break;
-      case 2 :
-        image.append(input_stream.GetSuffix(jjimageLen + (lengthOfMatch = jjmatchedPos + 1)));
-                                                  // remove dots
-      for (int i = 0; i < image.length(); i++) {
-        if (image.charAt(i) == '.')
-          image.deleteCharAt(i--);
-      }
-      matchedToken.image = image.toString().toLowerCase();
-         break;
-      default : 
-         break;
-   }
-}
-private void jjCheckNAdd(int state)
-{
-   if (jjrounds[state] != jjround)
-   {
-      jjstateSet[jjnewStateCnt++] = state;
-      jjrounds[state] = jjround;
-   }
-}
-private void jjAddStates(int start, int end)
-{
-   do {
-      jjstateSet[jjnewStateCnt++] = jjnextStates[start];
-   } while (start++ != end);
-}
-private void jjCheckNAddTwoStates(int state1, int state2)
-{
-   jjCheckNAdd(state1);
-   jjCheckNAdd(state2);
-}
-
-}
Index: src/java/org/apache/nutch/analysis/NutchAnalysisConstants.java
===================================================================
--- src/java/org/apache/nutch/analysis/NutchAnalysisConstants.java	(revision 959954)
+++ src/java/org/apache/nutch/analysis/NutchAnalysisConstants.java	(working copy)
@@ -1,79 +0,0 @@
-/* Generated By:JavaCC: Do not edit this line. NutchAnalysisConstants.java */
-package org.apache.nutch.analysis;
-
-
-/** 
- * Token literal values and constants.
- * Generated by org.javacc.parser.OtherFilesGen#start()
- */
-public interface NutchAnalysisConstants {
-
-  /** End of File. */
-  int EOF = 0;
-  /** RegularExpression Id. */
-  int WORD = 1;
-  /** RegularExpression Id. */
-  int ACRONYM = 2;
-  /** RegularExpression Id. */
-  int SIGRAM = 3;
-  /** RegularExpression Id. */
-  int IRREGULAR_WORD = 4;
-  /** RegularExpression Id. */
-  int C_PLUS_PLUS = 5;
-  /** RegularExpression Id. */
-  int C_SHARP = 6;
-  /** RegularExpression Id. */
-  int PLUS = 7;
-  /** RegularExpression Id. */
-  int MINUS = 8;
-  /** RegularExpression Id. */
-  int QUOTE = 9;
-  /** RegularExpression Id. */
-  int COLON = 10;
-  /** RegularExpression Id. */
-  int SLASH = 11;
-  /** RegularExpression Id. */
-  int DOT = 12;
-  /** RegularExpression Id. */
-  int ATSIGN = 13;
-  /** RegularExpression Id. */
-  int APOSTROPHE = 14;
-  /** RegularExpression Id. */
-  int WHITE = 15;
-  /** RegularExpression Id. */
-  int WORD_PUNCT = 16;
-  /** RegularExpression Id. */
-  int LETTER = 17;
-  /** RegularExpression Id. */
-  int CJK = 18;
-  /** RegularExpression Id. */
-  int DIGIT = 19;
-
-  /** Lexical state. */
-  int DEFAULT = 0;
-
-  /** Literal token values. */
-  String[] tokenImage = {
-    "<EOF>",
-    "<WORD>",
-    "<ACRONYM>",
-    "<SIGRAM>",
-    "<IRREGULAR_WORD>",
-    "<C_PLUS_PLUS>",
-    "<C_SHARP>",
-    "\"+\"",
-    "\"-\"",
-    "\"\\\"\"",
-    "\":\"",
-    "\"/\"",
-    "\".\"",
-    "\"@\"",
-    "\"\\\'\"",
-    "<WHITE>",
-    "<WORD_PUNCT>",
-    "<LETTER>",
-    "<CJK>",
-    "<DIGIT>",
-  };
-
-}
Index: src/java/org/apache/nutch/analysis/CommonGrams.java
===================================================================
--- src/java/org/apache/nutch/analysis/CommonGrams.java	(revision 959954)
+++ src/java/org/apache/nutch/analysis/CommonGrams.java	(working copy)
@@ -1,323 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.analysis;
-
-import java.io.*;
-import java.util.*;
-
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.lucene.analysis.*;
-import org.apache.lucene.analysis.Token;
-import org.apache.lucene.analysis.tokenattributes.*;
-import org.apache.nutch.searcher.Query.Phrase;
-import org.apache.nutch.searcher.Query.Term;
-import org.apache.nutch.util.NutchConfiguration;
-import org.apache.nutch.util.ObjectCache;
-
-/** Construct n-grams for frequently occurring terms and phrases while indexing.
- * Optimize phrase queries to use the n-grams. Single terms are still indexed
- * too, with n-grams overlaid.  This is achieved through the use of {@link
- * Token#setPositionIncrement(int)}.*/
-public class CommonGrams {
-  private static final Log LOG = LogFactory.getLog(CommonGrams.class);
-  private static final char SEPARATOR = '-';
-  /** The key used to cache commonTerms in Configuration */
-  private static final String KEY = CommonGrams.class.getName();
-
-  private HashMap<String, HashSet<String>> commonTerms =
-    new HashMap<String, HashSet<String>>();
-  
-  /**
-   * The constructor.
-   * @param conf
-   */
-  public CommonGrams(Configuration conf) {
-      init(conf);
-  }
-
-  private static class Filter extends TokenFilter {
-    private HashSet<String> common;
-    private Token previous;
-    private LinkedList<Token> gramQueue = new LinkedList<Token>();
-    private LinkedList<Token> nextQueue = new LinkedList<Token>();
-    private StringBuffer buffer = new StringBuffer();
-
-    private final TermAttribute termAtt;
-    private final PositionIncrementAttribute posIncrAtt;
-    private final TypeAttribute typeAtt;
-    private final OffsetAttribute offsetAtt;
-
-    /** Construct an n-gram producing filter. */
-    public Filter(TokenStream input, HashSet<String> common) {
-      super(input);
-      this.common = common;
-      this.termAtt = getAttribute(TermAttribute.class);
-      this.offsetAtt = getAttribute(OffsetAttribute.class);
-      this.posIncrAtt = getAttribute(PositionIncrementAttribute.class);
-      this.typeAtt = addAttribute(TypeAttribute.class);
-    }
-
-    @Override
-    public boolean incrementToken() throws IOException {
-      clearAttributes();
-      Token t = next();
-      if (t != null) {
-        termAtt.setTermBuffer(t.termBuffer(), 0, t.termLength());
-        offsetAtt.setOffset(t.startOffset(), t.endOffset());
-        posIncrAtt.setPositionIncrement(t.getPositionIncrement());
-        typeAtt.setType(t.type());
-      }     
-      return t != null;
-    }
-
-    private Token inputNext() throws IOException {
-      if (super.input.incrementToken()) {
-        Token t = new Token(
-            termAtt.termBuffer(), 0, termAtt.termLength(),
-            offsetAtt.startOffset(), offsetAtt.endOffset());
-        t.setPositionIncrement(posIncrAtt.getPositionIncrement());
-        t.setType(typeAtt.type());
-        return t;
-      }
-      return null;
-    }
-
-    /** Inserts n-grams into a token stream. */
-    public Token next() throws IOException {
-      if (gramQueue.size() != 0)                  // consume any queued tokens
-        return gramQueue.removeFirst();
-
-      final Token token = popNext();
-      if (token == null)
-        return null;
-
-      if (!isCommon(token)) {                     // optimize simple case
-        previous = token;
-        return token;
-      }
-
-      gramQueue.add(token);                       // queue the token
-
-      ListIterator<Token> i = nextQueue.listIterator();
-      Token gram = token;
-      while (isCommon(gram)) {
-        if (previous != null && !isCommon(previous)) // queue prev gram first
-          gramQueue.addFirst(gramToken(previous, gram));
-
-        Token next = peekNext(i);
-        if (next == null)
-          break;
-
-        gram = gramToken(gram, next);             // queue next gram last
-        gramQueue.addLast(gram);
-      }
-
-      previous = token;
-      return gramQueue.removeFirst();
-    }
-
-    /** True iff token is for a common term. */
-    private boolean isCommon(Token token) {
-      return common != null && common.contains(token.term());
-    }
-
-    /** Pops nextQueue or, if empty, reads a new token. */
-    private Token popNext() throws IOException {
-      if (nextQueue.size() > 0)
-        return nextQueue.removeFirst();
-      else
-        return inputNext();
-    }
-
-    /** Return next token in nextQueue, extending it when empty. */
-    private Token peekNext(ListIterator<Token> i) throws IOException {
-      if (!i.hasNext()) {
-        Token next = inputNext();
-        if (next == null)
-          return null;
-        i.add(next);
-        i.previous();
-      }
-      return i.next();
-    }
-
-    /** Construct a compound token. */
-    private Token gramToken(Token first, Token second) {
-      buffer.setLength(0);
-      buffer.append(first.term());
-      buffer.append(SEPARATOR);
-      buffer.append(second.term());
-      Token result = new Token(buffer.toString(),
-                               first.startOffset(), second.endOffset(),
-                               "gram");
-      result.setPositionIncrement(0);
-      return result;
-    }
-  }
-
-  /** Construct using the provided config file. */
-  private void init(Configuration conf) {
-    ObjectCache objectCache = ObjectCache.get(conf);
-    // First, try to retrieve some commonTerms cached in configuration.
-    commonTerms = (HashMap<String, HashSet<String>>) objectCache.getObject(KEY);
-    if (commonTerms != null) { return; }
-
-    // Otherwise, read the terms.file
-    try {
-      commonTerms = new HashMap<String, HashSet<String>>();
-      Reader reader = conf.getConfResourceAsReader
-        (conf.get("analysis.common.terms.file"));
-      BufferedReader in = new BufferedReader(reader);
-      String line;
-      while ((line = in.readLine()) != null) {
-        line = line.trim();
-        if (line.startsWith("#") || "".equals(line)) // skip comments
-          continue;
-        TokenStream ts = new NutchDocumentTokenizer(new StringReader(line));
-        TermAttribute ta = ts.getAttribute(TermAttribute.class);
-        if (!ts.incrementToken()) {
-          if (LOG.isWarnEnabled()) {
-            LOG.warn("Line does not contain a field name: " + line);
-          }
-          continue;
-        }
-        String field = ta.term();
-        if (!ts.incrementToken()) {
-          if (LOG.isWarnEnabled()) {
-            LOG.warn("Line contains only a field name, no word: " + line);
-          }
-          continue;
-        }
-        String gram = ta.term();
-        while (ts.incrementToken()) {
-          gram = gram + SEPARATOR + ta.term();
-        }
-        HashSet<String> table = commonTerms.get(field);
-        if (table == null) {
-          table = new HashSet<String>();
-          commonTerms.put(field, table);
-        }
-        table.add(gram);
-      }
-      objectCache.setObject(KEY, commonTerms);
-    } catch (IOException e) {
-      throw new RuntimeException(e.toString());
-    }
-  }
-
-  /** Construct a token filter that inserts n-grams for common terms.  For use
-   * while indexing documents.  */
-  public TokenFilter getFilter(TokenStream ts, String field) {
-    return new Filter(ts, commonTerms.get(field));
-  }
-
-  /** Utility to convert an array of Query.Terms into a token stream. */
-  private static class ArrayTokens extends TokenStream {
-    private Term[] terms;
-    private int index;
-    private final TermAttribute termAttr;
-    private final PositionIncrementAttribute posAttr;
-    private final OffsetAttribute offsetAttr;
-
-    public ArrayTokens(Phrase phrase) {
-      this.terms = phrase.getTerms();
-      this.termAttr = addAttribute(TermAttribute.class);
-      this.posAttr = addAttribute(PositionIncrementAttribute.class);
-      this.offsetAttr = addAttribute(OffsetAttribute.class);
-    }
-
-    @Override
-    public boolean incrementToken() throws IOException {
-      if (index == terms.length)
-        return false;
-
-      clearAttributes();
-      termAttr.setTermBuffer(terms[index].toString());
-      posAttr.setPositionIncrement(1);
-      offsetAttr.setOffset(index, ++index);
-      return true;
-    }
-  }
-
-  /** Optimizes phrase queries to use n-grams when possible. */
-  public String[] optimizePhrase(Phrase phrase, String field) {
-    
-    if (LOG.isTraceEnabled()) {
-      LOG.trace("Optimizing " + phrase + " for " + field);
-    }
-    ArrayList<String> result = new ArrayList<String>();
-    TokenStream ts = getFilter(new ArrayTokens(phrase), field);
-    String prev = null;
-    TermAttribute ta = ts.getAttribute(TermAttribute.class);
-    PositionIncrementAttribute pa = ts.getAttribute(PositionIncrementAttribute.class);
-    int position = 0;
-    try {
-      while (ts.incrementToken()) {
-        if (pa.getPositionIncrement() != 0 && prev != null)
-          result.add(prev);
-        prev = ta.term();
-        position += pa.getPositionIncrement();
-        if ((position + arity(ta.term())) == phrase.getTerms().length)
-          break;
-      }
-    } catch (IOException e) {
-      throw new RuntimeException(e.toString());
-    }
-    if (prev != null)
-      result.add(prev);
-
-    return result.toArray(new String[result.size()]);
-  }
-
-  private int arity(String gram) {
-    int index = 0;
-    int arity = 0;
-    while ((index = gram.indexOf(SEPARATOR, index+1)) != -1) {
-      arity++;
-    }
-    return arity;
-  }
-
-  /** For debugging. */
-  public static void main(String[] args) throws Exception {
-    StringBuffer text = new StringBuffer();
-    for (int i = 0; i < args.length; i++) {
-      text.append(args[i]);
-      text.append(' ');
-    }
-    TokenStream ts = new NutchDocumentTokenizer(new StringReader(text.toString()));
-    CommonGrams commonGrams = new CommonGrams(NutchConfiguration.create());
-    ts = commonGrams.getFilter(ts, "url");
-    TermAttribute ta = ts.getAttribute(TermAttribute.class);
-    OffsetAttribute oa = ts.getAttribute(OffsetAttribute.class);
-    PositionIncrementAttribute pia = ts.getAttribute(PositionIncrementAttribute.class);
-    while (ts.incrementToken()) {
-      System.out.println("Token: " + ta.term() + " offs:" + oa.startOffset() + "-" + oa.endOffset()
-          + " incr: " + pia.getPositionIncrement());
-    }
-    String[] optimized = commonGrams.optimizePhrase(new Phrase(args), "url");
-    System.out.print("Optimized: ");
-    for (int i = 0; i < optimized.length; i++) {
-      System.out.print(optimized[i] + " ");
-    }
-    System.out.println();
-  }
-  
-}
Index: src/java/org/apache/nutch/analysis/NutchAnalysis.java
===================================================================
--- src/java/org/apache/nutch/analysis/NutchAnalysis.java	(revision 959954)
+++ src/java/org/apache/nutch/analysis/NutchAnalysis.java	(working copy)
@@ -1,957 +0,0 @@
-/* Generated By:JavaCC: Do not edit this line. NutchAnalysis.java */
-package org.apache.nutch.analysis;
-
-import java.io.StringReader;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.searcher.Query;
-import org.apache.nutch.searcher.QueryFilters;
-import org.apache.nutch.searcher.Query.Clause;
-import org.apache.nutch.util.NutchConfiguration;
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.StopFilter;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
-
-import java.io.*;
-import java.util.*;
-
-/** The JavaCC-generated Nutch lexical analyzer and query parser. */
-public class NutchAnalysis implements NutchAnalysisConstants {
-
-  private static final String[] STOP_WORDS = {
-    "a", "and", "are", "as", "at", "be", "but", "by",
-    "for", "if", "in", "into", "is", "it",
-    "no", "not", "of", "on", "or", "s", "such",
-    "t", "that", "the", "their", "then", "there", "these",
-    "they", "this", "to", "was", "will", "with"
-  };
-
-  private static final Set STOP_SET = StopFilter.makeStopSet(STOP_WORDS);
-
-  private Analyzer analyzer = null;
-  private String queryString;
-  private QueryFilters queryFilters;
-
-
-  /** Constructs a nutch analysis. */
-  public NutchAnalysis(String query, Analyzer analyzer) {
-    this(new FastCharStream(new StringReader(query)));
-    this.analyzer = analyzer;
-  }
-
-  /** True iff word is a stop word.  Stop words are only removed from queries.
-   * Every word is indexed.  */
-  public static boolean isStopWord(String word) {
-    return STOP_SET.contains(word);
-  }
-
-  /** Construct a query parser for the text in a reader. */
-  public static Query parseQuery(String queryString, Configuration conf) throws IOException {
-    return parseQuery(queryString, null, conf);
-  }
-
-  /** Construct a query parser for the text in a reader. */
-  public static Query parseQuery(String queryString, Analyzer analyzer, Configuration conf)
-    throws IOException {
-    NutchAnalysis parser = new NutchAnalysis(
-          queryString, (analyzer != null) ? analyzer : new NutchDocumentAnalyzer(conf));
-    parser.queryString = queryString;
-    parser.queryFilters = new QueryFilters(conf);
-    return parser.parse(conf);
-  }
-
-  /** For debugging. */
-  public static void main(String[] args) throws Exception {
-    BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
-    while (true) {
-      System.out.print("Query: ");
-      String line = in.readLine();
-      System.out.println(parseQuery(line, NutchConfiguration.create()));
-    }
-  }
-
-/** Parse a query. */
-  final public Query parse(Configuration conf) throws ParseException {
-  Query query = new Query(conf);
-  ArrayList terms;
-  Token token;
-  String field;
-  boolean stop;
-  boolean prohibited;
-    nonOpOrTerm();
-    label_1:
-    while (true) {
-      switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
-      case WORD:
-      case ACRONYM:
-      case SIGRAM:
-      case PLUS:
-      case MINUS:
-      case QUOTE:
-        ;
-        break;
-      default:
-        jj_la1[0] = jj_gen;
-        break label_1;
-      }
-      stop=true; prohibited=false; field = Clause.DEFAULT_FIELD;
-      switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
-      case PLUS:
-      case MINUS:
-        switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
-        case PLUS:
-          jj_consume_token(PLUS);
-              stop=false;
-          break;
-        case MINUS:
-          jj_consume_token(MINUS);
-                                        stop=false;prohibited=true;
-          break;
-        default:
-          jj_la1[1] = jj_gen;
-          jj_consume_token(-1);
-          throw new ParseException();
-        }
-        break;
-      default:
-        jj_la1[2] = jj_gen;
-        ;
-      }
-      if (jj_2_1(2147483647)) {
-        token = jj_consume_token(WORD);
-        jj_consume_token(COLON);
-                             field = token.image;
-      } else {
-        ;
-      }
-      switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
-      case QUOTE:
-        terms = phrase(field);
-                           stop=false;
-        break;
-      case WORD:
-      case ACRONYM:
-      case SIGRAM:
-        // quoted terms or
-              terms = compound(field);
-        break;
-      default:
-        jj_la1[3] = jj_gen;
-        jj_consume_token(-1);
-        throw new ParseException();
-      }
-      nonOpOrTerm();
-      String[] array = (String[])terms.toArray(new String[terms.size()]);
-
-      if (stop
-          && field == Clause.DEFAULT_FIELD
-          && terms.size()==1
-          && isStopWord(array[0])) {
-        // ignore stop words only when single, unadorned terms in default field
-      } else {
-        if (prohibited)
-          query.addProhibitedPhrase(array, field);
-        else
-          query.addRequiredPhrase(array, field);
-      }
-    }
-    {if (true) return query;}
-    throw new Error("Missing return statement in function");
-  }
-
-/** Parse an explcitly quoted phrase query.  Note that this may return a single
- * term, a trivial phrase.*/
-  final public ArrayList phrase(String field) throws ParseException {
-  int start;
-  int end;
-  ArrayList result = new ArrayList();
-  String term;
-    jj_consume_token(QUOTE);
-    start = token.endColumn;
-    label_2:
-    while (true) {
-      switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
-      case PLUS:
-      case MINUS:
-      case COLON:
-      case SLASH:
-      case DOT:
-      case ATSIGN:
-      case APOSTROPHE:
-      case WHITE:
-        ;
-        break;
-      default:
-        jj_la1[4] = jj_gen;
-        break label_2;
-      }
-      nonTerm();
-    }
-    label_3:
-    while (true) {
-      switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
-      case WORD:
-      case ACRONYM:
-      case SIGRAM:
-        ;
-        break;
-      default:
-        jj_la1[5] = jj_gen;
-        break label_3;
-      }
-      term = term();
-                    result.add(term);
-      label_4:
-      while (true) {
-        switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
-        case PLUS:
-        case MINUS:
-        case COLON:
-        case SLASH:
-        case DOT:
-        case ATSIGN:
-        case APOSTROPHE:
-        case WHITE:
-          ;
-          break;
-        default:
-          jj_la1[6] = jj_gen;
-          break label_4;
-        }
-        nonTerm();
-      }
-    }
-    end = token.endColumn;
-    switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
-    case QUOTE:
-      jj_consume_token(QUOTE);
-      break;
-    case 0:
-      jj_consume_token(0);
-      break;
-    default:
-      jj_la1[7] = jj_gen;
-      jj_consume_token(-1);
-      throw new ParseException();
-    }
-    if (this.queryFilters.isRawField(field)) {
-      result.clear();
-      result.add(queryString.substring(start, end));
-    }
-    {if (true) return result;}
-    throw new Error("Missing return statement in function");
-  }
-
-/** Parse a compound term that is interpreted as an implicit phrase query.
- * Compounds are a sequence of terms separated by infix characters.  Note that
- * this may return a single term, a trivial compound. */
-  final public ArrayList compound(String field) throws ParseException {
-  int start;
-  ArrayList result = new ArrayList();
-  String term;
-  StringBuffer terms = new StringBuffer();
-    start = token.endColumn;
-    term = term();
-    terms.append(term).append(" ");
-    //result.add(term);
-
-    label_5:
-    while (true) {
-      if (jj_2_2(2147483647)) {
-        ;
-      } else {
-        break label_5;
-      }
-      label_6:
-      while (true) {
-        infix();
-        switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
-        case PLUS:
-        case MINUS:
-        case COLON:
-        case SLASH:
-        case DOT:
-        case ATSIGN:
-        case APOSTROPHE:
-          ;
-          break;
-        default:
-          jj_la1[8] = jj_gen;
-          break label_6;
-        }
-      }
-      term = term();
-      terms.append(term).append(" ");
-      //result.add(term);
-
-    }
-    if (this.queryFilters.isRawField(field)) {
-//      result.clear();
-      result.add(queryString.substring(start, token.endColumn));
-
-    } else {
-      TokenStream tokens = analyzer.tokenStream(
-                              field, new StringReader(terms.toString()));
-
-      TermAttribute ta = tokens.getAttribute(TermAttribute.class);
-      try
-      {
-        String termText;
-        while (tokens.incrementToken())
-        {
-          if ((termText = ta.term()) == null)
-            break;
-          result.add(termText);
-        }
-      } catch (IOException e) {
-        // ignore (?)
-      }
-//
-      try {
-        tokens.close();
-      } catch (IOException e) {
-        // ignore
-      }
-    }
-    {if (true) return result;}
-    throw new Error("Missing return statement in function");
-  }
-
-/** Parse a single term. */
-  final public String term() throws ParseException {
-  Token token;
-    switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
-    case WORD:
-      token = jj_consume_token(WORD);
-      break;
-    case ACRONYM:
-      token = jj_consume_token(ACRONYM);
-      break;
-    case SIGRAM:
-      token = jj_consume_token(SIGRAM);
-      break;
-    default:
-      jj_la1[9] = jj_gen;
-      jj_consume_token(-1);
-      throw new ParseException();
-    }
-    {if (true) return token.image;}
-    throw new Error("Missing return statement in function");
-  }
-
-/** Parse anything but a term or a quote. */
-  final public void nonTerm() throws ParseException {
-    switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
-    case WHITE:
-      jj_consume_token(WHITE);
-      break;
-    case PLUS:
-    case MINUS:
-    case COLON:
-    case SLASH:
-    case DOT:
-    case ATSIGN:
-    case APOSTROPHE:
-      infix();
-      break;
-    default:
-      jj_la1[10] = jj_gen;
-      jj_consume_token(-1);
-      throw new ParseException();
-    }
-  }
-
-  final public void nonTermOrEOF() throws ParseException {
-    switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
-    case PLUS:
-    case MINUS:
-    case COLON:
-    case SLASH:
-    case DOT:
-    case ATSIGN:
-    case APOSTROPHE:
-    case WHITE:
-      nonTerm();
-      break;
-    case 0:
-      jj_consume_token(0);
-      break;
-    default:
-      jj_la1[11] = jj_gen;
-      jj_consume_token(-1);
-      throw new ParseException();
-    }
-  }
-
-/** Parse anything but a term or an operator (plur or minus or quote). */
-  final public void nonOpOrTerm() throws ParseException {
-    label_7:
-    while (true) {
-      if (jj_2_3(2)) {
-        ;
-      } else {
-        break label_7;
-      }
-      switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
-      case WHITE:
-        jj_consume_token(WHITE);
-        break;
-      case COLON:
-      case SLASH:
-      case DOT:
-      case ATSIGN:
-      case APOSTROPHE:
-        nonOpInfix();
-        break;
-      case PLUS:
-      case MINUS:
-        switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
-        case PLUS:
-          jj_consume_token(PLUS);
-          break;
-        case MINUS:
-          jj_consume_token(MINUS);
-          break;
-        default:
-          jj_la1[12] = jj_gen;
-          jj_consume_token(-1);
-          throw new ParseException();
-        }
-        nonTermOrEOF();
-        break;
-      default:
-        jj_la1[13] = jj_gen;
-        jj_consume_token(-1);
-        throw new ParseException();
-      }
-    }
-  }
-
-/** Characters which can be used to form compound terms. */
-  final public void infix() throws ParseException {
-    switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
-    case PLUS:
-      jj_consume_token(PLUS);
-      break;
-    case MINUS:
-      jj_consume_token(MINUS);
-      break;
-    case COLON:
-    case SLASH:
-    case DOT:
-    case ATSIGN:
-    case APOSTROPHE:
-      nonOpInfix();
-      break;
-    default:
-      jj_la1[14] = jj_gen;
-      jj_consume_token(-1);
-      throw new ParseException();
-    }
-  }
-
-/** Parse infix characters except plus and minus. */
-  final public void nonOpInfix() throws ParseException {
-    switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
-    case COLON:
-      jj_consume_token(COLON);
-      break;
-    case SLASH:
-      jj_consume_token(SLASH);
-      break;
-    case DOT:
-      jj_consume_token(DOT);
-      break;
-    case ATSIGN:
-      jj_consume_token(ATSIGN);
-      break;
-    case APOSTROPHE:
-      jj_consume_token(APOSTROPHE);
-      break;
-    default:
-      jj_la1[15] = jj_gen;
-      jj_consume_token(-1);
-      throw new ParseException();
-    }
-  }
-
-  private boolean jj_2_1(int xla) {
-    jj_la = xla; jj_lastpos = jj_scanpos = token;
-    try { return !jj_3_1(); }
-    catch(LookaheadSuccess ls) { return true; }
-    finally { jj_save(0, xla); }
-  }
-
-  private boolean jj_2_2(int xla) {
-    jj_la = xla; jj_lastpos = jj_scanpos = token;
-    try { return !jj_3_2(); }
-    catch(LookaheadSuccess ls) { return true; }
-    finally { jj_save(1, xla); }
-  }
-
-  private boolean jj_2_3(int xla) {
-    jj_la = xla; jj_lastpos = jj_scanpos = token;
-    try { return !jj_3_3(); }
-    catch(LookaheadSuccess ls) { return true; }
-    finally { jj_save(2, xla); }
-  }
-
-  private boolean jj_3_3() {
-    Token xsp;
-    xsp = jj_scanpos;
-    if (jj_scan_token(15)) {
-    jj_scanpos = xsp;
-    if (jj_3R_12()) {
-    jj_scanpos = xsp;
-    if (jj_3R_13()) return true;
-    }
-    }
-    return false;
-  }
-
-  private boolean jj_3R_27() {
-    if (jj_3R_16()) return true;
-    return false;
-  }
-
-  private boolean jj_3R_25() {
-    if (jj_3R_24()) return true;
-    return false;
-  }
-
-  private boolean jj_3R_23() {
-    if (jj_3R_24()) return true;
-    return false;
-  }
-
-  private boolean jj_3R_18() {
-    Token xsp;
-    xsp = jj_scanpos;
-    if (jj_3R_23()) {
-    jj_scanpos = xsp;
-    if (jj_scan_token(0)) return true;
-    }
-    return false;
-  }
-
-  private boolean jj_3R_13() {
-    Token xsp;
-    xsp = jj_scanpos;
-    if (jj_scan_token(7)) {
-    jj_scanpos = xsp;
-    if (jj_scan_token(8)) return true;
-    }
-    if (jj_3R_18()) return true;
-    return false;
-  }
-
-  private boolean jj_3R_20() {
-    if (jj_3R_11()) return true;
-    Token xsp;
-    while (true) {
-      xsp = jj_scanpos;
-      if (jj_3R_25()) { jj_scanpos = xsp; break; }
-    }
-    return false;
-  }
-
-  private boolean jj_3R_10() {
-    if (jj_3R_16()) return true;
-    return false;
-  }
-
-  private boolean jj_3R_19() {
-    if (jj_3R_24()) return true;
-    return false;
-  }
-
-  private boolean jj_3_2() {
-    Token xsp;
-    if (jj_3R_10()) return true;
-    while (true) {
-      xsp = jj_scanpos;
-      if (jj_3R_10()) { jj_scanpos = xsp; break; }
-    }
-    if (jj_3R_11()) return true;
-    return false;
-  }
-
-  private boolean jj_3R_9() {
-    if (jj_3R_15()) return true;
-    return false;
-  }
-
-  private boolean jj_3R_24() {
-    Token xsp;
-    xsp = jj_scanpos;
-    if (jj_scan_token(15)) {
-    jj_scanpos = xsp;
-    if (jj_3R_27()) return true;
-    }
-    return false;
-  }
-
-  private boolean jj_3R_14() {
-    if (jj_scan_token(QUOTE)) return true;
-    Token xsp;
-    while (true) {
-      xsp = jj_scanpos;
-      if (jj_3R_19()) { jj_scanpos = xsp; break; }
-    }
-    while (true) {
-      xsp = jj_scanpos;
-      if (jj_3R_20()) { jj_scanpos = xsp; break; }
-    }
-    xsp = jj_scanpos;
-    if (jj_scan_token(9)) {
-    jj_scanpos = xsp;
-    if (jj_scan_token(0)) return true;
-    }
-    return false;
-  }
-
-  private boolean jj_3R_26() {
-    if (jj_3R_16()) return true;
-    return false;
-  }
-
-  private boolean jj_3R_22() {
-    if (jj_3R_17()) return true;
-    return false;
-  }
-
-  private boolean jj_3R_21() {
-    Token xsp;
-    if (jj_3R_26()) return true;
-    while (true) {
-      xsp = jj_scanpos;
-      if (jj_3R_26()) { jj_scanpos = xsp; break; }
-    }
-    if (jj_3R_11()) return true;
-    return false;
-  }
-
-  private boolean jj_3R_12() {
-    if (jj_3R_17()) return true;
-    return false;
-  }
-
-  private boolean jj_3R_11() {
-    Token xsp;
-    xsp = jj_scanpos;
-    if (jj_scan_token(1)) {
-    jj_scanpos = xsp;
-    if (jj_scan_token(2)) {
-    jj_scanpos = xsp;
-    if (jj_scan_token(3)) return true;
-    }
-    }
-    return false;
-  }
-
-  private boolean jj_3R_8() {
-    if (jj_3R_14()) return true;
-    return false;
-  }
-
-  private boolean jj_3R_15() {
-    if (jj_3R_11()) return true;
-    Token xsp;
-    while (true) {
-      xsp = jj_scanpos;
-      if (jj_3R_21()) { jj_scanpos = xsp; break; }
-    }
-    return false;
-  }
-
-  private boolean jj_3R_17() {
-    Token xsp;
-    xsp = jj_scanpos;
-    if (jj_scan_token(10)) {
-    jj_scanpos = xsp;
-    if (jj_scan_token(11)) {
-    jj_scanpos = xsp;
-    if (jj_scan_token(12)) {
-    jj_scanpos = xsp;
-    if (jj_scan_token(13)) {
-    jj_scanpos = xsp;
-    if (jj_scan_token(14)) return true;
-    }
-    }
-    }
-    }
-    return false;
-  }
-
-  private boolean jj_3_1() {
-    if (jj_scan_token(WORD)) return true;
-    if (jj_scan_token(COLON)) return true;
-    Token xsp;
-    xsp = jj_scanpos;
-    if (jj_3R_8()) {
-    jj_scanpos = xsp;
-    if (jj_3R_9()) return true;
-    }
-    return false;
-  }
-
-  private boolean jj_3R_16() {
-    Token xsp;
-    xsp = jj_scanpos;
-    if (jj_scan_token(7)) {
-    jj_scanpos = xsp;
-    if (jj_scan_token(8)) {
-    jj_scanpos = xsp;
-    if (jj_3R_22()) return true;
-    }
-    }
-    return false;
-  }
-
-  /** Generated Token Manager. */
-  public NutchAnalysisTokenManager token_source;
-  /** Current token. */
-  public Token token;
-  /** Next token. */
-  public Token jj_nt;
-  private int jj_ntk;
-  private Token jj_scanpos, jj_lastpos;
-  private int jj_la;
-  private int jj_gen;
-  final private int[] jj_la1 = new int[16];
-  static private int[] jj_la1_0;
-  static {
-      jj_la1_init_0();
-   }
-   private static void jj_la1_init_0() {
-      jj_la1_0 = new int[] {0x38e,0x180,0x180,0x20e,0xfd80,0xe,0xfd80,0x201,0x7d80,0xe,0xfd80,0xfd81,0x180,0xfd80,0x7d80,0x7c00,};
-   }
-  final private JJCalls[] jj_2_rtns = new JJCalls[3];
-  private boolean jj_rescan = false;
-  private int jj_gc = 0;
-
-  /** Constructor with user supplied CharStream. */
-  public NutchAnalysis(CharStream stream) {
-    token_source = new NutchAnalysisTokenManager(stream);
-    token = new Token();
-    jj_ntk = -1;
-    jj_gen = 0;
-    for (int i = 0; i < 16; i++) jj_la1[i] = -1;
-    for (int i = 0; i < jj_2_rtns.length; i++) jj_2_rtns[i] = new JJCalls();
-  }
-
-  /** Reinitialise. */
-  public void ReInit(CharStream stream) {
-    token_source.ReInit(stream);
-    token = new Token();
-    jj_ntk = -1;
-    jj_gen = 0;
-    for (int i = 0; i < 16; i++) jj_la1[i] = -1;
-    for (int i = 0; i < jj_2_rtns.length; i++) jj_2_rtns[i] = new JJCalls();
-  }
-
-  /** Constructor with generated Token Manager. */
-  public NutchAnalysis(NutchAnalysisTokenManager tm) {
-    token_source = tm;
-    token = new Token();
-    jj_ntk = -1;
-    jj_gen = 0;
-    for (int i = 0; i < 16; i++) jj_la1[i] = -1;
-    for (int i = 0; i < jj_2_rtns.length; i++) jj_2_rtns[i] = new JJCalls();
-  }
-
-  /** Reinitialise. */
-  public void ReInit(NutchAnalysisTokenManager tm) {
-    token_source = tm;
-    token = new Token();
-    jj_ntk = -1;
-    jj_gen = 0;
-    for (int i = 0; i < 16; i++) jj_la1[i] = -1;
-    for (int i = 0; i < jj_2_rtns.length; i++) jj_2_rtns[i] = new JJCalls();
-  }
-
-  private Token jj_consume_token(int kind) throws ParseException {
-    Token oldToken;
-    if ((oldToken = token).next != null) token = token.next;
-    else token = token.next = token_source.getNextToken();
-    jj_ntk = -1;
-    if (token.kind == kind) {
-      jj_gen++;
-      if (++jj_gc > 100) {
-        jj_gc = 0;
-        for (int i = 0; i < jj_2_rtns.length; i++) {
-          JJCalls c = jj_2_rtns[i];
-          while (c != null) {
-            if (c.gen < jj_gen) c.first = null;
-            c = c.next;
-          }
-        }
-      }
-      return token;
-    }
-    token = oldToken;
-    jj_kind = kind;
-    throw generateParseException();
-  }
-
-  static private final class LookaheadSuccess extends java.lang.Error { }
-  final private LookaheadSuccess jj_ls = new LookaheadSuccess();
-  private boolean jj_scan_token(int kind) {
-    if (jj_scanpos == jj_lastpos) {
-      jj_la--;
-      if (jj_scanpos.next == null) {
-        jj_lastpos = jj_scanpos = jj_scanpos.next = token_source.getNextToken();
-      } else {
-        jj_lastpos = jj_scanpos = jj_scanpos.next;
-      }
-    } else {
-      jj_scanpos = jj_scanpos.next;
-    }
-    if (jj_rescan) {
-      int i = 0; Token tok = token;
-      while (tok != null && tok != jj_scanpos) { i++; tok = tok.next; }
-      if (tok != null) jj_add_error_token(kind, i);
-    }
-    if (jj_scanpos.kind != kind) return true;
-    if (jj_la == 0 && jj_scanpos == jj_lastpos) throw jj_ls;
-    return false;
-  }
-
-
-/** Get the next Token. */
-  final public Token getNextToken() {
-    if (token.next != null) token = token.next;
-    else token = token.next = token_source.getNextToken();
-    jj_ntk = -1;
-    jj_gen++;
-    return token;
-  }
-
-/** Get the specific Token. */
-  final public Token getToken(int index) {
-    Token t = token;
-    for (int i = 0; i < index; i++) {
-      if (t.next != null) t = t.next;
-      else t = t.next = token_source.getNextToken();
-    }
-    return t;
-  }
-
-  private int jj_ntk() {
-    if ((jj_nt=token.next) == null)
-      return (jj_ntk = (token.next=token_source.getNextToken()).kind);
-    else
-      return (jj_ntk = jj_nt.kind);
-  }
-
-  private java.util.List jj_expentries = new java.util.ArrayList();
-  private int[] jj_expentry;
-  private int jj_kind = -1;
-  private int[] jj_lasttokens = new int[100];
-  private int jj_endpos;
-
-  private void jj_add_error_token(int kind, int pos) {
-    if (pos >= 100) return;
-    if (pos == jj_endpos + 1) {
-      jj_lasttokens[jj_endpos++] = kind;
-    } else if (jj_endpos != 0) {
-      jj_expentry = new int[jj_endpos];
-      for (int i = 0; i < jj_endpos; i++) {
-        jj_expentry[i] = jj_lasttokens[i];
-      }
-      jj_entries_loop: for (java.util.Iterator it = jj_expentries.iterator(); it.hasNext();) {
-        int[] oldentry = (int[])(it.next());
-        if (oldentry.length == jj_expentry.length) {
-          for (int i = 0; i < jj_expentry.length; i++) {
-            if (oldentry[i] != jj_expentry[i]) {
-              continue jj_entries_loop;
-            }
-          }
-          jj_expentries.add(jj_expentry);
-          break jj_entries_loop;
-        }
-      }
-      if (pos != 0) jj_lasttokens[(jj_endpos = pos) - 1] = kind;
-    }
-  }
-
-  /** Generate ParseException. */
-  public ParseException generateParseException() {
-    jj_expentries.clear();
-    boolean[] la1tokens = new boolean[20];
-    if (jj_kind >= 0) {
-      la1tokens[jj_kind] = true;
-      jj_kind = -1;
-    }
-    for (int i = 0; i < 16; i++) {
-      if (jj_la1[i] == jj_gen) {
-        for (int j = 0; j < 32; j++) {
-          if ((jj_la1_0[i] & (1<<j)) != 0) {
-            la1tokens[j] = true;
-          }
-        }
-      }
-    }
-    for (int i = 0; i < 20; i++) {
-      if (la1tokens[i]) {
-        jj_expentry = new int[1];
-        jj_expentry[0] = i;
-        jj_expentries.add(jj_expentry);
-      }
-    }
-    jj_endpos = 0;
-    jj_rescan_token();
-    jj_add_error_token(0, 0);
-    int[][] exptokseq = new int[jj_expentries.size()][];
-    for (int i = 0; i < jj_expentries.size(); i++) {
-      exptokseq[i] = (int[])jj_expentries.get(i);
-    }
-    return new ParseException(token, exptokseq, tokenImage);
-  }
-
-  /** Enable tracing. */
-  final public void enable_tracing() {
-  }
-
-  /** Disable tracing. */
-  final public void disable_tracing() {
-  }
-
-  private void jj_rescan_token() {
-    jj_rescan = true;
-    for (int i = 0; i < 3; i++) {
-    try {
-      JJCalls p = jj_2_rtns[i];
-      do {
-        if (p.gen > jj_gen) {
-          jj_la = p.arg; jj_lastpos = jj_scanpos = p.first;
-          switch (i) {
-            case 0: jj_3_1(); break;
-            case 1: jj_3_2(); break;
-            case 2: jj_3_3(); break;
-          }
-        }
-        p = p.next;
-      } while (p != null);
-      } catch(LookaheadSuccess ls) { }
-    }
-    jj_rescan = false;
-  }
-
-  private void jj_save(int index, int xla) {
-    JJCalls p = jj_2_rtns[index];
-    while (p.gen > jj_gen) {
-      if (p.next == null) { p = p.next = new JJCalls(); break; }
-      p = p.next;
-    }
-    p.gen = jj_gen + xla - jj_la; p.first = token; p.arg = xla;
-  }
-
-  static final class JJCalls {
-    int gen;
-    Token first;
-    int arg;
-    JJCalls next;
-  }
-
-}
Index: src/java/org/apache/nutch/analysis/NutchDocumentAnalyzer.java
===================================================================
--- src/java/org/apache/nutch/analysis/NutchDocumentAnalyzer.java	(revision 959954)
+++ src/java/org/apache/nutch/analysis/NutchDocumentAnalyzer.java	(working copy)
@@ -1,111 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.analysis;
-
-// JDK imports
-import java.io.Reader;
-import java.io.IOException;
-
-// Lucene imports
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.TokenFilter;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.hadoop.conf.Configuration;
-
-/**
- * The analyzer used for Nutch documents. Uses the JavaCC-defined lexical
- * analyzer {@link NutchDocumentTokenizer}, with no stop list. This keeps it
- * consistent with query parsing.
- */
-public class NutchDocumentAnalyzer extends NutchAnalyzer {
-
-  /** Analyzer used to index textual content. */
-  private static Analyzer CONTENT_ANALYZER;
-  // Anchor Analysis
-  // Like content analysis, but leave gap between anchors to inhibit
-  // cross-anchor phrase matching.
-  /**
-   * The number of unused term positions between anchors in the anchor field.
-   */
-  public static final int INTER_ANCHOR_GAP = 4;
-  /** Analyzer used to analyze anchors. */
-  private static Analyzer ANCHOR_ANALYZER;
-
-  /**
-   * @param conf
-   */
-  public NutchDocumentAnalyzer(Configuration conf) {
-    this.conf = conf;
-    CONTENT_ANALYZER = new ContentAnalyzer(conf);
-    ANCHOR_ANALYZER = new AnchorAnalyzer();
-  }
-
-  /** Analyzer used to index textual content. */
-  private static class ContentAnalyzer extends Analyzer {
-    private CommonGrams commonGrams;
-
-    public ContentAnalyzer(Configuration conf) {
-      this.commonGrams = new CommonGrams(conf);
-    }
-
-    /** Constructs a {@link NutchDocumentTokenizer}. */
-    public TokenStream tokenStream(String field, Reader reader) {
-      return this.commonGrams.getFilter(new NutchDocumentTokenizer(reader),
-          field);
-    }
-  }
-
-  private static class AnchorFilter extends TokenFilter {
-    private final PositionIncrementAttribute posAttr;
-    private boolean first = true;
-
-    public AnchorFilter(TokenStream input) {
-      super(input);
-      // The super filter must have positional information.
-      posAttr = input.getAttribute(PositionIncrementAttribute.class);
-    }
-
-    public boolean incrementToken() throws IOException {
-      boolean hasNext = input.incrementToken(); 
-      if (hasNext) {
-        if (first) {
-          posAttr.setPositionIncrement(INTER_ANCHOR_GAP);
-          first = false;
-        }
-      }
-      return false;
-    }
-  }
-
-  private static class AnchorAnalyzer extends Analyzer {
-    public final TokenStream tokenStream(String fieldName, Reader reader) {
-      return new AnchorFilter(CONTENT_ANALYZER.tokenStream(fieldName, reader));
-    }
-  }
-
-  /** Returns a new token stream for text from the named field. */
-  public TokenStream tokenStream(String fieldName, Reader reader) {
-    Analyzer analyzer;
-    if ("anchor".equals(fieldName))
-      analyzer = ANCHOR_ANALYZER;
-    else
-      analyzer = CONTENT_ANALYZER;
-
-    return analyzer.tokenStream(fieldName, reader);
-  }
-}
Index: src/java/org/apache/nutch/analysis/NutchAnalyzer.java
===================================================================
--- src/java/org/apache/nutch/analysis/NutchAnalyzer.java	(revision 959954)
+++ src/java/org/apache/nutch/analysis/NutchAnalyzer.java	(working copy)
@@ -1,75 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.analysis;
-
-// JDK imports
-import java.io.Reader;
-
-// Lucene imports
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.TokenStream;
-
-// Hadoop imports
-import org.apache.hadoop.conf.Configurable;
-import org.apache.hadoop.conf.Configuration;
-
-// Nutch imports
-import org.apache.nutch.plugin.Pluggable;
-
-
-/** 
- * Extension point for analysis.
- * All plugins found which implement this extension point are run
- * sequentially on the parse.
- *
- * @author J&eacute;r&ocirc;me Charron
- */
-public abstract class NutchAnalyzer extends Analyzer
-                                    implements Configurable, Pluggable {
-
-  /** The name of the extension point. */
-  final static String X_POINT_ID = NutchAnalyzer.class.getName();
-
-  /** The current Configuration */
-  protected Configuration conf = null;
-
-  
-  /**
-   * Creates a TokenStream which tokenizes all the text in the provided Reader.
-   */
-  public abstract TokenStream tokenStream(String fieldName, Reader reader);
-
-
-  /* ----------------------------- *
-   * <implementation:Configurable> *
-   * ----------------------------- */
-
-  // Inherited Javadoc
-  public void setConf(Configuration conf) {
-    this.conf = conf;
-  }
-
-  // Inherited Javadoc
-  public Configuration getConf() {
-    return this.conf;
-  }
-
-  /* ------------------------------ *
-   * </implementation:Configurable> *
-   * ------------------------------ */
-
-}
Index: src/java/org/apache/nutch/analysis/CharStream.java
===================================================================
--- src/java/org/apache/nutch/analysis/CharStream.java	(revision 959954)
+++ src/java/org/apache/nutch/analysis/CharStream.java	(working copy)
@@ -1,110 +0,0 @@
-/* Generated By:JavaCC: Do not edit this line. CharStream.java Version 3.0 */
-package org.apache.nutch.analysis;
-
-/**
- * This interface describes a character stream that maintains line and
- * column number positions of the characters.  It also has the capability
- * to backup the stream to some extent.  An implementation of this
- * interface is used in the TokenManager implementation generated by
- * JavaCCParser.
- *
- * All the methods except backup can be implemented in any fashion. backup
- * needs to be implemented correctly for the correct operation of the lexer.
- * Rest of the methods are all used to get information like line number,
- * column number and the String that constitutes a token and are not used
- * by the lexer. Hence their implementation won't affect the generated lexer's
- * operation.
- */
-
-interface CharStream {
-
-  /**
-   * Returns the next character from the selected input.  The method
-   * of selecting the input is the responsibility of the class
-   * implementing this interface.  Can throw any java.io.IOException.
-   */
-  char readChar() throws java.io.IOException;
-
-  /**
-   * Returns the column position of the character last read.
-   * @deprecated 
-   * @see #getEndColumn
-   */
-  int getColumn();
-
-  /**
-   * Returns the line number of the character last read.
-   * @deprecated 
-   * @see #getEndLine
-   */
-  int getLine();
-
-  /**
-   * Returns the column number of the last character for current token (being
-   * matched after the last call to BeginTOken).
-   */
-  int getEndColumn();
-
-  /**
-   * Returns the line number of the last character for current token (being
-   * matched after the last call to BeginTOken).
-   */
-  int getEndLine();
-
-  /**
-   * Returns the column number of the first character for current token (being
-   * matched after the last call to BeginTOken).
-   */
-  int getBeginColumn();
-
-  /**
-   * Returns the line number of the first character for current token (being
-   * matched after the last call to BeginTOken).
-   */
-  int getBeginLine();
-
-  /**
-   * Backs up the input stream by amount steps. Lexer calls this method if it
-   * had already read some characters, but could not use them to match a
-   * (longer) token. So, they will be used again as the prefix of the next
-   * token and it is the implemetation's responsibility to do this right.
-   */
-  void backup(int amount);
-
-  /**
-   * Returns the next character that marks the beginning of the next token.
-   * All characters must remain in the buffer between two successive calls
-   * to this method to implement backup correctly.
-   */
-  char BeginToken() throws java.io.IOException;
-
-  /**
-   * Returns a string made up of characters from the marked token beginning 
-   * to the current buffer position. Implementations have the choice of returning
-   * anything that they want to. For example, for efficiency, one might decide
-   * to just return null, which is a valid implementation.
-   */
-  String GetImage();
-
-  /**
-   * Returns an array of characters that make up the suffix of length 'len' for
-   * the currently matched token. This is used to build up the matched string
-   * for use in actions in the case of MORE. A simple and inefficient
-   * implementation of this is as follows :
-   *
-   *   {
-   *      String t = GetImage();
-   *      return t.substring(t.length() - len, t.length()).toCharArray();
-   *   }
-   */
-  char[] GetSuffix(int len);
-
-  /**
-   * The lexer calls this function to indicate that it is done with the stream
-   * and hence implementations can free any resources held by this class.
-   * Again, the body of this function can be just empty and it will not
-   * affect the lexer's operation.
-   */
-  void Done();
-
-}
Index: src/java/org/apache/nutch/analysis/ParseException.java
===================================================================
--- src/java/org/apache/nutch/analysis/ParseException.java	(revision 959954)
+++ src/java/org/apache/nutch/analysis/ParseException.java	(working copy)
@@ -1,197 +0,0 @@
-/* Generated By:JavaCC: Do not edit this line. ParseException.java Version 4.1 */
-package org.apache.nutch.analysis;
-
-/**
- * This exception is thrown when parse errors are encountered.
- * You can explicitly create objects of this exception type by
- * calling the method generateParseException in the generated
- * parser.
- *
- * You can modify this class to customize your error reporting
- * mechanisms so long as you retain the public fields.
- */
-@SuppressWarnings("serial")
-public class ParseException extends java.io.IOException {
-
-  /**
-   * This constructor is used by the method "generateParseException"
-   * in the generated parser.  Calling this constructor generates
-   * a new object of this type with the fields "currentToken",
-   * "expectedTokenSequences", and "tokenImage" set.  The boolean
-   * flag "specialConstructor" is also set to true to indicate that
-   * this constructor was used to create this object.
-   * This constructor calls its super class with the empty string
-   * to force the "toString" method of parent class "Throwable" to
-   * print the error message in the form:
-   *     ParseException: <result of getMessage>
-   */
-  public ParseException(Token currentTokenVal,
-                        int[][] expectedTokenSequencesVal,
-                        String[] tokenImageVal
-                       )
-  {
-    super("");
-    specialConstructor = true;
-    currentToken = currentTokenVal;
-    expectedTokenSequences = expectedTokenSequencesVal;
-    tokenImage = tokenImageVal;
-  }
-
-  /**
-   * The following constructors are for use by you for whatever
-   * purpose you can think of.  Constructing the exception in this
-   * manner makes the exception behave in the normal way - i.e., as
-   * documented in the class "Throwable".  The fields "errorToken",
-   * "expectedTokenSequences", and "tokenImage" do not contain
-   * relevant information.  The JavaCC generated code does not use
-   * these constructors.
-   */
-
-  public ParseException() {
-    super();
-    specialConstructor = false;
-  }
-
-  /** Constructor with message. */
-  public ParseException(String message) {
-    super(message);
-    specialConstructor = false;
-  }
-
-  /**
-   * This variable determines which constructor was used to create
-   * this object and thereby affects the semantics of the
-   * "getMessage" method (see below).
-   */
-  protected boolean specialConstructor;
-
-  /**
-   * This is the last token that has been consumed successfully.  If
-   * this object has been created due to a parse error, the token
-   * followng this token will (therefore) be the first error token.
-   */
-  public Token currentToken;
-
-  /**
-   * Each entry in this array is an array of integers.  Each array
-   * of integers represents a sequence of tokens (by their ordinal
-   * values) that is expected at this point of the parse.
-   */
-  public int[][] expectedTokenSequences;
-
-  /**
-   * This is a reference to the "tokenImage" array of the generated
-   * parser within which the parse error occurred.  This array is
-   * defined in the generated ...Constants interface.
-   */
-  public String[] tokenImage;
-
-  /**
-   * This method has the standard behavior when this object has been
-   * created using the standard constructors.  Otherwise, it uses
-   * "currentToken" and "expectedTokenSequences" to generate a parse
-   * error message and returns it.  If this object has been created
-   * due to a parse error, and you do not catch it (it gets thrown
-   * from the parser), then this method is called during the printing
-   * of the final stack trace, and hence the correct error message
-   * gets displayed.
-   */
-  public String getMessage() {
-    if (!specialConstructor) {
-      return super.getMessage();
-    }
-    StringBuffer expected = new StringBuffer();
-    int maxSize = 0;
-    for (int i = 0; i < expectedTokenSequences.length; i++) {
-      if (maxSize < expectedTokenSequences[i].length) {
-        maxSize = expectedTokenSequences[i].length;
-      }
-      for (int j = 0; j < expectedTokenSequences[i].length; j++) {
-        expected.append(tokenImage[expectedTokenSequences[i][j]]).append(' ');
-      }
-      if (expectedTokenSequences[i][expectedTokenSequences[i].length - 1] != 0) {
-        expected.append("...");
-      }
-      expected.append(eol).append("    ");
-    }
-    String retval = "Encountered \"";
-    Token tok = currentToken.next;
-    for (int i = 0; i < maxSize; i++) {
-      if (i != 0) retval += " ";
-      if (tok.kind == 0) {
-        retval += tokenImage[0];
-        break;
-      }
-      retval += " " + tokenImage[tok.kind];
-      retval += " \"";
-      retval += add_escapes(tok.image);
-      retval += " \"";
-      tok = tok.next; 
-    }
-    retval += "\" at line " + currentToken.next.beginLine + ", column " + currentToken.next.beginColumn;
-    retval += "." + eol;
-    if (expectedTokenSequences.length == 1) {
-      retval += "Was expecting:" + eol + "    ";
-    } else {
-      retval += "Was expecting one of:" + eol + "    ";
-    }
-    retval += expected.toString();
-    return retval;
-  }
-
-  /**
-   * The end of line string for this machine.
-   */
-  protected String eol = System.getProperty("line.separator", "\n");
- 
-  /**
-   * Used to convert raw characters to their escaped version
-   * when these raw version cannot be used as part of an ASCII
-   * string literal.
-   */
-  protected String add_escapes(String str) {
-      StringBuffer retval = new StringBuffer();
-      char ch;
-      for (int i = 0; i < str.length(); i++) {
-        switch (str.charAt(i))
-        {
-           case 0 :
-              continue;
-           case '\b':
-              retval.append("\\b");
-              continue;
-           case '\t':
-              retval.append("\\t");
-              continue;
-           case '\n':
-              retval.append("\\n");
-              continue;
-           case '\f':
-              retval.append("\\f");
-              continue;
-           case '\r':
-              retval.append("\\r");
-              continue;
-           case '\"':
-              retval.append("\\\"");
-              continue;
-           case '\'':
-              retval.append("\\\'");
-              continue;
-           case '\\':
-              retval.append("\\\\");
-              continue;
-           default:
-              if ((ch = str.charAt(i)) < 0x20 || ch > 0x7e) {
-                 String s = "0000" + Integer.toString(ch, 16);
-                 retval.append("\\u" + s.substring(s.length() - 4, s.length()));
-              } else {
-                 retval.append(ch);
-              }
-              continue;
-        }
-      }
-      return retval.toString();
-   }
-
-}
Index: src/java/org/apache/nutch/servlet/Cached.java
===================================================================
--- src/java/org/apache/nutch/servlet/Cached.java	(revision 959954)
+++ src/java/org/apache/nutch/servlet/Cached.java	(working copy)
@@ -1,113 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.servlet;
-
-import org.apache.nutch.metadata.Metadata;
-import org.apache.nutch.net.protocols.Response;
-import org.apache.nutch.searcher.NutchBean;
-import org.apache.nutch.searcher.Hit;
-import org.apache.nutch.searcher.HitDetails;
-import org.apache.nutch.util.NutchConfiguration;
-
-import org.apache.hadoop.conf.Configuration;
-
-import javax.servlet.http.HttpServlet;
-import javax.servlet.http.HttpServletRequest;
-import javax.servlet.http.HttpServletResponse;
-
-import java.io.OutputStream;
-import java.io.IOException;
-
-/**
- * A servlet that serves raw Content of any mime type.
- *
- * If it fails with java.lang.OutOfMemoryError,
- * you may have to increase heap size when starting container,
- * such as -Xms1024M -Xmx1024M
- *
- * @author John Xing
- */
-
-@SuppressWarnings("serial")
-public class Cached extends HttpServlet {
-
-  NutchBean bean = null;
-
-  public void init() {
-    init(NutchConfiguration.create());
-  }
-  
-  public void init(Configuration conf) {
-    try {
-      bean = NutchBean.get(this.getServletContext(), conf);
-    } catch (IOException e) {
-      // nothing
-    }
-  }
-
-  public void destroy() {
-    // maybe clean bean?
-    // nothing now
-  }
- 
-  public void doGet(HttpServletRequest request, HttpServletResponse response)
-    throws IOException {
-
-    // quit if no bean
-    if (bean == null)
-      return;
-
-    if (NutchBean.LOG.isInfoEnabled()) {
-      NutchBean.LOG.info("request from " + request.getRemoteAddr());
-    }
-
-    Hit hit = new Hit(Integer.parseInt(request.getParameter("idx")),
-                      request.getParameter("id"));
-    HitDetails details = bean.getDetails(hit);
-
-    // raw bytes
-    byte[] bytes = bean.getContent(details);
-
-    // pass all original headers? only these for now.
-    Metadata metadata = bean.getParseData(details).getContentMeta();
-    String contentType = metadata.get(Response.CONTENT_TYPE);
-    //String lastModified = metadata.get(Metadata.LAST_MODIFIED);
-    //String contentLength = metadata.get(Metadata.CONTENT_LENGTH);
-    // better use this, since it may have been truncated during fetch
-    // or give warning if they don't match?
-    int contentLength = bytes.length;
-
-    // response
-    response.setContentType(contentType);
-    response.setContentLength(contentLength);
-
-    OutputStream os = response.getOutputStream();
-    os.write(bytes);
-    // need this or flush more frequently?
-    //os.flush();
-    os.close();
-
-    return;
-  }
-
-  public void doPost(HttpServletRequest request, HttpServletResponse response)
-    throws IOException {
-    doGet(request, response);
-  }
-
-}
Index: src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java
===================================================================
--- src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java	(revision 959954)
+++ src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java	(working copy)
@@ -1,292 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.searcher;
-
-import java.io.IOException;
-import java.util.*;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.misc.ChainedFilter;
-import org.apache.lucene.search.*;
-import org.apache.lucene.search.Searcher;
-
-/** Utility which converts certain query clauses into {@link QueryFilter}s and
- * caches these.  Only required clauses whose boost is zero are converted to
- * cached filters.  Range queries are converted to range filters.  This
- * accellerates query constraints like date, language, document format, etc.,
- * which do not affect ranking but might otherwise slow search considerably. */
-class LuceneQueryOptimizer {
-
-  // This thread provides a pseudo-clock service to all searching
-  // threads, so that they can count elapsed time with less overhead than
-  // repeatedly calling System.currentTimeMillis.
-  private TimerThread timerThread = null;
-
-  private static class TimerThread extends Thread {
-    private int tick;
-    // NOTE: we can avoid explicit synchronization here for several reasons:
-    // * updates to 32-bit-sized variables are atomic
-    // * only single thread modifies this value
-    // * use of volatile keyword ensures that it does not reside in
-    //   a register, but in main memory (so that changes are visible to
-    //   other threads).
-    // * visibility of changes does not need to be instantanous, we can
-    //   afford losing a tick or two.
-    //
-    // See section 17 of the Java Language Specification for details.
-    public volatile int timeCounter = 0;
-
-    boolean running = true;
-
-    public TimerThread(int tick) {
-      super("LQO timer thread");
-      this.tick = tick;
-      this.setDaemon(true);
-    }
-
-    public void run() {
-      while(running) {
-        timeCounter++;
-        try {
-          Thread.sleep(tick);
-        } catch (InterruptedException ie) {};
-      }
-    }
-  }
-
-  private void initTimerThread(int p) {
-    if (timerThread == null || !timerThread.isAlive()) {
-      timerThread = new TimerThread(p);
-      timerThread.start();
-    }
-  }
-  
-
-  @SuppressWarnings("serial")
-  private static class TimeExceeded extends RuntimeException {
-    public long maxTime;
-    private int maxDoc;
-    public TimeExceeded(long maxTime, int maxDoc) {
-      super("Exceeded search time: " + maxTime + " ms.");
-      this.maxTime = maxTime;
-      this.maxDoc = maxDoc;
-    }
-  }
-
-  private static class LimitedCollector extends Collector {
-    private int maxHits;
-    private int maxTicks;
-    private int startTicks;
-    private TimerThread timer;
-    private int curTicks;
-    private TopDocsCollector<ScoreDoc> delegate;
-
-    public LimitedCollector(int numHits, int maxHits, int maxTicks,
-            TimerThread timer) {
-      final boolean docsScoredInOrder = true;
-      delegate = TopScoreDocCollector.create(numHits, docsScoredInOrder);
-      this.maxHits = maxHits;
-      this.maxTicks = maxTicks;
-      if (timer != null) {
-    	this.timer = timer;
-        this.startTicks = timer.timeCounter;
-      }
-    }
-
-    @Override
-    public boolean acceptsDocsOutOfOrder() {
-      return delegate.acceptsDocsOutOfOrder();
-    }
-
-    @Override
-    public void collect(int doc) throws IOException {
-      if (maxHits > 0 && delegate.getTotalHits() >= maxHits) {
-        throw new LimitExceeded(doc);
-      }
-      if (timer != null) {
-        curTicks = timer.timeCounter;
-        // overflow check
-        if (curTicks < startTicks) curTicks += Integer.MAX_VALUE;
-        if (curTicks - startTicks > maxTicks) {
-          throw new TimeExceeded(timer.tick * (curTicks - startTicks), doc);
-        }
-      }
-      delegate.collect(doc);
-    }
-
-    @Override
-    public void setNextReader(IndexReader r, int base)
-        throws IOException {
-      delegate.setNextReader(r, base);
-    }
-
-    @Override
-    public void setScorer(Scorer scorer) throws IOException {
-      delegate.setScorer(scorer);
-    }
-
-    public TopDocs topDocs() {
-      return delegate.topDocs();
-    }
-  }
-  
-  @SuppressWarnings("serial")
-private static class LimitExceeded extends RuntimeException {
-    private int maxDoc;
-    public LimitExceeded(int maxDoc) { this.maxDoc = maxDoc; }    
-  }
-  
-  private LinkedHashMap<BooleanQuery, Filter> cache;                   // an LRU cache of QueryFilter
-  
-  private float threshold;
-
-  private int searcherMaxHits;
-
-  private int tickLength;
-
-  private int maxTickCount;
-  
-  /**
-   * Construct an optimizer that caches and uses filters for required clauses
-   * whose boost is zero.
-   * 
-   * @param cacheSize
-   *          the number of QueryFilters to cache
-   * @param threshold
-   *          the fraction of documents which must contain a term
-   */
-  @SuppressWarnings("serial")
-public LuceneQueryOptimizer(Configuration conf) {
-    final int cacheSize = conf.getInt("searcher.filter.cache.size", 16);
-    this.threshold = conf.getFloat("searcher.filter.cache.threshold",
-        0.05f);
-    this.searcherMaxHits = conf.getInt("searcher.max.hits", -1);
-    this.cache = new LinkedHashMap<BooleanQuery, Filter>(cacheSize, 0.75f, true) {
-      protected boolean removeEldestEntry(Map.Entry<BooleanQuery, Filter> eldest) {
-        return size() > cacheSize; // limit size of cache
-      }
-    };
-    this.tickLength = conf.getInt("searcher.max.time.tick_length", 200);
-    this.maxTickCount = conf.getInt("searcher.max.time.tick_count", -1);
-    if (this.maxTickCount > 0) {
-      initTimerThread(this.tickLength);
-    }
-  }
-
-  public TopDocs optimize(BooleanQuery original,
-                          Searcher searcher, int numHits,
-                          String sortField, boolean reverse)
-    throws IOException {
-
-    BooleanQuery query = new BooleanQuery();
-    BooleanQuery cacheQuery = new BooleanQuery();
-    BooleanQuery filterQuery = new BooleanQuery();
-    ArrayList<Filter> filters = new ArrayList<Filter>();
-
-    BooleanClause[] clauses = original.getClauses();
-    for (int i = 0; i < clauses.length; i++) {
-      BooleanClause c = clauses[i];
-      if (c.isRequired()                          // required
-          && c.getQuery().getBoost() == 0.0f) {   // boost is zero
-
-        if (c.getQuery() instanceof TermQuery     // TermQuery
-            && (searcher.docFreq(((TermQuery)c.getQuery()).getTerm())
-                / (float)searcher.maxDoc()) < threshold) { // beneath threshold
-          query.add(c);                           // don't filterize
-          continue;
-        }
-          
-        if (c.getQuery() instanceof TermRangeQuery) { // RangeQuery
-          TermRangeQuery range = (TermRangeQuery)c.getQuery();
-          filters.add(new TermRangeFilter(range.getField(), 
-              range.getLowerTerm(), range.getUpperTerm(), 
-              range.includesLower(), range.includesUpper()));
-          cacheQuery.add(c.getQuery(), BooleanClause.Occur.MUST); // cache it
-          continue;
-        }
-
-        // all other query types
-        filterQuery.add(c.getQuery(), BooleanClause.Occur.MUST);  // filter it
-        cacheQuery.add(c.getQuery(), BooleanClause.Occur.MUST);   // cache it
-        continue;
-      }
-
-      query.add(c);                               // query it
-    }
-
-    Filter filter = null;
-    if (cacheQuery.getClauses().length != 0) {
-      synchronized (cache) {                      // check cache
-        filter = cache.get(cacheQuery);
-      }
-      if (filter == null) {                       // miss
-
-        if (filterQuery.getClauses().length != 0) // add filterQuery to filters
-          filters.add(new CachingWrapperFilter(new QueryWrapperFilter(filterQuery)));
-
-        if (filters.size() == 1) {                // convert filters to filter
-          filter = (Filter)filters.get(0);
-        } else {
-          filter = new ChainedFilter((Filter[])filters.toArray
-                                     (new Filter[filters.size()]),
-                                     ChainedFilter.AND);
-        }
-        if (!(filter instanceof CachingWrapperFilter))     // make sure bits are cached
-          filter = new CachingWrapperFilter(filter);
-        
-        synchronized (cache) {
-          cache.put(cacheQuery, filter);          // cache the filter
-        }
-      }        
-    }
-    if (sortField == null && !reverse) {
-
-      // no hit limit
-      if (this.searcherMaxHits <= 0 && timerThread == null)  {
-        return searcher.search(query, filter, numHits);
-      }
-
-      // hits limited in time or in count -- use a LimitedCollector
-      LimitedCollector collector = new LimitedCollector(numHits, searcherMaxHits,
-              maxTickCount, timerThread);
-      LimitExceeded exceeded = null;
-      TimeExceeded timeExceeded = null;
-      try {
-        searcher.search(query, filter, collector);
-      } catch (LimitExceeded le) {
-        exceeded = le;
-      } catch (TimeExceeded te) {
-        timeExceeded = te;
-      }
-      TopDocs results = collector.topDocs();
-      if (exceeded != null) {                     // limit was exceeded
-        results.totalHits = (int)                 // must estimate totalHits
-          (results.totalHits*(searcher.maxDoc()/(float)exceeded.maxDoc));
-      } else if (timeExceeded != null) {
-        // Estimate total hits.
-        results.totalHits = (int)(results.totalHits * (searcher.maxDoc()/(float)timeExceeded.maxDoc));
-      }
-      return results;
-
-    } else {
-      return searcher.search(query, filter, numHits,
-                             new Sort(new SortField(sortField, SortField.STRING, reverse)));
-    }
-  }
-}
Index: src/java/org/apache/nutch/searcher/Query.java
===================================================================
--- src/java/org/apache/nutch/searcher/Query.java	(revision 959954)
+++ src/java/org/apache/nutch/searcher/Query.java	(working copy)
@@ -1,502 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.searcher;
-
-import java.io.DataInput;
-import java.io.DataOutput;
-import java.io.IOException;
-import java.io.BufferedReader;
-import java.io.InputStreamReader;
-import java.util.Arrays;
-import java.util.ArrayList;
-
-// Commons Logging imports
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-
-import org.apache.hadoop.conf.Configurable;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.Writable;
-import org.apache.nutch.analysis.AnalyzerFactory;
-
-import org.apache.nutch.analysis.NutchAnalysis;
-import org.apache.nutch.util.NutchConfiguration;
-
-/** A Nutch query. */
-public final class Query implements Writable, Cloneable, Configurable {
-  public static final Log LOG = LogFactory.getLog(Query.class);
-
-  private QueryParams params = new QueryParams();
-  
-  public void setParams(QueryParams context) {
-    this.params = context;
-  }
-
-  public QueryParams getParams() {
-    return params;
-  }
-
-  /** A query clause. */
-  public static class Clause implements Cloneable {
-    public static final String DEFAULT_FIELD = "DEFAULT";
-
-    private static final byte REQUIRED_BIT = 1;
-    private static final byte PROHIBITED_BIT = 2;
-    private static final byte PHRASE_BIT = 4;
-
-    private boolean isRequired;
-    private boolean isProhibited;
-    private String field = DEFAULT_FIELD;
-    private float weight = 1.0f;
-    private Object termOrPhrase;
-
-    private Configuration conf; 
-
-    public Clause(Term term, String field,
-                  boolean isRequired, boolean isProhibited, Configuration conf) {
-      this(term, isRequired, isProhibited, conf);
-      this.field = field;
-    }
-
-    public Clause(Term term, boolean isRequired, boolean isProhibited, Configuration conf) {
-      this.isRequired = isRequired;
-      this.isProhibited = isProhibited;
-      this.termOrPhrase = term;
-      this.conf = conf;
-    }
-
-    public Clause(Phrase phrase, String field,
-                  boolean isRequired, boolean isProhibited, Configuration conf) {
-      this(phrase, isRequired, isProhibited, conf);
-      this.field = field;
-    }
-
-    public Clause(Phrase phrase, boolean isRequired, boolean isProhibited, Configuration conf) {
-      this.isRequired = isRequired;
-      this.isProhibited = isProhibited;
-      this.termOrPhrase = phrase;
-      this.conf = conf;
-    }
-
-    public boolean isRequired() { return isRequired; }
-    public boolean isProhibited() { return isProhibited; }
-
-    public String getField() { return field; }
-
-    public float getWeight() { return weight; }
-    public void setWeight(float weight) {  this.weight = weight; }
-
-    public boolean isPhrase() { return termOrPhrase instanceof Phrase; }
-
-    public Phrase getPhrase() { return (Phrase)termOrPhrase; }
-    public Term getTerm() { return (Term)termOrPhrase; }
-
-    public void write(DataOutput out) throws IOException {
-      byte bits = 0;
-      if (isPhrase())
-        bits |= PHRASE_BIT;
-      if (isRequired)
-        bits |= REQUIRED_BIT;
-      if (isProhibited)
-        bits |= PROHIBITED_BIT;
-      out.writeByte(bits);
-      out.writeUTF(field);
-      out.writeFloat(weight);
-      
-      if (isPhrase())
-        getPhrase().write(out);
-      else
-        getTerm().write(out);
-    }
-
-    public static Clause read(DataInput in, Configuration conf) throws IOException {
-      byte bits = in.readByte();
-      boolean required = ((bits & REQUIRED_BIT) != 0);
-      boolean prohibited = ((bits & PROHIBITED_BIT) != 0);
-
-      String field = in.readUTF();
-      float weight = in.readFloat();
-
-      Clause clause;
-      if ((bits & PHRASE_BIT) == 0) {
-        clause = new Clause(Term.read(in), field, required, prohibited, conf);
-      } else {
-        clause = new Clause(Phrase.read(in), field, required, prohibited, conf);
-      }
-      clause.weight = weight;
-      return clause;
-    }
-
-    public String toString() {
-      StringBuffer buffer = new StringBuffer();
-//       if (isRequired)
-//         buffer.append("+");
-//       else
-      if (isProhibited)
-        buffer.append ("-");
-
-      if (!DEFAULT_FIELD.equals(field)) {
-        buffer.append(field);
-        buffer.append(":");
-      }
-
-      if (!isPhrase() && new QueryFilters(conf).isRawField(field)) {
-        buffer.append('"');                        // quote raw terms
-        buffer.append(termOrPhrase.toString());
-        buffer.append('"');
-      } else {
-        buffer.append(termOrPhrase.toString());
-      }
-
-      return buffer.toString();
-    }
-
-    public boolean equals(Object o) {
-      if (!(o instanceof Clause)) return false;
-      Clause other = (Clause)o;
-      return
-        (this.isRequired == other.isRequired) &&
-        (this.isProhibited == other.isProhibited) &&
-        (this.weight == other.weight) &&
-        (this.termOrPhrase == null ? other.termOrPhrase == null :
-         this.termOrPhrase.equals(other.termOrPhrase));
-    }
-        
-    public int hashCode() {
-      return
-        (this.isRequired ? 0 : 1) ^
-        (this.isProhibited ? 2 : 4) ^
-        Float.floatToIntBits(this.weight) ^
-        (this.termOrPhrase != null ? termOrPhrase.hashCode() : 0);
-    }
-    
-    public Object clone() {
-      try {
-        return super.clone();
-      } catch (CloneNotSupportedException e) {
-        throw new RuntimeException(e);
-      }
-    }
-  }
-
-  /** A single-term query clause. */
-  public static class Term {
-    private String text;
-
-    public Term(String text) {
-      this.text = text;
-    }
-
-    public void write(DataOutput out) throws IOException {
-      out.writeUTF(text);
-    }
-
-    public static Term read(DataInput in) throws IOException {
-      String text = in.readUTF();
-      return new Term(text);
-    }
-
-    public String toString() {
-      return text;
-    }
-
-    public boolean equals(Object o) {
-      if (!(o instanceof Term)) return false;
-      Term other = (Term)o;
-      return text == null ? other.text == null : text.equals(other.text);
-    }
-
-    public int hashCode() {
-      return text != null ? text.hashCode() : 0;
-    }
-  }
-
-  /** A phrase query clause. */
-  public static class Phrase {
-    private Term[] terms;
-
-    public Phrase(Term[] terms) {
-      this.terms = terms;
-    }
-
-    public Phrase(String[] terms) {
-      this.terms = new Term[terms.length];
-      for (int i = 0; i < terms.length; i++) {
-        this.terms[i] = new Term(terms[i]);
-      }
-    }
-
-    public Term[] getTerms() { return terms; }
-
-    public void write(DataOutput out) throws IOException {
-      out.writeByte(terms.length);
-      for (int i = 0; i < terms.length; i++)
-        terms[i].write(out);
-    }
-
-    public static Phrase read(DataInput in) throws IOException {
-      int length = in.readByte();
-      Term[] terms = new Term[length];
-      for (int i = 0; i < length; i++)
-        terms[i] = Term.read(in);
-      return new Phrase(terms);
-    }
-
-    public String toString() {
-      StringBuffer buffer = new StringBuffer();
-      buffer.append("\"");
-      for (int i = 0; i < terms.length; i++) {
-        buffer.append(terms[i].toString());
-        if (i != terms.length-1)
-          buffer.append(" ");
-      }
-      buffer.append("\"");
-      return buffer.toString();
-    }
-
-    public boolean equals(Object o) {
-      if (!(o instanceof Phrase)) return false;
-      Phrase other = (Phrase)o;
-      if (!(this.terms.length == this.terms.length))
-        return false;
-      for (int i = 0; i < terms.length; i++) {
-        if (!this.terms[i].equals(other.terms[i]))
-          return false;
-      }
-      return true;
-    }
-
-    public int hashCode() {
-      int hashCode = terms.length;
-      for (int i = 0; i < terms.length; i++) {
-        hashCode ^= terms[i].hashCode();
-      }
-      return hashCode;
-    }
-
-  }
-
-
-  private ArrayList<Clause> clauses = new ArrayList<Clause>();
-
-  private Configuration conf;
-
-  private static final Clause[] CLAUSES_PROTO = new Clause[0];
-  
-  public Query() {
-  }
-  
-  public Query(Configuration conf) {
-      this.conf = conf;
-  }
-
-  public void setConf(Configuration conf) {
-    this.conf = conf;
-  }
-  
-  public Configuration getConf() {
-    return conf;
-  }
-  
-  /** Return all clauses. */
-  public Clause[] getClauses() {
-    return clauses.toArray(CLAUSES_PROTO);
-  }
-
-  /** Add a required term in the default field. */
-  public void addRequiredTerm(String term) {
-    addRequiredTerm(term, Clause.DEFAULT_FIELD);
-  }
-
-  /** Add a required term in a specified field. */
-  public void addRequiredTerm(String term, String field) {
-    clauses.add(new Clause(new Term(term), field, true, false, this.conf));
-  }
-
-  /** Add a prohibited term in the default field. */
-  public void addProhibitedTerm(String term) {
-    addProhibitedTerm(term, Clause.DEFAULT_FIELD);
-  }
-
-  /** Add a prohibited term in the specified field. */
-  public void addProhibitedTerm(String term, String field) {
-    clauses.add(new Clause(new Term(term), field, false, true, this.conf));
-  }
-
-  /** Add a required phrase in the default field. */
-  public void addRequiredPhrase(String[] terms) {
-    addRequiredPhrase(terms, Clause.DEFAULT_FIELD);
-  }
-
-  /** Add a required phrase in the specified field. */
-  public void addRequiredPhrase(String[] terms, String field) {
-    if (terms.length == 0) {                      // ignore empty phrase
-    } else if (terms.length == 1) {
-      addRequiredTerm(terms[0], field);           // optimize to term query
-    } else {
-      clauses.add(new Clause(new Phrase(terms), field, true, false, this.conf));
-    }
-  }
-
-  /** Add a prohibited phrase in the default field. */
-  public void addProhibitedPhrase(String[] terms) {
-    addProhibitedPhrase(terms, Clause.DEFAULT_FIELD);
-  }
-
-  /** Add a prohibited phrase in the specified field. */
-  public void addProhibitedPhrase(String[] terms, String field) {
-    if (terms.length == 0) {                      // ignore empty phrase
-    } else if (terms.length == 1) {
-      addProhibitedTerm(terms[0], field);         // optimize to term query
-    } else {
-      clauses.add(new Clause(new Phrase(terms), field, false, true, this.conf));
-    }
-  }
-
-  public void write(DataOutput out) throws IOException {
-    out.writeByte(clauses.size());
-    for (int i = 0; i < clauses.size(); i++)
-      clauses.get(i).write(out);
-    params.write(out);
-  }
-  
-  public static Query read(DataInput in, Configuration conf) throws IOException {
-    Query result = new Query(conf);
-    result.readFields(in);
-    return result;
-  }
-
-  public void readFields(DataInput in) throws IOException {
-    clauses.clear();
-    int length = in.readByte();
-    for (int i = 0; i < length; i++)
-      clauses.add(Clause.read(in, this.conf));
-    
-    params.readFields(in);
-  }
-
-  public String toString() {
-    StringBuffer buffer = new StringBuffer();
-    for (int i = 0; i < clauses.size(); i++) {
-      buffer.append(clauses.get(i).toString());
-      if (i != clauses.size()-1)
-        buffer.append(" ");
-    }
-    return buffer.toString();
-  }
-
-  public boolean equals(Object o) {
-    if (!(o instanceof Query)) return false;
-    Query other = (Query)o;
-    return this.clauses.equals(other.clauses) && this.params.equals(other.params);
-  }
-  
-  public int hashCode() {
-    return this.clauses.hashCode();
-  }
-
-  public Object clone() {
-    Query clone = null;
-    try {
-      clone = (Query)super.clone();
-    } catch (CloneNotSupportedException e) {
-      throw new RuntimeException(e);
-    }
-    clone.clauses = (ArrayList<Clause>)clauses.clone();
-    return clone;
-  }
-
-
-  /** Flattens a query into the set of text terms that it contains.  These are
-   * terms which should be higlighted in matching documents. */
-  public String[] getTerms() {
-    ArrayList<String> result = new ArrayList<String>();
-    for (int i = 0; i < clauses.size(); i++) {
-      Clause clause = clauses.get(i);
-      if (!clause.isProhibited()) {
-        if (clause.isPhrase()) {
-          Term[] terms = clause.getPhrase().getTerms();
-          for (int j = 0; j < terms.length; j++) {
-            result.add(terms[j].toString());
-          }
-        } else {
-          result.add(clause.getTerm().toString());
-        }
-      }
-    }
-    return result.toArray(new String[result.size()]);
-  }
-
-  /**
-   * Parse a query from a string using a language specific analyzer.
-   *
-   * @param queryString is the raw query string to parse
-   * @param queryLang is a two-letters language code used to identify which
-   *        {@link org.apache.nutch.analysis.NutchAnalyzer} should be used
-   *        to parse the query string.
-   * @see org.apache.nutch.analysis.AnalyzerFactory
-   */
-  public static Query parse(String queryString, String queryLang, Configuration conf)
-  throws IOException {
-    return fixup(NutchAnalysis.parseQuery(
-            queryString, AnalyzerFactory.get(conf).get(queryLang), conf), conf);
-  }
-
-  /** Parse a query from a string. */
-  public static Query parse(String queryString, Configuration conf) throws IOException {
-    return parse(queryString, null, conf);
-  }
-
-  /** Convert clauses in unknown fields to the default field. */
-  private static Query fixup(Query input, Configuration conf) {
-    // walk the query
-    Query output = new Query(conf);
-    Clause[] clauses = input.getClauses();
-    for (int i = 0; i < clauses.length; i++) {
-      Clause c = clauses[i];
-      if (!new QueryFilters(conf).isField(c.getField())) {  // unknown field
-        ArrayList<Term> terms = new ArrayList<Term>();        // add name to query
-        if (c.isPhrase()) {                       
-          terms.addAll(Arrays.asList(c.getPhrase().getTerms()));
-        } else {
-          terms.add(c.getTerm());
-        }
-        terms.add(0, new Term(c.getField()));     // add to front of phrase
-        c = (Clause)c.clone();
-        c.field = Clause.DEFAULT_FIELD;           // use default field instead
-        c.termOrPhrase
-          = new Phrase(terms.toArray(new Term[terms.size()]));
-      }
-      output.clauses.add(c);                    // copy clause to output
-    }
-    return output;
-  }
-
-  /** For debugging. */
-  public static void main(String[] args) throws Exception {
-    BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
-    Configuration conf = NutchConfiguration.create();
-    while (true) {
-      System.out.print("Query: ");
-      String line = in.readLine();
-      Query query = parse(line, conf);
-      System.out.println("Parsed: " + query);
-      System.out.println("Translated: " + new QueryFilters(conf).filter(query));
-    }
-  }
-}
Index: src/java/org/apache/nutch/searcher/QueryParams.java
===================================================================
--- src/java/org/apache/nutch/searcher/QueryParams.java	(revision 959954)
+++ src/java/org/apache/nutch/searcher/QueryParams.java	(working copy)
@@ -1,154 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.searcher;
-
-import java.io.DataInput;
-import java.io.DataOutput;
-import java.io.IOException;
-
-import org.apache.hadoop.io.Writable;
-import org.apache.hadoop.io.WritableUtils;
-import org.apache.nutch.metadata.Metadata;
-
-/**
- * Query context object that describes the context of the query.
- */
-public class QueryParams implements Writable {
-
-  public static final String DEFAULT_DEDUP_FIELD = "site";
-  public static final int DEFAULT_MAX_HITS_PER_DUP = 2;
-  public static final int DEFAULT_NUM_HITS = 10;
-  public static final boolean DEFAULT_REVERSE = false;
-
-  private Metadata metadata = new Metadata();
-
-  public void setNumHits(int numHits) {
-    this.numHits = numHits;
-  }
-
-  public void setMaxHitsPerDup(int maxHitsPerDup) {
-    this.maxHitsPerDup = maxHitsPerDup;
-  }
-
-  public void setDedupField(String dedupField) {
-    this.dedupField = dedupField;
-  }
-
-  public void setSortField(String sortField) {
-    this.sortField = sortField;
-  }
-
-  public void setReverse(boolean reverse) {
-    this.reverse = reverse;
-  }
-
-  private int numHits;
-  private int maxHitsPerDup;
-  private String dedupField;
-  private String sortField;
-  private boolean reverse;
-
-  public QueryParams() {
-    setNumHits(DEFAULT_NUM_HITS);
-    setMaxHitsPerDup(DEFAULT_MAX_HITS_PER_DUP);
-    setDedupField(DEFAULT_DEDUP_FIELD);
-    setSortField(sortField);
-    setReverse(false);
-
-  }
-
-  public QueryParams(int numHits, int maxHitsPerDup, String dedupField,
-      String sortField, boolean reverse) {
-    initFrom(numHits, maxHitsPerDup, dedupField, sortField, reverse);
-  }
-
-  public void initFrom(int numHits, int maxHitsPerDup, String dedupField,
-      String sortField, boolean reverse) {
-    setNumHits(numHits);
-    setMaxHitsPerDup(maxHitsPerDup);
-    setDedupField(dedupField);
-    setSortField(sortField);
-    setReverse(reverse);
-  }
-
-  public int getNumHits() {
-    return numHits;
-  }
-
-  public int getMaxHitsPerDup() {
-    return maxHitsPerDup;
-  }
-
-  public String getDedupField() {
-    return dedupField;
-  }
-
-  public String getSortField() {
-    return sortField;
-  }
-
-  public boolean isReverse() {
-    return reverse;
-  }
-
-  public String get(String name) {
-    return metadata.get(name);
-  }
-
-  public void put(String name, String value) {
-    metadata.set(name, value);
-  }
-
-  @Override
-  public void readFields(DataInput input) throws IOException {
-    metadata.readFields(input);
-    numHits = WritableUtils.readVInt(input);
-    maxHitsPerDup = WritableUtils.readVInt(input);
-    dedupField = WritableUtils.readString(input);
-    sortField = WritableUtils.readString(input);
-    reverse = input.readBoolean();
-  }
-
-  @Override
-  public void write(DataOutput output) throws IOException {
-    metadata.write(output);
-    WritableUtils.writeVInt(output, numHits);
-    WritableUtils.writeVInt(output, maxHitsPerDup);
-    WritableUtils.writeString(output, dedupField);
-    WritableUtils.writeString(output, sortField);
-    output.writeBoolean(reverse);
-  }
-
-  @Override
-  public boolean equals(Object obj) {
-    if (obj instanceof QueryParams) {
-
-      QueryParams other = (QueryParams) obj;
-      return other.numHits == this.numHits
-          && other.metadata.equals(this.metadata)
-          && other.reverse == this.reverse
-          && other.maxHitsPerDup == this.maxHitsPerDup
-          && ((other.dedupField != null && other.dedupField
-              .equals(this.dedupField)) || (other.dedupField == null && this.dedupField == null))
-          && ((other.sortField != null && other.sortField
-              .equals(this.sortField)) || (other.sortField == null && this.sortField == null));
-
-    } else {
-      return false;
-    }
-  }
-}
Index: src/java/org/apache/nutch/searcher/QueryFilter.java
===================================================================
--- src/java/org/apache/nutch/searcher/QueryFilter.java	(revision 959954)
+++ src/java/org/apache/nutch/searcher/QueryFilter.java	(working copy)
@@ -1,42 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.searcher;
-
-// Lucene imports
-import org.apache.lucene.search.BooleanQuery;
-
-// Hadoop imports
-import org.apache.hadoop.conf.Configurable;
-
-// Nutch imports
-import org.apache.nutch.plugin.Pluggable;
-
-
-/** Extension point for query translation.  Permits one to add metadata to a
- * query.  All plugins found which implement this extension point are run
- * sequentially on the query.
- */
-public interface QueryFilter extends Pluggable, Configurable {
-  /** The name of the extension point. */
-  final static String X_POINT_ID = QueryFilter.class.getName();
-
-  /** Adds clauses or otherwise modifies the BooleanQuery that will be
-   * searched. */
-  BooleanQuery filter(Query input, BooleanQuery translation)
-    throws QueryException;
-}
Index: src/java/org/apache/nutch/searcher/response/SearchResults.java
===================================================================
--- src/java/org/apache/nutch/searcher/response/SearchResults.java	(revision 959954)
+++ src/java/org/apache/nutch/searcher/response/SearchResults.java	(working copy)
@@ -1,156 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.searcher.response;
-
-import org.apache.nutch.searcher.Hit;
-import org.apache.nutch.searcher.HitDetails;
-import org.apache.nutch.searcher.Summary;
-
-public class SearchResults {
-
-  private String[] fields;
-  private String responseType;
-  private String query;
-  private String lang;
-  private String sort;
-  private boolean reverse;
-  private boolean withSummary = true;
-  private int start;
-  private int rows;
-  private int end;
-  private long totalHits;
-  private Hit[] hits;
-  private HitDetails[] details;
-  private Summary[] summaries;
-
-  public SearchResults() {
-
-  }
-
-  public String[] getFields() {
-    return fields;
-  }
-
-  public void setFields(String[] fields) {
-    this.fields = fields;
-  }
-
-  public boolean isWithSummary() {
-    return withSummary;
-  }
-
-  public void setWithSummary(boolean withSummary) {
-    this.withSummary = withSummary;
-  }
-
-  public String getResponseType() {
-    return responseType;
-  }
-
-  public void setResponseType(String responseType) {
-    this.responseType = responseType;
-  }
-
-  public String getQuery() {
-    return query;
-  }
-
-  public void setQuery(String query) {
-    this.query = query;
-  }
-
-  public String getLang() {
-    return lang;
-  }
-
-  public void setLang(String lang) {
-    this.lang = lang;
-  }
-
-  public String getSort() {
-    return sort;
-  }
-
-  public void setSort(String sort) {
-    this.sort = sort;
-  }
-
-  public boolean isReverse() {
-    return reverse;
-  }
-
-  public void setReverse(boolean reverse) {
-    this.reverse = reverse;
-  }
-
-  public int getStart() {
-    return start;
-  }
-
-  public void setStart(int start) {
-    this.start = start;
-  }
-
-  public int getRows() {
-    return rows;
-  }
-
-  public void setRows(int rows) {
-    this.rows = rows;
-  }
-
-  public int getEnd() {
-    return end;
-  }
-
-  public void setEnd(int end) {
-    this.end = end;
-  }
-
-  public long getTotalHits() {
-    return totalHits;
-  }
-
-  public void setTotalHits(long totalHits) {
-    this.totalHits = totalHits;
-  }
-
-  public Hit[] getHits() {
-    return hits;
-  }
-
-  public void setHits(Hit[] hits) {
-    this.hits = hits;
-  }
-
-  public HitDetails[] getDetails() {
-    return details;
-  }
-
-  public void setDetails(HitDetails[] details) {
-    this.details = details;
-  }
-
-  public Summary[] getSummaries() {
-    return summaries;
-  }
-
-  public void setSummaries(Summary[] summaries) {
-    this.summaries = summaries;
-  }
-
-}
Index: src/java/org/apache/nutch/searcher/response/RequestUtils.java
===================================================================
--- src/java/org/apache/nutch/searcher/response/RequestUtils.java	(revision 959954)
+++ src/java/org/apache/nutch/searcher/response/RequestUtils.java	(working copy)
@@ -1,88 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.searcher.response;
-
-import javax.servlet.http.HttpServletRequest;
-
-import org.apache.commons.lang.StringUtils;
-
-/**
- * A set of utility methods for getting request paramters.
- */
-public class RequestUtils {
-
-  public static boolean parameterExists(HttpServletRequest request, String param) {
-    String value = request.getParameter(param);
-    return value != null;
-  }
-
-  public static Integer getIntegerParameter(HttpServletRequest request,
-    String param) {
-    if (parameterExists(request, param)) {
-      String value = request.getParameter(param);
-      if (StringUtils.isNotBlank(value) && StringUtils.isNumeric(value)) {
-        return new Integer(value);
-      }
-    }
-    return null;
-  }
-
-  public static Integer getIntegerParameter(HttpServletRequest request,
-    String param, Integer def) {
-    Integer value = getIntegerParameter(request, param);
-    return (value == null) ? def : value;
-  }
-
-  public static String getStringParameter(HttpServletRequest request,
-    String param) {
-    if (parameterExists(request, param)) {
-      return request.getParameter(param);
-    }
-    return null;
-  }
-
-  public static String getStringParameter(HttpServletRequest request,
-    String param, String def) {
-    String value = getStringParameter(request, param);
-    return (value == null) ? def : value;
-  }
-
-  public static Boolean getBooleanParameter(HttpServletRequest request,
-    String param) {
-    if (parameterExists(request, param)) {
-      String value = request.getParameter(param);
-      if (StringUtils.isNotBlank(value)
-        && (StringUtils.equals(value, "1")
-          || StringUtils.equalsIgnoreCase(value, "true") || StringUtils.equalsIgnoreCase(
-          value, "yes"))) {
-        return true;
-      }
-    }
-    return false;
-  }
-
-  public static Boolean getBooleanParameter(HttpServletRequest request,
-    String param, Boolean def) {
-    if (parameterExists(request, param)) {
-      String value = request.getParameter(param);
-      return (StringUtils.isNotBlank(value) && (StringUtils.equals(value, "1")
-        || StringUtils.equalsIgnoreCase(value, "true") || StringUtils.equalsIgnoreCase(
-        value, "yes")));
-    }
-    return def;
-  }
-}
Index: src/java/org/apache/nutch/searcher/response/SearchServlet.java
===================================================================
--- src/java/org/apache/nutch/searcher/response/SearchServlet.java	(revision 959954)
+++ src/java/org/apache/nutch/searcher/response/SearchServlet.java	(working copy)
@@ -1,212 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.searcher.response;
-
-import java.io.IOException;
-
-import javax.servlet.ServletConfig;
-import javax.servlet.ServletException;
-import javax.servlet.http.HttpServlet;
-import javax.servlet.http.HttpServletRequest;
-import javax.servlet.http.HttpServletResponse;
-
-import org.apache.commons.lang.StringUtils;
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.searcher.Hit;
-import org.apache.nutch.searcher.HitDetails;
-import org.apache.nutch.searcher.Hits;
-import org.apache.nutch.searcher.NutchBean;
-import org.apache.nutch.searcher.Query;
-import org.apache.nutch.searcher.Summary;
-import org.apache.nutch.util.NutchConfiguration;
-
-/**
- * Servlet that allows returning search results in multiple different formats
- * through a ResponseWriter Nutch extension point.
- * 
- * @see org.apache.nutch.searcher.response.ResponseWriter
- */
-public class SearchServlet
-  extends HttpServlet {
-
-  public static final Log LOG = LogFactory.getLog(SearchServlet.class);
-  private NutchBean bean;
-  private Configuration conf;
-  private ResponseWriters writers;
-
-  private String defaultRespType = "xml";
-  private String defaultLang = null;
-  private int defaultNumRows = 10;
-  private String defaultDedupField = "site";
-  private int defaultNumDupes = 1;
-
-  public static final String RESPONSE_TYPE = "rt";
-  public static final String QUERY = "query";
-  public static final String LANG = "lang";
-  public static final String START = "start";
-  public static final String ROWS = "rows";
-  public static final String SORT = "sort";
-  public static final String REVERSE = "reverse";
-  public static final String DEDUPE = "ddf";
-  public static final String NUM_DUPES = "dupes";
-  public static final String SUMMARY = "summary";
-  public static final String FIELDS = "field";
-
-  /**
-   * Initializes servlet configuration default values.  Gets NutchBean and 
-   * ResponseWriters.
-   */
-  public void init(ServletConfig config)
-    throws ServletException {
-
-    // set sensible defaults for response writer values and cache NutchBean.
-    // Also get and cache all ResponseWriter implementations.
-    super.init(config);
-    try {
-      this.conf = NutchConfiguration.get(config.getServletContext());
-      this.defaultRespType = conf.get("search.response.default.type", "xml");
-      this.defaultLang = conf.get("search.response.default.lang");
-      this.defaultNumRows = conf.getInt("search.response.default.numrows", 10);
-      this.defaultDedupField = conf.get("search.response.default.dedupfield",
-        "site");
-      this.defaultNumDupes = conf.getInt("search.response.default.numdupes", 1);
-      bean = NutchBean.get(config.getServletContext(), this.conf);
-      writers = new ResponseWriters(conf);
-    }
-    catch (IOException e) {
-      throw new ServletException(e);
-    }
-  }
-
-  /**
-   * Forwards all responses to doGet.
-   */
-  protected void doPost(HttpServletRequest request, HttpServletResponse response)
-    throws ServletException, IOException {
-    doGet(request, response);
-  }
-
-  /**
-   * Handles all search requests.  Gets parameter input.  Does the search and 
-   * gets Hits, details, and summaries.  Passes off to ResponseWriter classes
-   * to writer different output formats directly to HttpServletResponse.
-   */
-  protected void doGet(HttpServletRequest request, HttpServletResponse response)
-    throws ServletException, IOException {
-
-    if (NutchBean.LOG.isInfoEnabled()) {
-      NutchBean.LOG.info("Query request from " + request.getRemoteAddr());
-    }
-
-    // get the response type, used to call the correct ResponseWriter
-    String respType = RequestUtils.getStringParameter(request, RESPONSE_TYPE,
-      defaultRespType);
-    ResponseWriter writer = writers.getResponseWriter(respType);
-    if (writer == null) {
-      throw new IOException("Unknown response type " + respType);
-    }
-
-    // get the query
-    String query = RequestUtils.getStringParameter(request, QUERY);
-    if (StringUtils.isBlank(query)) {
-      throw new IOException("Query cannot be empty!");
-    }
-    
-    // get the language from parameter, then request, then finally configuration
-    String lang = RequestUtils.getStringParameter(request, LANG);
-    if (StringUtils.isBlank(lang)) {
-      lang = request.getLocale().getLanguage();
-      if (StringUtils.isBlank(lang)) {
-        lang = defaultLang;
-      }
-    }
-
-    // get various other search parameters, fields allows only returning a 
-    // given set of fields
-    boolean withSummary = RequestUtils.getBooleanParameter(request, SUMMARY,
-      true);
-    String sort = RequestUtils.getStringParameter(request, SORT);
-    int start = RequestUtils.getIntegerParameter(request, START, 0);
-    int rows = RequestUtils.getIntegerParameter(request, ROWS, defaultNumRows);
-    boolean reverse = RequestUtils.getBooleanParameter(request, REVERSE, false);
-    String dedup = RequestUtils.getStringParameter(request, DEDUPE,
-      defaultDedupField);
-    int numDupes = RequestUtils.getIntegerParameter(request, NUM_DUPES,
-      defaultNumDupes);
-    String[] fields = request.getParameterValues(FIELDS);
-
-    // parse out the query
-    Query queryObj = Query.parse(query, lang, this.conf);
-    if (NutchBean.LOG.isInfoEnabled()) {
-      NutchBean.LOG.info("query: " + query);
-      NutchBean.LOG.info("lang: " + lang);
-    }
-
-    // search and return hits
-    Hits hits;
-    try {
-      hits = bean.search(queryObj, start + rows, numDupes, dedup, sort, reverse);
-    }
-    catch (IOException e) {
-      if (NutchBean.LOG.isWarnEnabled()) {
-        NutchBean.LOG.warn("Search Error", e);
-      }
-      hits = new Hits(0, new Hit[0]);
-    }
-
-    // get the total number of hits, the hits to show, and the hit details
-    long totalHits = hits.getTotal();
-    int end = (int)Math.min(hits.getLength(), start + rows);
-    int numHits = (end > start) ? (end - start) : 0;
-    Hit[] show = hits.getHits(start, numHits);
-    HitDetails[] details = bean.getDetails(show);
-
-    // setup the SearchResults object, used in response writing
-    SearchResults results = new SearchResults();
-    results.setResponseType(respType);
-    results.setQuery(query);
-    results.setLang(lang);
-    results.setSort(sort);
-    results.setReverse(reverse);
-    results.setStart(start);
-    results.setRows(rows);
-    results.setEnd(end);
-    results.setTotalHits(totalHits);
-    results.setHits(show);
-    results.setDetails(details);
-
-    // are we returning summaries with results, if not avoid network hit
-    if (withSummary) {
-      Summary[] summaries = bean.getSummary(details, queryObj);
-      results.setSummaries(summaries);
-      results.setWithSummary(true);
-    }
-    else {
-      results.setWithSummary(false);
-    }
-
-    // set return fields if any specified, if not all fields are returned
-    if (fields != null && fields.length > 0) {
-      results.setFields(fields);
-    }
-
-    // call the response writer to write out content to HttpResponse directly
-    writer.writeResponse(results, request, response);
-  }
-}
Index: src/java/org/apache/nutch/searcher/response/ResponseWriter.java
===================================================================
--- src/java/org/apache/nutch/searcher/response/ResponseWriter.java	(revision 959954)
+++ src/java/org/apache/nutch/searcher/response/ResponseWriter.java	(working copy)
@@ -1,59 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.searcher.response;
-
-import java.io.IOException;
-
-import javax.servlet.http.HttpServletRequest;
-import javax.servlet.http.HttpServletResponse;
-
-import org.apache.hadoop.conf.Configurable;
-import org.apache.nutch.plugin.Pluggable;
-
-/**
- * Nutch extension point which allow writing search results in many different
- * output formats.
- */
-public interface ResponseWriter
-  extends Pluggable, Configurable {
-
-  public final static String X_POINT_ID = ResponseWriter.class.getName();
-  
-  /**
-   * Sets the returned content MIME type.  Populated though variables set in
-   * the plugin.xml file of the ResponseWriter.  This allows easily changing
-   * output content types, for example for JSON from text/plain during tesing
-   * and debugging to application/json in production.
-   * 
-   * @param contentType The MIME content type to set.
-   */
-  public void setContentType(String contentType);
-
-  /**
-   * Writes out the search results response to the HttpServletResponse.
-   * 
-   * @param results The SearchResults object containing hits and other info.
-   * @param request The HttpServletRequest object.
-   * @param response The HttpServletResponse object.
-   * 
-   * @throws IOException If an error occurs while writing out the response.
-   */
-  public void writeResponse(SearchResults results, HttpServletRequest request,
-    HttpServletResponse response)
-    throws IOException;
-
-}
Index: src/java/org/apache/nutch/searcher/response/ResponseWriters.java
===================================================================
--- src/java/org/apache/nutch/searcher/response/ResponseWriters.java	(revision 959954)
+++ src/java/org/apache/nutch/searcher/response/ResponseWriters.java	(working copy)
@@ -1,106 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.searcher.response;
-
-import java.util.HashMap;
-import java.util.Map;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.plugin.Extension;
-import org.apache.nutch.plugin.ExtensionPoint;
-import org.apache.nutch.plugin.PluginRepository;
-import org.apache.nutch.plugin.PluginRuntimeException;
-import org.apache.nutch.util.ObjectCache;
-
-/**
- * Utility class for getting all ResponseWriter implementations and for
- * returning the correct ResponseWriter for a given request type.
- */
-public class ResponseWriters {
-
-  private Map<String, ResponseWriter> responseWriters;
-
-  /**
-   * Constructor that configures the cache of ResponseWriter objects.
-   * 
-   * @param conf The Nutch configuration object.
-   */
-  public ResponseWriters(Configuration conf) {
-
-    // get the cache and the cache key
-    String cacheKey = ResponseWriter.class.getName();
-    ObjectCache objectCache = ObjectCache.get(conf);
-    this.responseWriters = (Map<String, ResponseWriter>)objectCache.getObject(cacheKey);
-
-    // if already populated do nothing
-    if (this.responseWriters == null) {
-
-      try {
-
-        // get the extension point and all ResponseWriter extensions
-        ExtensionPoint point = PluginRepository.get(conf).getExtensionPoint(
-          ResponseWriter.X_POINT_ID);
-        if (point == null) {
-          throw new RuntimeException(ResponseWriter.X_POINT_ID + " not found.");
-        }
-
-        // populate content type on the ResponseWriter classes, each response
-        // writer can handle more than one response type
-        Extension[] extensions = point.getExtensions();
-        Map<String, ResponseWriter> writers = new HashMap<String, ResponseWriter>();
-        for (int i = 0; i < extensions.length; i++) {
-          Extension extension = extensions[i];
-          ResponseWriter writer = (ResponseWriter)extension.getExtensionInstance();
-          String[] responseTypes = extension.getAttribute("responseType").split(
-            ",");
-          String contentType = extension.getAttribute("contentType");
-          writer.setContentType(contentType);
-          for (int k = 0; k < responseTypes.length; k++) {
-            writers.put(responseTypes[k], writer);
-          }
-        }
-
-        // set null object if no writers, otherwise set the writers
-        if (writers == null) {
-          objectCache.setObject(cacheKey, new HashMap<String, ResponseWriter>());
-        }
-        else {
-          objectCache.setObject(cacheKey, writers);
-        }
-      }
-      catch (PluginRuntimeException e) {
-        throw new RuntimeException(e);
-      }
-
-      // set the response writers map
-      this.responseWriters = (Map<String, ResponseWriter>)objectCache.getObject(cacheKey);
-    }
-  }
-
-  /**
-   * Return the correct ResponseWriter object for the response type.
-   * 
-   * @param respType The response type, such as xml or json. Must correspond to
-   * the value set in the plugin.xml file for the ResponseWriter extension.
-   * 
-   * @return The ResponseWriter that handles that response type or null if no
-   * such object exists.
-   */
-  public ResponseWriter getResponseWriter(String respType) {
-    return responseWriters.get(respType);
-  }
-}
Index: src/java/org/apache/nutch/searcher/FieldQueryFilter.java
===================================================================
--- src/java/org/apache/nutch/searcher/FieldQueryFilter.java	(revision 959954)
+++ src/java/org/apache/nutch/searcher/FieldQueryFilter.java	(working copy)
@@ -1,114 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.searcher;
-
-import org.apache.lucene.search.BooleanClause;
-import org.apache.lucene.search.BooleanQuery;
-import org.apache.lucene.search.PhraseQuery;
-import org.apache.lucene.search.TermQuery;
-import org.apache.lucene.index.Term;
-
-import org.apache.nutch.analysis.CommonGrams;
-
-import org.apache.nutch.searcher.Query.Clause;
-import org.apache.nutch.searcher.Query.Phrase;
-import org.apache.hadoop.conf.Configuration;
-
-/** Translate query fields to search the same-named field, as indexed by an
- * IndexingFilter.  Best for tokenized fields. */
-public abstract class FieldQueryFilter implements QueryFilter {
-  private String field;
-  private float boost = 1.0f;
-  private Configuration conf;
-  private CommonGrams commonGrams;
-
-  /** Construct for the named field.*/
-  protected FieldQueryFilter(String field) {
-    this(field, 1.0f);
-  }
-
-  /** Construct for the named field, boosting as specified.*/
-  protected FieldQueryFilter(String field, float boost) {
-    this.field = field;
-    this.boost = boost;
-  }
-
-  public BooleanQuery filter(Query input, BooleanQuery output)
-    throws QueryException {
-    
-    // examine each clause in the Nutch query
-    Clause[] clauses = input.getClauses();
-    for (int i = 0; i < clauses.length; i++) {
-      Clause c = clauses[i];
-
-      // skip non-matching clauses
-      if (!c.getField().equals(field))
-        continue;
-
-      // optimize phrase clause
-      if (c.isPhrase()) {
-        String[] opt = this.commonGrams.optimizePhrase(c.getPhrase(), field);
-        if (opt.length==1) {
-          c = new Clause(new Query.Term(opt[0]),
-                         c.isRequired(), c.isProhibited(), getConf());
-        } else {
-          c = new Clause(new Phrase(opt), c.isRequired(), c.isProhibited(), getConf());
-        }
-      }
-
-      // construct appropriate Lucene clause
-      org.apache.lucene.search.Query luceneClause;
-      if (c.isPhrase()) {
-        Phrase nutchPhrase = c.getPhrase();
-        Query.Term[] terms = nutchPhrase.getTerms();
-        PhraseQuery lucenePhrase = new PhraseQuery();
-        for (int j = 0; j < terms.length; j++) {
-          lucenePhrase.add(new Term(field, terms[j].toString()));
-        }
-        luceneClause = lucenePhrase;
-      } else {
-        luceneClause = new TermQuery(new Term(field, c.getTerm().toString()));
-      }
-
-      // set boost
-      luceneClause.setBoost(boost);
-      // add it as specified in query
-      
-      output.add(luceneClause, 
-          (c.isProhibited()
-              ? BooleanClause.Occur.MUST_NOT
-              : (c.isRequired()
-                  ? BooleanClause.Occur.MUST
-                  : BooleanClause.Occur.SHOULD
-                 )
-           ));
-    }
-    
-    // return the modified Lucene query
-    return output;
-  }
-  
-  public void setConf(Configuration conf) {
-    this.conf = conf;
-    this.commonGrams = new CommonGrams(conf);
-  }
-
-  public Configuration getConf() {
-    return this.conf;
-  }
-}
Index: src/java/org/apache/nutch/searcher/QueryException.java
===================================================================
--- src/java/org/apache/nutch/searcher/QueryException.java	(revision 959954)
+++ src/java/org/apache/nutch/searcher/QueryException.java	(working copy)
@@ -1,25 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.searcher;
-
-@SuppressWarnings("serial")
-public class QueryException extends java.io.IOException {
-  public QueryException(String message) {
-    super(message);
-  }
-}
Index: src/java/org/apache/nutch/searcher/SummarizerFactory.java
===================================================================
--- src/java/org/apache/nutch/searcher/SummarizerFactory.java	(revision 959954)
+++ src/java/org/apache/nutch/searcher/SummarizerFactory.java	(working copy)
@@ -1,70 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.searcher;
-
-// Commons Logging imports
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-
-// Hadoop imports
-import org.apache.hadoop.conf.Configuration;
-
-// Nutch imports
-import org.apache.nutch.plugin.Extension;
-import org.apache.nutch.plugin.PluginRepository;
-
-
-/**
- * A factory for retrieving {@link Summarizer} extensions.
- * 
- * @author J&eacute;r&ocirc;me Charron
- */
-public class SummarizerFactory {
-
-  /** My logger */
-  public final static Log LOG = LogFactory.getLog(SummarizerFactory.class);
-
-  /** The first available {@link Summarizer} */
-  private Summarizer summarizer = null;
-  
-  
-  public SummarizerFactory(Configuration conf) {
-    try {
-      Extension[] extensions = PluginRepository
-                                    .get(conf)
-                                    .getExtensionPoint(Summarizer.X_POINT_ID)
-                                    .getExtensions();
-      summarizer = (Summarizer) extensions[0].getExtensionInstance();
-      if (LOG.isInfoEnabled()) {
-        LOG.info("Using the first summarizer extension found: " +
-                 extensions[0].getId());
-      }
-    } catch (Exception e) {
-      if (LOG.isWarnEnabled()) { LOG.warn(e.toString()); }
-    }
-  }
-
-  /**
-   * Get the first available {@link Summarizer} extension.
-   * @return the first available {@link Summarizer} extension, or
-   *         <code>null</code> if none available.
-   */
-  public Summarizer getSummarizer() {
-    return summarizer;
-  }
-
-} 
Index: src/java/org/apache/nutch/searcher/LuceneSearchBean.java
===================================================================
--- src/java/org/apache/nutch/searcher/LuceneSearchBean.java	(revision 959954)
+++ src/java/org/apache/nutch/searcher/LuceneSearchBean.java	(working copy)
@@ -1,129 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.searcher;
-
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.List;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileStatus;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.nutch.indexer.Indexer;
-import org.apache.nutch.util.HadoopFSUtil;
-
-public class LuceneSearchBean implements RPCSearchBean {
-
-  public static final long VERSION = 1L;
-
-  private IndexSearcher searcher;
-
-  private FileSystem fs;
-
-  private Configuration conf;
-
-  /**
-   * Construct in a named directory.
-   * @param conf
-   * @param dir
-   * @throws IOException
-   */
-  public LuceneSearchBean(Configuration conf, Path indexDir, Path indexesDir)
-  throws IOException {
-    this.conf = conf;
-    this.fs = FileSystem.get(this.conf);
-    init(indexDir, indexesDir);
-  }
-
-  private void init(Path indexDir, Path indexesDir)
-  throws IOException {
-    Path absIndexDir = indexDir.makeQualified(indexDir.getFileSystem(conf));
-    Path absIndexesDir = indexesDir.makeQualified(indexesDir.getFileSystem(conf));
-    if (this.fs.exists(indexDir)) {
-      LOG.info("opening merged index in " + absIndexDir.toUri());
-      this.searcher = new IndexSearcher(indexDir, this.conf);
-    } else {
-      if (!this.fs.exists(indexesDir)) {
-        // should throw exception ?
-        LOG.warn("Neither " + absIndexDir.toUri() + " nor " +
-                absIndexesDir.toUri() + " found!");
-      } else {
-        LOG.info("opening indexes in " + absIndexesDir.toUri());
-      }
-      List<Path> vDirs = new ArrayList<Path>();
-      FileStatus[] fstats = fs.listStatus(indexesDir, HadoopFSUtil.getPassDirectoriesFilter(fs));
-      Path[] directories = HadoopFSUtil.getPaths(fstats);
-      for(int i = 0; i < directories.length; i++) {
-        Path indexdone = new Path(directories[i], Indexer.DONE_NAME);
-        if(fs.isFile(indexdone)) {
-          vDirs.add(directories[i]);
-        }
-      }
-
-      directories = new Path[ vDirs.size() ];
-      for(int i = 0; vDirs.size()>0; i++) {
-        directories[i] = vDirs.remove(0);
-      }
-
-      this.searcher = new IndexSearcher(directories, this.conf);
-    }
-  }
-
-
-  @Override
-  @Deprecated
-  public Hits search(Query query, int numHits, String dedupField,
-                     String sortField, boolean reverse)
-  throws IOException {
-    query.setParams(new QueryParams(numHits, QueryParams.DEFAULT_MAX_HITS_PER_DUP, dedupField, sortField, reverse));
-    return searcher.search(query);
-  }
-  
-  @Override
-  public Hits search(Query query) throws IOException {
-    return searcher.search(query);
-  }
-
-  public String getExplanation(Query query, Hit hit) throws IOException {
-    return searcher.getExplanation(query, hit);
-  }
-
-  public HitDetails getDetails(Hit hit) throws IOException {
-    return searcher.getDetails(hit);
-  }
-
-  public HitDetails[] getDetails(Hit[] hits) throws IOException {
-    return searcher.getDetails(hits);
-  }
-
-  public boolean ping() throws IOException {
-    return true;
-  }
-
-  public void close() throws IOException {
-    if (searcher != null) { searcher.close(); }
-    if (fs != null) { fs.close(); }
-  }
-
-  public long getProtocolVersion(String protocol, long clientVersion)
-  throws IOException {
-    return VERSION;
-  }
-
-}
Index: src/java/org/apache/nutch/searcher/SegmentBean.java
===================================================================
--- src/java/org/apache/nutch/searcher/SegmentBean.java	(revision 959954)
+++ src/java/org/apache/nutch/searcher/SegmentBean.java	(working copy)
@@ -1,24 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.searcher;
-
-import java.io.IOException;
-
-public interface SegmentBean extends HitContent, HitSummarizer {
-
-  public String[] getSegmentNames() throws IOException;
-}
Index: src/java/org/apache/nutch/searcher/RawFieldQueryFilter.java
===================================================================
--- src/java/org/apache/nutch/searcher/RawFieldQueryFilter.java	(revision 959954)
+++ src/java/org/apache/nutch/searcher/RawFieldQueryFilter.java	(working copy)
@@ -1,97 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.searcher;
-
-import org.apache.lucene.search.BooleanClause;
-import org.apache.lucene.search.BooleanQuery;
-import org.apache.lucene.search.TermQuery;
-import org.apache.lucene.index.Term;
-
-import org.apache.nutch.searcher.Query.Clause;
-
-/** Translate raw query fields to search the same-named field, as indexed by an
- * IndexingFilter. */
-public abstract class RawFieldQueryFilter implements QueryFilter {
-  private String field;
-  private boolean lowerCase;
-  private float boost;
-
-  /** Construct for the named field, lowercasing query values.*/
-  protected RawFieldQueryFilter(String field) {
-    this(field, true);
-  }
-
-  /** Construct for the named field, lowercasing query values.*/
-  protected RawFieldQueryFilter(String field, float boost) {
-    this(field, true, boost);
-  }
-
-  /** Construct for the named field, potentially lowercasing query values.*/
-  protected RawFieldQueryFilter(String field, boolean lowerCase) {
-    this(field, lowerCase, 0.0f);
-  }
-
-  /** Construct for the named field, potentially lowercasing query values.*/
-  protected RawFieldQueryFilter(String field, boolean lowerCase, float boost) {
-    this.field = field;
-    this.lowerCase = lowerCase;
-    this.boost = boost;
-  }
-
-  protected void setBoost(float boost) {
-    this.boost = boost;
-  }
-  
-  public BooleanQuery filter(Query input, BooleanQuery output)
-    throws QueryException {
-    
-    // examine each clause in the Nutch query
-    Clause[] clauses = input.getClauses();
-    for (int i = 0; i < clauses.length; i++) {
-      Clause c = clauses[i];
-
-      // skip non-matching clauses
-      if (!c.getField().equals(field))
-        continue;
-
-      // get the field value from the clause
-      // raw fields are guaranteed to be Terms, not Phrases
-      String value = c.getTerm().toString();
-      if (lowerCase)
-        value = value.toLowerCase();
-
-      // add a Lucene TermQuery for this clause
-      TermQuery clause = new TermQuery(new Term(field, value));
-      // set boost
-      clause.setBoost(boost);
-      // add it as specified in query
-      
-      output.add(clause, 
-          (c.isProhibited()
-              ? BooleanClause.Occur.MUST_NOT
-              : (c.isRequired()
-                  ? BooleanClause.Occur.MUST
-                  : BooleanClause.Occur.SHOULD
-                 )
-           ));
-    }
-    
-    // return the modified Lucene query
-    return output;
-  }
-}
Index: src/java/org/apache/nutch/searcher/HitDetails.java
===================================================================
--- src/java/org/apache/nutch/searcher/HitDetails.java	(revision 959954)
+++ src/java/org/apache/nutch/searcher/HitDetails.java	(working copy)
@@ -1,136 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.searcher;
-
-import java.io.DataInput;
-import java.io.DataOutput;
-import java.io.IOException;
-import java.util.ArrayList;
-
-import org.apache.hadoop.io.*;
-import org.apache.nutch.html.Entities;
-
-/** Data stored in the index for a hit.
- *
- * <p>Represented as a list of name/value pairs.
- */
-public final class HitDetails implements Writable {
-
-  private int length;
-  private String[] fields;
-  private String[] values;
-
-  public HitDetails() {}
-
-  /** Construct from field names and values arrays. */
-  public HitDetails(String[] fields, String[] values) {
-    this.length = fields.length;
-    this.fields = fields;
-    this.values = values;
-  }
-
-  /** Construct minimal details from a segment name and document number. */
-  public HitDetails(String segment, String url) {
-    this(new String[2], new String[2]);
-    this.fields[0] = "segment";
-    this.values[0] = segment;
-    this.fields[1] = "url";
-    this.values[1] = url;
-  }
-
-  /** Returns the number of fields contained in this. */
-  public int getLength() { return length; }
-
-  /** Returns the name of the <code>i</code><sup>th</sup> field. */
-  public String getField(int i) { return fields[i]; }
-
-  /** Returns the value of the <code>i</code><sup>th</sup> field. */
-  public String getValue(int i) { return values[i]; }
-  
-  /** Returns the value of the first field with the specified name. */
-  public String getValue(String field) {
-    for (int i = 0; i < length; i++) {
-      if (fields[i].equals(field))
-        return values[i];
-    }
-    return null;
-  }
-
-  /** Returns all the values with the specified name. */
-  public String[] getValues(String field) {
-   ArrayList<String> vals = new ArrayList<String>();
-   for (int i=0; i<length; i++) {
-     if (fields[i].equals(field)) {
-       vals.add(values[i]);
-     }
-   }
-   return (vals.size() > 0)
-            ? vals.toArray(new String[vals.size()])
-            : null;
-}
-
-  // javadoc from Writable
-  public void write(DataOutput out) throws IOException {
-    out.writeInt(length);
-    for (int i = 0; i < length; i++) {
-      out.writeUTF(fields[i]);
-      out.writeUTF(values[i]);
-    }
-  }
-  
-  /** Constructs, reads and returns an instance. */
-  public static HitDetails read(DataInput in) throws IOException {
-    HitDetails result = new HitDetails();
-    result.readFields(in);
-    return result;
-  }
-
-  // javadoc from Writable
-  public void readFields(DataInput in) throws IOException {
-    length = in.readInt();
-    fields = new String[length];
-    values = new String[length];
-    for (int i = 0; i < length; i++) {
-      fields[i] = in.readUTF();
-      values[i] = in.readUTF();
-    }
-  }
-
-  /** Display as a string. */
-  public String toString() {
-    return getValue("segment") + "/" + getValue("url");
-  }
-
-  /** Display as HTML. */
-  public String toHtml() {
-    StringBuffer buffer = new StringBuffer();
-    buffer.append("<ul>\n");
-    for (int i = 0; i < length; i++) {
-      buffer.append("<li>");
-      buffer.append(fields[i]);
-      buffer.append(" = ");
-      buffer.append(Entities.encode(values[i]));
-      buffer.append("</li>\n");
-    }
-    buffer.append("</ul>\n");
-    return buffer.toString();
-  }
-  
-
-
-}
Index: src/java/org/apache/nutch/searcher/LinkDbInlinks.java
===================================================================
--- src/java/org/apache/nutch/searcher/LinkDbInlinks.java	(revision 959954)
+++ src/java/org/apache/nutch/searcher/LinkDbInlinks.java	(working copy)
@@ -1,60 +0,0 @@
-/*
-* Licensed to the Apache Software Foundation (ASF) under one or more
-* contributor license agreements.  See the NOTICE file distributed with
-* this work for additional information regarding copyright ownership.
-* The ASF licenses this file to You under the Apache License, Version 2.0
-* (the "License"); you may not use this file except in compliance with
-* the License.  You may obtain a copy of the License at
-*
-*     http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-/*
- * Created on Nov 23, 2005
- * Author: Andrzej Bialecki &lt;ab@getopt.org&gt;
- *
- */
-package org.apache.nutch.searcher;
-
-import java.io.IOException;
-
-import org.apache.nutch.crawl.Inlinks;
-import org.apache.nutch.crawl.LinkDbReader;
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.Path;
-
-public class LinkDbInlinks implements HitInlinks {
-  private static final Log LOG = LogFactory.getLog(LinkDbInlinks.class);
-  
-  private LinkDbReader linkdb = null;
-  
-  public LinkDbInlinks(FileSystem fs, Path dir, Configuration conf) {
-    try {
-      linkdb = new LinkDbReader(conf, dir);
-    } catch (Exception e) {
-      LOG.warn("Could not create LinkDbReader: " + e);
-    }
-  }
-
-  public String[] getAnchors(HitDetails details) throws IOException {
-    return linkdb.getAnchors(new Text(details.getValue("url")));
-  }
-
-  public Inlinks getInlinks(HitDetails details) throws IOException {
-    return linkdb.getInlinks(new Text(details.getValue("url")));
-  }
-
-  public void close() throws IOException {
-    if (linkdb != null) { linkdb.close(); }
-  }
-
-}
Index: src/java/org/apache/nutch/searcher/SolrSearchBean.java
===================================================================
--- src/java/org/apache/nutch/searcher/SolrSearchBean.java	(revision 959954)
+++ src/java/org/apache/nutch/searcher/SolrSearchBean.java	(working copy)
@@ -1,282 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.searcher;
-
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.FloatWritable;
-import org.apache.hadoop.io.IntWritable;
-import org.apache.hadoop.io.LongWritable;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.io.WritableComparable;
-import org.apache.lucene.index.Term;
-import org.apache.lucene.search.BooleanClause;
-import org.apache.lucene.search.BooleanQuery;
-import org.apache.lucene.search.TermQuery;
-import org.apache.lucene.util.ToStringUtils;
-import org.apache.nutch.indexer.solr.SolrMappingReader;
-import org.apache.nutch.indexer.solr.SolrWriter;
-import org.apache.solr.client.solrj.SolrQuery;
-import org.apache.solr.client.solrj.SolrServer;
-import org.apache.solr.client.solrj.SolrServerException;
-import org.apache.solr.client.solrj.SolrQuery.ORDER;
-import org.apache.solr.client.solrj.impl.CommonsHttpSolrServer;
-import org.apache.solr.client.solrj.response.QueryResponse;
-import org.apache.solr.common.SolrDocument;
-import org.apache.solr.common.SolrDocumentList;
-
-public class SolrSearchBean implements SearchBean {
-
-  public static final Log LOG = LogFactory.getLog(SolrSearchBean.class);
-
-  private final SolrServer solr;
-
-  private final QueryFilters filters;
-  
-  private String searchUID;
-
-  public SolrSearchBean(Configuration conf, String solrServer)
-  throws IOException {
-    solr = new CommonsHttpSolrServer(solrServer);
-    filters = new QueryFilters(conf);
-    SolrMappingReader mapping = SolrMappingReader.getInstance(conf);
-    searchUID = mapping.getUniqueKey();
-  }
-
-  public String getExplanation(Query query, Hit hit) throws IOException {
-    return "SOLR backend does not support explanations yet.";
-  }
-
-  
-  public Hits search(Query query) throws IOException {
-    // filter query string
-    final BooleanQuery bQuery = filters.filter(query);
-
-    final SolrQuery solrQuery = new SolrQuery(stringify(bQuery));
-
-    solrQuery.setRows(query.getParams().getNumHits());
-
-    if (query.getParams().getSortField() == null) {
-      solrQuery.setFields(query.getParams().getDedupField(), "score", searchUID);
-      query.getParams().setSortField("score");
-    } else {
-      solrQuery.setFields(query.getParams().getDedupField(), query
-          .getParams().getSortField(), searchUID);
-      solrQuery.setSortField(query.getParams().getSortField(), query
-          .getParams().isReverse() ? ORDER.asc : ORDER.desc);
-    }
-
-    QueryResponse response;
-    try {
-      response = solr.query(solrQuery);
-    } catch (final SolrServerException e) {
-      throw SolrWriter.makeIOException(e);
-    }
-
-    final SolrDocumentList docList = response.getResults();
-
-    final Hit[] hitArr = new Hit[docList.size()];
-    for (int i = 0; i < hitArr.length; i++) {
-      final SolrDocument solrDoc = docList.get(i);
-
-      final Object raw = solrDoc.getFirstValue(query.getParams().getSortField());
-      WritableComparable sortValue;
-
-      if (raw instanceof Integer) {
-        sortValue = new IntWritable(((Integer)raw).intValue());
-      } else if (raw instanceof Float) {
-        sortValue = new FloatWritable(((Float)raw).floatValue());
-      } else if (raw instanceof String) {
-        sortValue = new Text((String)raw);
-      } else if (raw instanceof Long) {
-        sortValue = new LongWritable(((Long)raw).longValue());
-      } else {
-        throw new RuntimeException("Unknown sort value type!");
-      }
-
-      final String dedupValue = (String) solrDoc.getFirstValue(query.getParams().getDedupField());
-
-      final String uniqueKey = (String )solrDoc.getFirstValue(searchUID);
-
-      hitArr[i] = new Hit(uniqueKey, sortValue, dedupValue);
-    }
-
-    return new Hits(docList.getNumFound(), hitArr);
-  }
-
-  @SuppressWarnings("unchecked")
-  @Deprecated
-  public Hits search(Query query, int numHits, String dedupField,
-                     String sortField, boolean reverse)
-  throws IOException {
-    query.getParams().setNumHits(numHits); 
-    query.getParams().setDedupField(dedupField); 
-    query.getParams().setSortField(sortField); 
-    query.getParams().setReverse(reverse);
-    return search(query);
-  }
-
-  public HitDetails getDetails(Hit hit) throws IOException {
-    QueryResponse response;
-    try {
-      response = solr.query(new SolrQuery(searchUID + ":\"" + hit.getUniqueKey() + "\""));
-    } catch (final SolrServerException e) {
-      throw SolrWriter.makeIOException(e);
-    }
-
-    final SolrDocumentList docList = response.getResults();
-    if (docList.getNumFound() == 0) {
-      return null;
-    }
-
-    return buildDetails(docList.get(0));
-  }
-
-  public HitDetails[] getDetails(Hit[] hits) throws IOException {
-    final StringBuilder buf = new StringBuilder();
-    buf.append("(");
-    for (final Hit hit : hits) {
-      buf.append(" " + searchUID + ":\"");
-      buf.append(hit.getUniqueKey());
-      buf.append("\"");
-    }
-    buf.append(")");
-
-    QueryResponse response;
-    try {
-      response = solr.query(new SolrQuery(buf.toString()));
-    } catch (final SolrServerException e) {
-      throw SolrWriter.makeIOException(e);
-    }
-
-    final SolrDocumentList docList = response.getResults();
-    if (docList.size() < hits.length) {
-      throw new RuntimeException("Missing hit details! Found: " +
-                                 docList.size() + ", expecting: " +
-                                 hits.length);
-    }
-
-    /* Response returned from SOLR server may be out of
-     * order. So we make sure that nth element of HitDetails[]
-     * is the detail of nth hit.
-     */
-    final Map<String, HitDetails> detailsMap =
-      new HashMap<String, HitDetails>(hits.length);
-    for (final SolrDocument solrDoc : docList) {
-      final HitDetails details = buildDetails(solrDoc);
-      detailsMap.put(details.getValue(searchUID), details);
-    }
-
-    final HitDetails[] detailsArr = new HitDetails[hits.length];
-    for (int i = 0; i < hits.length; i++) {
-      detailsArr[i] = detailsMap.get(hits[i].getUniqueKey());
-    }
-
-    return detailsArr;
-  }
-
-  public boolean ping() throws IOException {
-    try {
-      return solr.ping().getStatus() == 0;
-    } catch (final SolrServerException e) {
-      throw SolrWriter.makeIOException(e);
-    }
-  }
-
-  public void close() throws IOException { }
-
-  private static HitDetails buildDetails(SolrDocument solrDoc) {
-    final List<String> fieldList = new ArrayList<String>();
-    final List<String> valueList = new ArrayList<String>();
-    for (final String field : solrDoc.getFieldNames()) {
-      for (final Object o : solrDoc.getFieldValues(field)) {
-        fieldList.add(field);
-        valueList.add(o.toString());
-      }
-    }
-
-    final String[] fields = fieldList.toArray(new String[fieldList.size()]);
-    final String[] values = valueList.toArray(new String[valueList.size()]);
-    return new HitDetails(fields, values);
-  }
-
-  /* Hackish solution for stringifying queries. Code from BooleanQuery.
-   * This is necessary because a BooleanQuery.toString produces
-   * statements like feed:http://www.google.com which doesn't work, we
-   * need feed:"http://www.google.com".
-   */
-  private static String stringify(BooleanQuery bQuery) {
-    final StringBuilder buffer = new StringBuilder();
-    final boolean needParens=(bQuery.getBoost() != 1.0) ||
-                       (bQuery.getMinimumNumberShouldMatch()>0) ;
-    if (needParens) {
-      buffer.append("(");
-    }
-
-    final BooleanClause[] clauses  = bQuery.getClauses();
-    int i = 0;
-    for (final BooleanClause c : clauses) {
-      if (c.isProhibited())
-        buffer.append("-");
-      else if (c.isRequired())
-        buffer.append("+");
-
-      final org.apache.lucene.search.Query subQuery = c.getQuery();
-      if (subQuery instanceof BooleanQuery) {   // wrap sub-bools in parens
-        buffer.append("(");
-        buffer.append(c.getQuery().toString(""));
-        buffer.append(")");
-      } else if (subQuery instanceof TermQuery) {
-        final Term term = ((TermQuery) subQuery).getTerm();
-        buffer.append(term.field());
-        buffer.append(":\"");
-        buffer.append(term.text());
-        buffer.append("\"");
-      } else {
-        buffer.append(" ");
-        buffer.append(c.getQuery().toString());
-      }
-
-      if (i++ != clauses.length - 1) {
-        buffer.append(" ");
-      }
-    }
-
-    if (needParens) {
-      buffer.append(")");
-    }
-
-    if (bQuery.getMinimumNumberShouldMatch()>0) {
-      buffer.append('~');
-      buffer.append(bQuery.getMinimumNumberShouldMatch());
-    }
-
-    if (bQuery.getBoost() != 1.0f) {
-      buffer.append(ToStringUtils.boost(bQuery.getBoost()));
-    }
-
-    return buffer.toString();
-  }
-
-}
Index: src/java/org/apache/nutch/searcher/SearchBean.java
===================================================================
--- src/java/org/apache/nutch/searcher/SearchBean.java	(revision 959954)
+++ src/java/org/apache/nutch/searcher/SearchBean.java	(working copy)
@@ -1,28 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.searcher;
-
-import java.io.IOException;
-
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-
-public interface SearchBean extends Searcher, HitDetailer {
-  public static final Log LOG = LogFactory.getLog(SearchBean.class);
-
-  public boolean ping() throws IOException ;
-}
Index: src/java/org/apache/nutch/searcher/DistributedSegmentBean.java
===================================================================
--- src/java/org/apache/nutch/searcher/DistributedSegmentBean.java	(revision 959954)
+++ src/java/org/apache/nutch/searcher/DistributedSegmentBean.java	(working copy)
@@ -1,230 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.searcher;
-
-import java.io.IOException;
-import java.net.InetSocketAddress;
-import java.util.ArrayList;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Map;
-import java.util.concurrent.Callable;
-import java.util.concurrent.ConcurrentHashMap;
-import java.util.concurrent.ConcurrentMap;
-import java.util.concurrent.ExecutorService;
-import java.util.concurrent.Executors;
-import java.util.concurrent.Future;
-import java.util.concurrent.ScheduledExecutorService;
-import java.util.concurrent.TimeUnit;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.ipc.RPC;
-import org.apache.nutch.parse.ParseData;
-import org.apache.nutch.parse.ParseText;
-
-public class DistributedSegmentBean implements SegmentBean {
-
-  private static final ExecutorService executor =
-    Executors.newCachedThreadPool();
-
-  private final ScheduledExecutorService pingService;
-
-  private class DistSummmaryTask implements Callable<Summary[]> {
-    private int id;
-
-    private HitDetails[] details;
-    private Query query;
-
-    public DistSummmaryTask(int id) {
-      this.id = id;
-    }
-
-    public Summary[] call() throws Exception {
-      if (details == null) {
-        return null;
-      }
-      return beans[id].getSummary(details, query);
-    }
-
-    public void setSummaryArgs(HitDetails[] details, Query query) {
-      this.details = details;
-      this.query = query;
-    }
-
-  }
-
-  private class SegmentWorker implements Runnable {
-    private int id;
-
-    public SegmentWorker(int id) {
-      this.id = id;
-    }
-
-    public void run()  {
-      try {
-        String[] segments = beans[id].getSegmentNames();
-        for (String segment : segments) {
-          segmentMap.put(segment, id);
-        }
-      } catch (IOException e) {
-        // remove all segments this bean was serving
-        Iterator<Map.Entry<String, Integer>> i =
-          segmentMap.entrySet().iterator();
-        while (i.hasNext()) {
-          Map.Entry<String, Integer> entry = i.next();
-          int curId = entry.getValue();
-          if (curId == this.id) {
-            i.remove();
-          }
-        }
-      }
-    }
-  }
-
-  private long timeout;
-
-  private SegmentBean[] beans;
-
-  private ConcurrentMap<String, Integer> segmentMap;
-
-  private List<Callable<Summary[]>> summaryTasks;
-
-  private List<SegmentWorker> segmentWorkers;
-
-  public DistributedSegmentBean(Configuration conf, Path serversConfig)
-  throws IOException {
-    this.timeout = conf.getLong("ipc.client.timeout", 60000);
-
-    List<SegmentBean> beanList = new ArrayList<SegmentBean>();
-
-    List<InetSocketAddress> segmentServers =
-        NutchBean.readAddresses(serversConfig, conf);
-
-    for (InetSocketAddress addr : segmentServers) {
-      SegmentBean bean = (RPCSegmentBean) RPC.getProxy(RPCSegmentBean.class,
-          FetchedSegments.VERSION, addr, conf);
-      beanList.add(bean);
-    }
-
-    beans = beanList.toArray(new SegmentBean[beanList.size()]);
-
-    summaryTasks = new ArrayList<Callable<Summary[]>>(beans.length);
-    segmentWorkers = new ArrayList<SegmentWorker>(beans.length);
-
-    for (int i = 0; i < beans.length; i++) {
-      summaryTasks.add(new DistSummmaryTask(i));
-      segmentWorkers.add(new SegmentWorker(i));
-    }
-
-    segmentMap = new ConcurrentHashMap<String, Integer>();
-
-    pingService = Executors.newScheduledThreadPool(beans.length);
-    for (SegmentWorker worker : segmentWorkers) {
-      pingService.scheduleAtFixedRate(worker, 0, 30, TimeUnit.SECONDS);
-    }
-  }
-
-  private SegmentBean getBean(HitDetails details) {
-    return beans[segmentMap.get(details.getValue("segment"))];
-  }
-
-  public String[] getSegmentNames() {
-    return segmentMap.keySet().toArray(new String[segmentMap.size()]);
-  }
-
-  public byte[] getContent(HitDetails details) throws IOException {
-    return getBean(details).getContent(details);
-  }
-
-  public long getFetchDate(HitDetails details) throws IOException {
-    return getBean(details).getFetchDate(details);
-  }
-
-  public ParseData getParseData(HitDetails details) throws IOException {
-    return getBean(details).getParseData(details);
-  }
-
-  public ParseText getParseText(HitDetails details) throws IOException {
-    return getBean(details).getParseText(details);
-  }
-
-  public void close() throws IOException {
-    executor.shutdown();
-    pingService.shutdown();
-    for (SegmentBean bean : beans) {
-      bean.close();
-    }
-  }
-
-  public Summary getSummary(HitDetails details, Query query)
-  throws IOException {
-    return getBean(details).getSummary(details, query);
-  }
-
-  @SuppressWarnings("unchecked")
-  public Summary[] getSummary(HitDetails[] detailsArr, Query query)
-  throws IOException {
-    List<HitDetails>[] detailsList = new ArrayList[summaryTasks.size()];
-    for (int i = 0; i < detailsList.length; i++) {
-      detailsList[i] = new ArrayList<HitDetails>();
-    }
-    for (HitDetails details : detailsArr) {
-      detailsList[segmentMap.get(details.getValue("segment"))].add(details);
-    }
-    for (int i = 0; i < summaryTasks.size(); i++) {
-      DistSummmaryTask task = (DistSummmaryTask)summaryTasks.get(i);
-      if (detailsList[i].size() > 0) {
-        HitDetails[] taskDetails =
-          detailsList[i].toArray(new HitDetails[detailsList[i].size()]);
-        task.setSummaryArgs(taskDetails, query);
-      } else {
-        task.setSummaryArgs(null, null);
-      }
-    }
-
-    List<Future<Summary[]>> summaries;
-    try {
-       summaries =
-         executor.invokeAll(summaryTasks, timeout, TimeUnit.MILLISECONDS);
-    } catch (InterruptedException e) {
-      throw new RuntimeException(e);
-    }
-
-    List<Summary> summaryList = new ArrayList<Summary>();
-    for (Future<Summary[]> f : summaries) {
-      Summary[] summaryArray;
-      try {
-         summaryArray = f.get();
-         if (summaryArray == null) {
-           continue;
-         }
-         for (Summary summary : summaryArray) {
-           summaryList.add(summary);
-         }
-      } catch (Exception e) {
-        if (e.getCause() instanceof IOException) {
-          throw (IOException) e.getCause();
-        }
-        throw new RuntimeException(e);
-      }
-    }
-
-    return summaryList.toArray(new Summary[summaryList.size()]);
-  }
-
-}
Index: src/java/org/apache/nutch/searcher/Searcher.java
===================================================================
--- src/java/org/apache/nutch/searcher/Searcher.java	(revision 959954)
+++ src/java/org/apache/nutch/searcher/Searcher.java	(working copy)
@@ -1,43 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.searcher;
-
-import java.io.IOException;
-
-import org.apache.hadoop.io.Closeable;
-
-/** Service that searches. */
-public interface Searcher extends Closeable {
-  /**
-   * Return the top-scoring hits for a query.
-   * 
-   * @deprecated since 1.1, use {@link #search(Query)} instead.
-   * */
-  Hits search(Query query, int numHits, String dedupField, String sortField,
-      boolean reverse) throws IOException;
-
-  /**
-   * Return the top-scoring hits for a query.
-   * 
-   * @since 1.1
-   */
-  Hits search(Query query) throws IOException;
-
-  /** Return an HTML-formatted explanation of how a query scored. */
-  String getExplanation(Query query, Hit hit) throws IOException;
-}
Index: src/java/org/apache/nutch/searcher/OpenSearchServlet.java
===================================================================
--- src/java/org/apache/nutch/searcher/OpenSearchServlet.java	(revision 959954)
+++ src/java/org/apache/nutch/searcher/OpenSearchServlet.java	(working copy)
@@ -1,335 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.searcher;
-
-import java.io.IOException;
-import java.net.URLEncoder;
-import java.util.Map;
-import java.util.HashMap;
-import java.util.Set;
-import java.util.HashSet;
-
-import javax.servlet.ServletException;
-import javax.servlet.ServletConfig;
-import javax.servlet.http.HttpServlet;
-import javax.servlet.http.HttpServletRequest;
-import javax.servlet.http.HttpServletResponse;
-
-import javax.xml.parsers.*;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.util.NutchConfiguration;
-import org.w3c.dom.*;
-import javax.xml.transform.TransformerFactory;
-import javax.xml.transform.Transformer;
-import javax.xml.transform.dom.DOMSource;
-import javax.xml.transform.stream.StreamResult;
-
-
-/** Present search results using A9's OpenSearch extensions to RSS, plus a few
- * Nutch-specific extensions. */   
-@SuppressWarnings("serial")
-public class OpenSearchServlet extends HttpServlet {
-  private static final Map<String, String> NS_MAP =
-	  new HashMap<String, String>();
-  private int MAX_HITS_PER_PAGE;
-
-  static {
-    NS_MAP.put("opensearch", "http://a9.com/-/spec/opensearchrss/1.0/");
-    NS_MAP.put("nutch", "http://www.nutch.org/opensearchrss/1.0/");
-  }
-
-  private static final Set<String> SKIP_DETAILS = new HashSet<String>();
-  static {
-    SKIP_DETAILS.add("url");                   // redundant with RSS link
-    SKIP_DETAILS.add("title");                 // redundant with RSS title
-  }
-
-  private NutchBean bean;
-  private Configuration conf;
-
-  public void init(ServletConfig config) throws ServletException {
-    try {
-      this.conf = NutchConfiguration.get(config.getServletContext());
-      bean = NutchBean.get(config.getServletContext(), this.conf);
-    } catch (IOException e) {
-      throw new ServletException(e);
-    }
-    MAX_HITS_PER_PAGE = conf.getInt("searcher.max.hits.per.page", -1);
-  }
-
-  public void doGet(HttpServletRequest request, HttpServletResponse response)
-    throws ServletException, IOException {
-
-    if (NutchBean.LOG.isInfoEnabled()) {
-      NutchBean.LOG.info("query request from " + request.getRemoteAddr());
-    }
-
-    // get parameters from request
-    request.setCharacterEncoding("UTF-8");
-    String queryString = request.getParameter("query");
-    if (queryString == null)
-      queryString = "";
-    String urlQuery = URLEncoder.encode(queryString, "UTF-8");
-    
-    // the query language
-    String queryLang = request.getParameter("lang");
-    
-    int start = 0;                                // first hit to display
-    String startString = request.getParameter("start");
-    if (startString != null)
-      start = Integer.parseInt(startString);
-    
-    int hitsPerPage = 10;                         // number of hits to display
-    String hitsString = request.getParameter("hitsPerPage");
-    if (hitsString != null)
-      hitsPerPage = Integer.parseInt(hitsString);
-    if(MAX_HITS_PER_PAGE > 0 && hitsPerPage > MAX_HITS_PER_PAGE)
-      hitsPerPage = MAX_HITS_PER_PAGE;
-
-    String sort = request.getParameter("sort");
-    boolean reverse =
-      sort!=null && "true".equals(request.getParameter("reverse"));
-
-    // De-Duplicate handling.  Look for duplicates field and for how many
-    // duplicates per results to return. Default duplicates field is 'site'
-    // and duplicates per results default is '2'.
-    String dedupField = request.getParameter("dedupField");
-    if (dedupField == null || dedupField.length() == 0) {
-        dedupField = "site";
-    }
-    int hitsPerDup = 2;
-    String hitsPerDupString = request.getParameter("hitsPerDup");
-    if (hitsPerDupString != null && hitsPerDupString.length() > 0) {
-        hitsPerDup = Integer.parseInt(hitsPerDupString);
-    } else {
-        // If 'hitsPerSite' present, use that value.
-        String hitsPerSiteString = request.getParameter("hitsPerSite");
-        if (hitsPerSiteString != null && hitsPerSiteString.length() > 0) {
-            hitsPerDup = Integer.parseInt(hitsPerSiteString);
-        }
-    }
-     
-    // Make up query string for use later drawing the 'rss' logo.
-    String params = "&hitsPerPage=" + hitsPerPage +
-        (queryLang == null ? "" : "&lang=" + queryLang) +
-        (sort == null ? "" : "&sort=" + sort + (reverse? "&reverse=true": "") +
-        (dedupField == null ? "" : "&dedupField=" + dedupField));
-
-    Query query = Query.parse(queryString, queryLang, this.conf);
-    if (NutchBean.LOG.isInfoEnabled()) {
-      NutchBean.LOG.info("query: " + queryString);
-      NutchBean.LOG.info("lang: " + queryLang);
-    }
-
-    // execute the query
-    Hits hits;
-    try {
-      hits = bean.search(query, start + hitsPerPage, hitsPerDup, dedupField,
-          sort, reverse);
-    } catch (IOException e) {
-      if (NutchBean.LOG.isWarnEnabled()) {
-        NutchBean.LOG.warn("Search Error", e);
-      }
-      hits = new Hits(0,new Hit[0]);	
-    }
-
-    if (NutchBean.LOG.isInfoEnabled()) {
-      NutchBean.LOG.info("total hits: " + hits.getTotal());
-    }
-
-    // generate xml results
-    int end = (int)Math.min(hits.getLength(), start + hitsPerPage);
-    int length = end-start;
-
-    Hit[] show = hits.getHits(start, end-start);
-    HitDetails[] details = bean.getDetails(show);
-    Summary[] summaries = bean.getSummary(details, query);
-
-    String requestUrl = request.getRequestURL().toString();
-    String base = requestUrl.substring(0, requestUrl.lastIndexOf('/'));
-      
-
-    try {
-      DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
-      factory.setNamespaceAware(true);
-      Document doc = factory.newDocumentBuilder().newDocument();
- 
-      Element rss = addNode(doc, doc, "rss");
-      addAttribute(doc, rss, "version", "2.0");
-      addAttribute(doc, rss, "xmlns:opensearch",
-                   NS_MAP.get("opensearch"));
-      addAttribute(doc, rss, "xmlns:nutch", NS_MAP.get("nutch"));
-
-      Element channel = addNode(doc, rss, "channel");
-    
-      addNode(doc, channel, "title", "Nutch: " + queryString);
-      addNode(doc, channel, "description", "Nutch search results for query: "
-              + queryString);
-      addNode(doc, channel, "link",
-              base+"/search.jsp"
-              +"?query="+urlQuery
-              +"&start="+start
-              +"&hitsPerDup="+hitsPerDup
-              +params);
-
-      addNode(doc, channel, "opensearch", "totalResults", ""+hits.getTotal());
-      addNode(doc, channel, "opensearch", "startIndex", ""+start);
-      addNode(doc, channel, "opensearch", "itemsPerPage", ""+hitsPerPage);
-
-      addNode(doc, channel, "nutch", "query", queryString);
-    
-
-      if ((hits.totalIsExact() && end < hits.getTotal()) // more hits to show
-          || (!hits.totalIsExact() && (hits.getLength() > start+hitsPerPage))){
-        addNode(doc, channel, "nutch", "nextPage", requestUrl
-                +"?query="+urlQuery
-                +"&start="+end
-                +"&hitsPerDup="+hitsPerDup
-                +params);
-      }
-
-      if ((!hits.totalIsExact() && (hits.getLength() <= start+hitsPerPage))) {
-        addNode(doc, channel, "nutch", "showAllHits", requestUrl
-                +"?query="+urlQuery
-                +"&hitsPerDup="+0
-                +params);
-      }
-
-      for (int i = 0; i < length; i++) {
-        Hit hit = show[i];
-        HitDetails detail = details[i];
-        String title = detail.getValue("title");
-        String url = detail.getValue("url");
-        String id = "idx=" + hit.getIndexNo() + "&id=" + hit.getUniqueKey();
-      
-        if (title == null || title.equals("")) {   // use url for docs w/o title
-          title = url;
-        }
-        
-        Element item = addNode(doc, channel, "item");
-
-        addNode(doc, item, "title", title);
-        if (summaries[i] != null) {
-          addNode(doc, item, "description", summaries[i].toHtml(false));
-        }
-        addNode(doc, item, "link", url);
-
-        addNode(doc, item, "nutch", "site", hit.getDedupValue());
-
-        addNode(doc, item, "nutch", "cache", base+"/cached.jsp?"+id);
-        addNode(doc, item, "nutch", "explain", base+"/explain.jsp?"+id
-                +"&query="+urlQuery+"&lang="+queryLang);
-
-        if (hit.moreFromDupExcluded()) {
-          addNode(doc, item, "nutch", "moreFromSite", requestUrl
-                  +"?query="
-                  +URLEncoder.encode("site:"+hit.getDedupValue()
-                                     +" "+queryString, "UTF-8")
-                  +"&hitsPerSite="+0
-                  +params);
-        }
-
-        for (int j = 0; j < detail.getLength(); j++) { // add all from detail
-          String field = detail.getField(j);
-          if (!SKIP_DETAILS.contains(field))
-            addNode(doc, item, "nutch", field, detail.getValue(j));
-        }
-      }
-
-      // dump DOM tree
-
-      DOMSource source = new DOMSource(doc);
-      TransformerFactory transFactory = TransformerFactory.newInstance();
-      Transformer transformer = transFactory.newTransformer();
-      transformer.setOutputProperty("indent", "yes");
-      StreamResult result = new StreamResult(response.getOutputStream());
-      response.setContentType("text/xml");
-      transformer.transform(source, result);
-
-    } catch (javax.xml.parsers.ParserConfigurationException e) {
-      throw new ServletException(e);
-    } catch (javax.xml.transform.TransformerException e) {
-      throw new ServletException(e);
-    }
-      
-  }
-
-  private static Element addNode(Document doc, Node parent, String name) {
-    Element child = doc.createElement(name);
-    parent.appendChild(child);
-    return child;
-  }
-
-  private static void addNode(Document doc, Node parent,
-                              String name, String text) {
-    Element child = doc.createElement(name);
-    child.appendChild(doc.createTextNode(getLegalXml(text)));
-    parent.appendChild(child);
-  }
-
-  private static void addNode(Document doc, Node parent,
-                              String ns, String name, String text) {
-    Element child = doc.createElementNS(NS_MAP.get(ns), ns+":"+name);
-    child.appendChild(doc.createTextNode(getLegalXml(text)));
-    parent.appendChild(child);
-  }
-
-  private static void addAttribute(Document doc, Element node,
-                                   String name, String value) {
-    Attr attribute = doc.createAttribute(name);
-    attribute.setValue(getLegalXml(value));
-    node.getAttributes().setNamedItem(attribute);
-  }
-
-  /*
-   * Ensure string is legal xml.
-   * @param text String to verify.
-   * @return Passed <code>text</code> or a new string with illegal
-   * characters removed if any found in <code>text</code>.
-   * @see http://www.w3.org/TR/2000/REC-xml-20001006#NT-Char
-   */
-  protected static String getLegalXml(final String text) {
-      if (text == null) {
-          return null;
-      }
-      StringBuffer buffer = null;
-      for (int i = 0; i < text.length(); i++) {
-        char c = text.charAt(i);
-        if (!isLegalXml(c)) {
-	  if (buffer == null) {
-              // Start up a buffer.  Copy characters here from now on
-              // now we've found at least one bad character in original.
-	      buffer = new StringBuffer(text.length());
-              buffer.append(text.substring(0, i));
-          }
-        } else {
-           if (buffer != null) {
-             buffer.append(c);
-           }
-        }
-      }
-      return (buffer != null)? buffer.toString(): text;
-  }
- 
-  private static boolean isLegalXml(final char c) {
-    return c == 0x9 || c == 0xa || c == 0xd || (c >= 0x20 && c <= 0xd7ff)
-        || (c >= 0xe000 && c <= 0xfffd) || (c >= 0x10000 && c <= 0x10ffff);
-  }
-
-}
Index: src/java/org/apache/nutch/searcher/QueryFilters.java
===================================================================
--- src/java/org/apache/nutch/searcher/QueryFilters.java	(revision 959954)
+++ src/java/org/apache/nutch/searcher/QueryFilters.java	(working copy)
@@ -1,124 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.searcher;
-
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-
-import org.apache.nutch.plugin.*;
-import org.apache.nutch.searcher.Query.Clause;
-import org.apache.nutch.util.ObjectCache;
-import org.apache.hadoop.conf.Configuration;
-
-import java.util.*;
-
-import org.apache.lucene.search.BooleanQuery;
-
-/** Creates and caches {@link QueryFilter} implementing plugins.  QueryFilter
- * implementations should define either the "fields" or "raw-fields" attributes
- * for any fields that they process, otherwise these will be ignored by the
- * query parser.  Raw fields are parsed as a single Query.Term, including
- * internal punctuation, while non-raw fields are parsed containing punctuation
- * are parsed as multi-token Query.Phrase's.
- */
-public class QueryFilters {
-  private static final Log LOG = LogFactory.getLog(QueryFilters.class);
-
-  private QueryFilter[] queryFilters;
-  private HashSet<String> FIELD_NAMES ;
-  private HashSet<String> RAW_FIELD_NAMES;
-
-  private static List<String> parseFieldNames(Extension extension,
-                                           String attribute) {
-    String fields = extension.getAttribute(attribute);
-    if (fields == null) fields = "";
-    return Arrays.asList(fields.split("[,\\s]"));
-  }
-
-  public QueryFilters(Configuration conf) {
-    ObjectCache objectCache = ObjectCache.get(conf);
-    this.queryFilters = (QueryFilter[]) objectCache.getObject(QueryFilter.class
-        .getName());
-    if (this.queryFilters == null) {
-      try {
-        ExtensionPoint point = PluginRepository.get(conf)
-            .getExtensionPoint(QueryFilter.X_POINT_ID);
-        if (point == null)
-          throw new RuntimeException(QueryFilter.X_POINT_ID + " not found.");
-        Extension[] extensions = point.getExtensions();
-        FIELD_NAMES = new HashSet<String>();
-        RAW_FIELD_NAMES = new HashSet<String>();
-        QueryFilter[] filters = new QueryFilter[extensions.length];
-        for (int i = 0; i < extensions.length; i++) {
-          Extension extension = extensions[i];
-          List<String> fieldNames = parseFieldNames(extension, "fields");
-          List<String> rawFieldNames =
-            parseFieldNames(extension, "raw-fields");
-          if (fieldNames.size() == 0 && rawFieldNames.size() == 0) {
-            if (LOG.isWarnEnabled()) {
-              LOG.warn("QueryFilter: " + extension.getId()
-                     + " names no fields.");
-            }
-            continue;
-          }
-          filters[i] = (QueryFilter) extension.getExtensionInstance();
-          FIELD_NAMES.addAll(fieldNames);
-          FIELD_NAMES.addAll(rawFieldNames);
-          objectCache.setObject("FIELD_NAMES", FIELD_NAMES);
-          RAW_FIELD_NAMES.addAll(rawFieldNames);
-          objectCache.setObject("RAW_FIELD_NAMES", RAW_FIELD_NAMES);
-        }
-        objectCache.setObject(QueryFilter.class.getName(), filters);
-      } catch (PluginRuntimeException e) {
-        throw new RuntimeException(e);
-      }
-      this.queryFilters = (QueryFilter[]) objectCache.getObject(QueryFilter.class
-          .getName());
-    } else {
-      // cache already filled
-      FIELD_NAMES = (HashSet<String>) objectCache.getObject("FIELD_NAMES");
-      RAW_FIELD_NAMES = (HashSet<String>) objectCache.getObject("RAW_FIELD_NAMES");
-    }
-  }              
-
-  /** Run all defined filters. */
-  public BooleanQuery filter(Query input) throws QueryException {
-    // first check that all field names are claimed by some plugin
-    Clause[] clauses = input.getClauses();
-    for (int i = 0; i < clauses.length; i++) {
-      Clause c = clauses[i];
-      if (!isField(c.getField()))
-        throw new QueryException("Not a known field name:"+c.getField());
-    }
-
-    // then run each plugin
-    BooleanQuery output = new BooleanQuery();
-    for (int i = 0; i < this.queryFilters.length; i++) {
-      output = this.queryFilters[i].filter(input, output);
-    }
-    return output;
-  }
-
-  public boolean isField(String name) {
-    return FIELD_NAMES.contains(name);
-  }
-  
-  public boolean isRawField(String name) {
-    return RAW_FIELD_NAMES.contains(name);
-  }
-}
Index: src/java/org/apache/nutch/searcher/HitDetailer.java
===================================================================
--- src/java/org/apache/nutch/searcher/HitDetailer.java	(revision 959954)
+++ src/java/org/apache/nutch/searcher/HitDetailer.java	(working copy)
@@ -1,30 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.searcher;
-
-import java.io.IOException;
-
-/** Service that returns details of a hit within an index. */
-public interface HitDetailer {
-  /** Returns the details for a hit document. */
-  HitDetails getDetails(Hit hit) throws IOException;
-  
-  /** Returns the details for a set of hits.  Hook for parallel IPC calls. */
-  HitDetails[] getDetails(Hit[] hits) throws IOException;
-
-}
Index: src/java/org/apache/nutch/searcher/RPCSearchBean.java
===================================================================
--- src/java/org/apache/nutch/searcher/RPCSearchBean.java	(revision 959954)
+++ src/java/org/apache/nutch/searcher/RPCSearchBean.java	(working copy)
@@ -1,23 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.searcher;
-
-import org.apache.hadoop.ipc.VersionedProtocol;
-
-public interface RPCSearchBean extends SearchBean, VersionedProtocol {
-
-}
Index: src/java/org/apache/nutch/searcher/HitSummarizer.java
===================================================================
--- src/java/org/apache/nutch/searcher/HitSummarizer.java	(revision 959954)
+++ src/java/org/apache/nutch/searcher/HitSummarizer.java	(working copy)
@@ -1,40 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.searcher;
-
-import java.io.IOException;
-
-/** Service that builds a summary for a hit on a query. */
-public interface HitSummarizer {
-  
-  /**
-   * Returns a summary for the given hit details.
-   *
-   * @param details the details of the hit to be summarized
-   * @param query  indicates what should be higlighted in the summary text
-   */
-  Summary getSummary(HitDetails details, Query query) throws IOException;
-
-  /**
-   * Returns summaries for a set of details.  Hook for parallel IPC calls.
-   *
-   * @param details the details of hits to be summarized
-   * @param query  indicates what should be higlighted in the summary text
-   */
-  Summary[] getSummary(HitDetails[] details, Query query) throws IOException;
-}
Index: src/java/org/apache/nutch/searcher/NutchBean.java
===================================================================
--- src/java/org/apache/nutch/searcher/NutchBean.java	(revision 959954)
+++ src/java/org/apache/nutch/searcher/NutchBean.java	(working copy)
@@ -1,471 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.searcher;
-
-import java.io.*;
-import java.net.InetSocketAddress;
-import java.util.*;
-
-import javax.servlet.*;
-
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-
-import org.apache.hadoop.fs.*;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.conf.*;
-import org.apache.hadoop.util.StringUtils;
-import org.apache.nutch.parse.*;
-import org.apache.nutch.crawl.Inlinks;
-import org.apache.nutch.util.NutchConfiguration;
-
-/**
- * One stop shopping for search-related functionality.
- * @version $Id$
- */
-public class NutchBean
-implements SearchBean, RPCSearchBean, SegmentBean, RPCSegmentBean, 
-HitInlinks, Closeable {
-
-  public static final Log LOG = LogFactory.getLog(NutchBean.class);
-  public static final String KEY = "nutchBean";
-
-//  static {
-//    LogFormatter.setShowThreadIDs(true);
-//  }
-
-  private SearchBean searchBean;
-  private SegmentBean segmentBean;
-  private final HitInlinks linkDb;
-
-  /** BooleanQuery won't permit more than 32 required/prohibited clauses.  We
-   * don't want to use too many of those. */
-  private static final int MAX_PROHIBITED_TERMS = 20;
-
-  private final Configuration conf;
-
-  private final FileSystem fs;
-
-  /** Returns the cached instance in the servlet context.
-   * @see NutchBeanConstructor*/
-  public static NutchBean get(ServletContext app, Configuration conf) throws IOException {
-    final NutchBean bean = (NutchBean)app.getAttribute(KEY);
-    return bean;
-  }
-
-
-  /**
-   *
-   * @param conf
-   * @throws IOException
-   */
-  public NutchBean(Configuration conf) throws IOException {
-    this(conf, null);
-  }
-
-  /**
-   * Construct in a named directory.
-   *
-   * @param conf
-   * @param dir
-   * @throws IOException
-   */
-  public NutchBean(Configuration conf, Path dir) throws IOException {
-    this.conf = conf;
-    this.fs = FileSystem.get(this.conf);
-    if (dir == null) {
-      dir = new Path(this.conf.get("searcher.dir", "crawl"));
-    }
-    final Path luceneConfig = new Path(dir, "search-servers.txt");
-    final Path solrConfig = new Path(dir, "solr-servers.txt");
-    final Path segmentConfig = new Path(dir, "segment-servers.txt");
-
-    if (fs.exists(luceneConfig) || fs.exists(solrConfig)) {
-      searchBean = new DistributedSearchBean(conf, luceneConfig, solrConfig);
-    } else {
-      final Path indexDir = new Path(dir, "index");
-      final Path indexesDir = new Path(dir, "indexes");
-      searchBean = new LuceneSearchBean(conf, indexDir, indexesDir);
-    }
-
-    if (fs.exists(segmentConfig)) {
-      segmentBean = new DistributedSegmentBean(conf, segmentConfig);
-    } else if (fs.exists(luceneConfig)) {
-      segmentBean = new DistributedSegmentBean(conf, luceneConfig);
-    } else {
-      segmentBean = new FetchedSegments(conf, new Path(dir, "segments"));
-    }
-
-    linkDb = new LinkDbInlinks(fs, new Path(dir, "linkdb"), conf);
-  }
-
-  public static List<InetSocketAddress> readAddresses(Path path,
-      Configuration conf) throws IOException {
-    final List<InetSocketAddress> addrs = new ArrayList<InetSocketAddress>();
-    for (final String line : readConfig(path, conf)) {
-      final StringTokenizer tokens = new StringTokenizer(line);
-      if (tokens.hasMoreTokens()) {
-        final String host = tokens.nextToken();
-        if (tokens.hasMoreTokens()) {
-          final String port = tokens.nextToken();
-          addrs.add(new InetSocketAddress(host, Integer.parseInt(port)));
-        }
-      }
-    }
-    return addrs;
-  }
-
-  public static List<String> readConfig(Path path, Configuration conf)
-  throws IOException {
-    final FileSystem fs = FileSystem.get(conf);
-    final BufferedReader reader =
-      new BufferedReader(new InputStreamReader(fs.open(path)));
-    try {
-      final ArrayList<String> addrs = new ArrayList<String>();
-      String line;
-      while ((line = reader.readLine()) != null) {
-        addrs.add(line);
-      }
-      return addrs;
-    } finally {
-      reader.close();
-    }
-  }
-
-  public String[] getSegmentNames() throws IOException {
-    return segmentBean.getSegmentNames();
-  }
-
-  /**
-   * @deprecated since 1.1, use {@link #search(Query)} instead
-   */
-  public Hits search(Query query, int numHits) throws IOException {
-    return search(query, numHits, null, null, false);
-  }
-
-  /**
-   * @deprecated since 1.1, use {@link #search(Query)} instead
-   */
-  public Hits search(Query query, int numHits,
-                     String dedupField, String sortField, boolean reverse)
-    throws IOException {
-
-    query.getParams().initFrom(numHits, QueryParams.DEFAULT_MAX_HITS_PER_DUP, dedupField, sortField, reverse);
-    return search(query);
-  }
-  
-  @Override
-  public Hits search(Query query) throws IOException {
-    if (query.getParams().getMaxHitsPerDup() <= 0)                      // disable dup checking
-      return searchBean.search(query);
-
-    final float rawHitsFactor = this.conf.getFloat("searcher.hostgrouping.rawhits.factor", 2.0f);
-    int numHitsRaw = (int)(query.getParams().getNumHits() * rawHitsFactor);
-    if (LOG.isInfoEnabled()) {
-      LOG.info("searching for "+numHitsRaw+" raw hits");
-    }
-    Hits hits = searchBean.search(query);
-    final long total = hits.getTotal();
-    final Map<String, DupHits> dupToHits = new HashMap<String, DupHits>();
-    final List<Hit> resultList = new ArrayList<Hit>();
-    final Set<Hit> seen = new HashSet<Hit>();
-    final List<String> excludedValues = new ArrayList<String>();
-    boolean totalIsExact = true;
-    for (int rawHitNum = 0; rawHitNum < hits.getTotal(); rawHitNum++) {
-      // get the next raw hit
-      if (rawHitNum >= hits.getLength()) {
-        // optimize query by prohibiting more matches on some excluded values
-        final Query optQuery = (Query)query.clone();
-        for (int i = 0; i < excludedValues.size(); i++) {
-          if (i == MAX_PROHIBITED_TERMS)
-            break;
-          optQuery.addProhibitedTerm(excludedValues.get(i),
-                                     query.getParams().getDedupField());
-        }
-        numHitsRaw = (int)(numHitsRaw * rawHitsFactor);
-        if (LOG.isInfoEnabled()) {
-          LOG.info("re-searching for "+numHitsRaw+" raw hits, query: "+optQuery);
-        }
-        hits = searchBean.search(optQuery);
-        if (LOG.isInfoEnabled()) {
-          LOG.info("found "+hits.getTotal()+" raw hits");
-        }
-        rawHitNum = -1;
-        continue;
-      }
-
-      final Hit hit = hits.getHit(rawHitNum);
-      if (seen.contains(hit))
-        continue;
-      seen.add(hit);
-
-      // get dup hits for its value
-      final String value = hit.getDedupValue();
-      DupHits dupHits = dupToHits.get(value);
-      if (dupHits == null)
-        dupToHits.put(value, dupHits = new DupHits());
-
-      // does this hit exceed maxHitsPerDup?
-      if (dupHits.size() == query.getParams().getMaxHitsPerDup()) {      // yes -- ignore the hit
-        if (!dupHits.maxSizeExceeded) {
-
-          // mark prior hits with moreFromDupExcluded
-          for (int i = 0; i < dupHits.size(); i++) {
-            dupHits.get(i).setMoreFromDupExcluded(true);
-          }
-          dupHits.maxSizeExceeded = true;
-
-          excludedValues.add(value);              // exclude dup
-        }
-        totalIsExact = false;
-      } else {                                    // no -- collect the hit
-        resultList.add(hit);
-        dupHits.add(hit);
-
-        // are we done?
-        // we need to find one more than asked for, so that we can tell if
-        // there are more hits to be shown
-        if (resultList.size() > query.getParams().getNumHits())
-          break;
-      }
-    }
-
-    final Hits results =
-      new Hits(total,
-               resultList.toArray(new Hit[resultList.size()]));
-    results.setTotalIsExact(totalIsExact);
-    return results;
-  }
-
-  @SuppressWarnings("serial")
-  private class DupHits extends ArrayList<Hit> {
-    private boolean maxSizeExceeded;
-  }
-
-  /** Search for pages matching a query, eliminating excessive hits from the
-   * same site.  Hits after the first <code>maxHitsPerDup</code> from the same
-   * site are removed from results.  The remaining hits have {@link
-   * Hit#moreFromDupExcluded()} set.  <p> If maxHitsPerDup is zero then all
-   * hits are returned.
-   *
-   * @param query query
-   * @param numHits number of requested hits
-   * @param maxHitsPerDup the maximum hits returned with matching values, or zero
-   * @return Hits the matching hits
-   * @throws IOException
-   * @deprecated since 1.1, use {@link #search(Query)} instead
-   * 
-   */
-  public Hits search(Query query, int numHits, int maxHitsPerDup)
-       throws IOException {
-    return search(query, numHits, maxHitsPerDup, "site", null, false);
-  }
-
-  /** Search for pages matching a query, eliminating excessive hits with
-   * matching values for a named field.  Hits after the first
-   * <code>maxHitsPerDup</code> are removed from results.  The remaining hits
-   * have {@link Hit#moreFromDupExcluded()} set.  <p> If maxHitsPerDup is zero
-   * then all hits are returned.
-   *
-   * @param query query
-   * @param numHits number of requested hits
-   * @param maxHitsPerDup the maximum hits returned with matching values, or zero
-   * @param dedupField field name to check for duplicates
-   * @return Hits the matching hits
-   * @throws IOException
-   * @deprecated since 1.1, use {@link #search(Query)} instead
-   */
-  public Hits search(Query query, int numHits,
-                     int maxHitsPerDup, String dedupField)
-       throws IOException {
-    return search(query, numHits, maxHitsPerDup, dedupField, null, false);
-  }
-  /** Search for pages matching a query, eliminating excessive hits with
-   * matching values for a named field.  Hits after the first
-   * <code>maxHitsPerDup</code> are removed from results.  The remaining hits
-   * have {@link Hit#moreFromDupExcluded()} set.  <p> If maxHitsPerDup is zero
-   * then all hits are returned.
-   *
-   * @param query query
-   * @param numHits number of requested hits
-   * @param maxHitsPerDup the maximum hits returned with matching values, or zero
-   * @param dedupField field name to check for duplicates
-   * @param sortField Field to sort on (or null if no sorting).
-   * @param reverse True if we are to reverse sort by <code>sortField</code>.
-   * @return Hits the matching hits
-   * @throws IOException
-   * @deprecated since 1.1, use {@link #search(Query)} instead
-   */
-  public Hits search(Query query, int numHits,
-                     int maxHitsPerDup, String dedupField,
-                     String sortField, boolean reverse)
-       throws IOException {
-    query.setParams(new QueryParams(numHits, maxHitsPerDup, dedupField, sortField, reverse));
-    return search(query);
-  }
-
-
-  public String getExplanation(Query query, Hit hit) throws IOException {
-    return searchBean.getExplanation(query, hit);
-  }
-
-  public HitDetails getDetails(Hit hit) throws IOException {
-    return searchBean.getDetails(hit);
-  }
-
-  public HitDetails[] getDetails(Hit[] hits) throws IOException {
-    return searchBean.getDetails(hits);
-  }
-
-  public Summary getSummary(HitDetails hit, Query query) throws IOException {
-    return segmentBean.getSummary(hit, query);
-  }
-
-  public Summary[] getSummary(HitDetails[] hits, Query query)
-    throws IOException {
-    return segmentBean.getSummary(hits, query);
-  }
-
-  public byte[] getContent(HitDetails hit) throws IOException {
-    return segmentBean.getContent(hit);
-  }
-
-  public ParseData getParseData(HitDetails hit) throws IOException {
-    return segmentBean.getParseData(hit);
-  }
-
-  public ParseText getParseText(HitDetails hit) throws IOException {
-    return segmentBean.getParseText(hit);
-  }
-
-  public String[] getAnchors(HitDetails hit) throws IOException {
-    return linkDb.getAnchors(hit);
-  }
-
-  public Inlinks getInlinks(HitDetails hit) throws IOException {
-    return linkDb.getInlinks(hit);
-  }
-
-  public long getFetchDate(HitDetails hit) throws IOException {
-    return segmentBean.getFetchDate(hit);
-  }
-
-  public void close() throws IOException {
-    if (searchBean != null) { searchBean.close(); }
-    if (segmentBean != null) { segmentBean.close(); }
-    if (linkDb != null) { linkDb.close(); }
-    if (fs != null) { fs.close(); }
-  }
-
-  public boolean ping() {
-    return true;
-  }
-
-  /** For debugging. */
-  public static void main(String[] args) throws Exception {
-    final String usage = "NutchBean query [<searcher.dir>]";
-
-    if (args.length == 0) {
-      System.err.println(usage);
-      System.exit(-1);
-    }
-
-    final Configuration conf = NutchConfiguration.create();
-    if (args.length > 1) {
-      conf.set("searcher.dir", args[1]);
-    }
-    final NutchBean bean = new NutchBean(conf);
-    try {
-      final Query query = Query.parse(args[0], conf);
-      query.getParams().setMaxHitsPerDup(0);
-      final Hits hits = bean.search(query);
-      System.out.println("Total hits: " + hits.getTotal());
-      final int length = (int)Math.min(hits.getLength(), 10);
-      final Hit[] show = hits.getHits(0, length);
-      final HitDetails[] details = bean.getDetails(show);
-      final Summary[] summaries = bean.getSummary(details, query);
-
-      for (int i = 0; i < hits.getLength(); i++) {
-        System.out.println(" " + i + " " + details[i] + "\n" + summaries[i]);
-      }
-    } catch (Throwable t) {
-       LOG.error("Exception occured while executing search: " + t, t);
-       System.exit(1);
-    }
-    System.exit(0);
-  }
-
-  public long getProtocolVersion(String className, long clientVersion)
-  throws IOException {
-    if(RPCSearchBean.class.getName().equals(className) &&
-       searchBean instanceof RPCSearchBean) {
-
-      final RPCSearchBean rpcBean = (RPCSearchBean)searchBean;
-      return rpcBean.getProtocolVersion(className, clientVersion);
-    } else if (RPCSegmentBean.class.getName().equals(className) &&
-               segmentBean instanceof RPCSegmentBean) {
-
-      final RPCSegmentBean rpcBean = (RPCSegmentBean)segmentBean;
-      return rpcBean.getProtocolVersion(className, clientVersion);
-    } else {
-      throw new IOException("Unknown Protocol classname:" + className);
-    }
-  }
-
-  /** Responsible for constructing a NutchBean singleton instance and
-   *  caching it in the servlet context. This class should be registered in
-   *  the deployment descriptor as a listener
-   */
-  public static class NutchBeanConstructor implements ServletContextListener {
-
-    public void contextDestroyed(ServletContextEvent sce) {
-      final ServletContext context = sce.getServletContext();
-
-      LOG.info("Closing Bean");
-      try {
-        Object tmp = context.getAttribute(NutchBean.KEY);
-
-        if (tmp instanceof NutchBean) {
-          NutchBean bean = (NutchBean) tmp;
-          bean.close();
-        } else {
-          LOG.warn("No bean configured, or the wrong type?  Potential PermGen leak, or startup problem.");
-        }
-      }
-      catch (final IOException ex) {
-        LOG.error(StringUtils.stringifyException(ex));
-      }
-    }
-
-    public void contextInitialized(ServletContextEvent sce) {
-      final ServletContext app = sce.getServletContext();
-      final Configuration conf = NutchConfiguration.get(app);
-
-      LOG.info("creating new bean");
-      NutchBean bean = null;
-      try {
-        bean = new NutchBean(conf);
-        app.setAttribute(KEY, bean);
-      }
-      catch (final IOException ex) {
-        LOG.error(StringUtils.stringifyException(ex));
-      }
-    }
-  }
-
-}
Index: src/java/org/apache/nutch/searcher/Hits.java
===================================================================
--- src/java/org/apache/nutch/searcher/Hits.java	(revision 959954)
+++ src/java/org/apache/nutch/searcher/Hits.java	(working copy)
@@ -1,112 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.searcher;
-
-import java.io.DataInput;
-import java.io.DataOutput;
-import java.io.IOException;
-
-import org.apache.hadoop.io.Writable;
-import org.apache.hadoop.io.WritableComparable;
-import org.apache.hadoop.io.Text;
-
-/** A set of hits matching a query. */
-public final class Hits implements Writable {
-
-  private long total;
-  private boolean totalIsExact = true;
-  private Hit[] top;
-
-  public Hits() {}
-
-  public Hits(long total, Hit[] top) {
-    this.total = total;
-    this.top = top;
-  }
-
-  /** Returns the total number of hits for this query.  This may be an estimate
-   * when (@link #totalIsExact()} is false. */
-  public long getTotal() { return total; }
-
-  /** True if {@link #getTotal()} gives the exact number of hits, or false if
-   * it is only an estimate of the total number of hits. */
-  public boolean totalIsExact() { return totalIsExact; }
-
-  /** Set {@link #totalIsExact()}. */
-  public void setTotalIsExact(boolean isExact) { totalIsExact = isExact; }
-
-  /** Returns the number of hits included in this current listing. */
-  public int getLength() { return top.length; }
-
-  /** Returns the <code>i</code><sup>th</sup> hit in this list. */
-  public Hit getHit(int i) { return top[i]; }
-
-  /** Returns a subset of the hit objects. */
-  public Hit[] getHits(int start, int length) {
-    Hit[] results = new Hit[length];
-    for (int i = 0; i < length; i++) {
-      results[i] = top[start+i];
-    }
-    return results;
-  }
-
-  public void write(DataOutput out) throws IOException {
-    out.writeLong(total);                         // write total hits
-    out.writeInt(top.length);                     // write hits returned
-    if (top.length > 0)                           // write sort value class
-      Text.writeString(out, top[0].getSortValue().getClass().getName());
-                      
-    for (int i = 0; i < top.length; i++) {
-      Hit h = top[i];
-      Text.writeString(out, h.getUniqueKey());    // write uniqueKey
-      h.getSortValue().write(out);                // write sortValue
-      Text.writeString(out, h.getDedupValue());   // write dedupValue
-    }
-  }
-
-  @SuppressWarnings("unchecked")
-  public void readFields(DataInput in) throws IOException {
-    total = in.readLong();                        // read total hits
-    top = new Hit[in.readInt()];                  // read hits returned
-    Class sortClass = null;
-    if (top.length > 0) {                         // read sort value class
-      try {
-        sortClass = Class.forName(Text.readString(in));
-      } catch (ClassNotFoundException e) {
-        throw new IOException(e.toString());
-      }
-    }
-
-    for (int i = 0; i < top.length; i++) {
-      String uniqueKey = Text.readString(in);            // read uniqueKey
-
-      WritableComparable sortValue = null;
-      try {
-        sortValue = (WritableComparable)sortClass.newInstance();
-      } catch (Exception e) {
-        throw new IOException(e.toString());
-      }
-      sortValue.readFields(in);                   // read sortValue
-
-      String dedupValue = Text.readString(in);    // read dedupValue
-
-      top[i] = new Hit(uniqueKey, sortValue, dedupValue);
-    }
-  }
-
-}
Index: src/java/org/apache/nutch/searcher/DistributedSearch.java
===================================================================
--- src/java/org/apache/nutch/searcher/DistributedSearch.java	(revision 959954)
+++ src/java/org/apache/nutch/searcher/DistributedSearch.java	(working copy)
@@ -1,108 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.searcher;
-
-import java.io.IOException;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.ipc.RPC;
-import org.apache.nutch.util.NutchConfiguration;
-
-/** Search/summary servers. */
-public class DistributedSearch {
-
-  private DistributedSearch() {}                  // no public ctor
-
-  /** Runs a search/summary server. */
-  public static class Server {
-    public static void main(String[] args) throws Exception {
-      final String usage = "DistributedSearch$Server <port> <crawl dir>";
-
-      if (args.length == 0 || args.length > 2) {
-        System.err.println(usage);
-        System.exit(-1);
-      }
-
-      final int port = Integer.parseInt(args[0]);
-      final Path directory = new Path(args[1]);
-
-      final Configuration conf = NutchConfiguration.create();
-
-      final org.apache.hadoop.ipc.Server server =
-        getServer(conf, directory, port);
-      server.start();
-      server.join();
-    }
-
-    static org.apache.hadoop.ipc.Server getServer(Configuration conf,
-        Path directory, int port) throws IOException{
-      final NutchBean bean = new NutchBean(conf, directory);
-      final int numHandlers = conf.getInt("searcher.num.handlers", 10);
-      return RPC.getServer(bean, "0.0.0.0", port, numHandlers, true, conf);
-    }
-
-  }
-
-  public static class IndexServer {
-    /** Runs a lucene search server. */
-    public static void main(String[] args) throws Exception {
-      final String usage = "DistributedSearch$IndexServer <port> <crawl dir>";
-      if (args.length == 0 || args.length > 2) {
-        System.err.println(usage);
-        System.exit(-1);
-      }
-
-      final int port = Integer.parseInt(args[0]);
-      final Path dir = new Path(args[1]);
-
-      final Configuration conf = NutchConfiguration.create();
-
-      final LuceneSearchBean bean = new LuceneSearchBean(conf,
-          new Path(dir, "index"), new Path(dir, "indexes"));
-      final org.apache.hadoop.ipc.RPC.Server server =
-        RPC.getServer(bean, "0.0.0.0", port, 10, false, conf);
-      server.start();
-      server.join();
-    }
-  }
-
-  public static class SegmentServer {
-    /** Runs a summary server. */
-    public static void main(String[] args) throws Exception {
-      final String usage =
-        "DistributedSearch$SegmentServer <port> <crawl dir>";
-      if (args.length < 2) {
-        System.err.println(usage);
-        System.exit(1);
-      }
-
-      final Configuration conf = NutchConfiguration.create();
-      final int port = Integer.parseInt(args[0]);
-      final Path segmentsDir = new Path(args[1], "segments");
-
-      final FetchedSegments segments = new FetchedSegments(conf, segmentsDir);
-
-      final org.apache.hadoop.ipc.RPC.Server server =
-        RPC.getServer(segments, "0.0.0.0", port, conf);
-
-      server.start();
-      server.join();
-    }
-  }
-}
Index: src/java/org/apache/nutch/searcher/FetchedSegments.java
===================================================================
--- src/java/org/apache/nutch/searcher/FetchedSegments.java	(revision 959954)
+++ src/java/org/apache/nutch/searcher/FetchedSegments.java	(working copy)
@@ -1,331 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.searcher;
-
-import java.io.IOException;
-
-import java.util.ArrayList;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Map;
-import java.util.concurrent.Callable;
-import java.util.concurrent.ConcurrentHashMap;
-import java.util.concurrent.ConcurrentMap;
-import java.util.concurrent.ExecutorService;
-import java.util.concurrent.Executors;
-import java.util.concurrent.Future;
-
-import org.apache.commons.lang.StringUtils;
-import org.apache.hadoop.io.*;
-import org.apache.hadoop.fs.*;
-import org.apache.nutch.protocol.*;
-import org.apache.nutch.parse.*;
-import org.apache.nutch.util.HadoopFSUtil;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.mapred.*;
-import org.apache.hadoop.mapred.lib.*;
-import org.apache.nutch.crawl.*;
-
-/** Implements {@link HitSummarizer} and {@link HitContent} for a set of
- * fetched segments. */
-public class FetchedSegments implements RPCSegmentBean {
-
-  public static final long VERSION = 1L;
-
-  private static final ExecutorService executor =
-    Executors.newCachedThreadPool();
-
-  private class SummaryTask implements Callable<Summary> {
-    private final HitDetails details;
-    private final Query query;
-
-    public SummaryTask(HitDetails details, Query query) {
-      this.details = details;
-      this.query = query;
-    }
-
-    public Summary call() throws Exception {
-      return getSummary(details, query);
-    }
-  }
-
-  private class SegmentUpdater extends Thread {
-
-    private volatile boolean stopRequested = false;
-
-    @Override
-    public void interrupt() {
-      super.interrupt();
-      stopRequested = true;
-    }
-
-
-    @Override
-    public void run() {
-
-      while (!stopRequested && !Thread.currentThread().isInterrupted()) {
-        try {
-          final FileStatus[] fstats = fs.listStatus(segmentsDir,
-              HadoopFSUtil.getPassDirectoriesFilter(fs));
-          final Path[] segmentDirs = HadoopFSUtil.getPaths(fstats);
-          final Iterator<Map.Entry<String, Segment>> i =
-            segments.entrySet().iterator();
-          while (i.hasNext()) {
-            final Map.Entry<String, Segment> entry = i.next();
-            final Segment seg = entry.getValue();
-            if (!fs.exists(seg.segmentDir)) {
-              try {
-                seg.close();
-              } catch (final Exception e) {
-                /* A segment may fail to close
-                 * since it may already be deleted from
-                 * file system. So we just ignore the
-                 * exception and remove the mapping from
-                 * 'segments'.
-                 */
-              } finally {
-                i.remove();
-              }
-            }
-          }
-
-          if (segmentDirs != null) {
-            for (final Path segmentDir : segmentDirs) {
-              segments.putIfAbsent(segmentDir.getName(),
-                  new Segment(fs, segmentDir, conf));
-            }
-          }
-
-          Thread.sleep(60000);
-        } catch (final InterruptedException e) {
-          // ignore
-        } catch (final IOException e) {
-          // ignore
-        }
-      }
-    }
-
-  }
-
-
-  private static class Segment implements java.io.Closeable {
-
-    private static final Partitioner<Text, Writable> PARTITIONER =
-      new HashPartitioner<Text, Writable>();
-
-    private final FileSystem fs;
-    private final Path segmentDir;
-
-    private MapFile.Reader[] content;
-    private MapFile.Reader[] parseText;
-    private MapFile.Reader[] parseData;
-    private MapFile.Reader[] crawl;
-    private final Configuration conf;
-
-    public Segment(FileSystem fs, Path segmentDir, Configuration conf) throws IOException {
-      this.fs = fs;
-      this.segmentDir = segmentDir;
-      this.conf = conf;
-    }
-
-    public CrawlDatum getCrawlDatum(Text url) throws IOException {
-      synchronized (this) {
-        if (crawl == null)
-          crawl = getReaders(CrawlDatum.FETCH_DIR_NAME);
-      }
-      return (CrawlDatum)getEntry(crawl, url, new CrawlDatum());
-    }
-
-    public byte[] getContent(Text url) throws IOException {
-      synchronized (this) {
-        if (content == null)
-          content = getReaders(Content.DIR_NAME);
-      }
-      return ((Content)getEntry(content, url, new Content())).getContent();
-    }
-
-    public ParseData getParseData(Text url) throws IOException {
-      synchronized (this) {
-        if (parseData == null)
-          parseData = getReaders(ParseData.DIR_NAME);
-      }
-      return (ParseData)getEntry(parseData, url, new ParseData());
-    }
-
-    public ParseText getParseText(Text url) throws IOException {
-      synchronized (this) {
-        if (parseText == null)
-          parseText = getReaders(ParseText.DIR_NAME);
-      }
-      return (ParseText)getEntry(parseText, url, new ParseText());
-    }
-
-    private MapFile.Reader[] getReaders(String subDir) throws IOException {
-      return MapFileOutputFormat.getReaders(fs, new Path(segmentDir, subDir), this.conf);
-    }
-
-    private Writable getEntry(MapFile.Reader[] readers, Text url,
-                              Writable entry) throws IOException {
-      return MapFileOutputFormat.getEntry(readers, PARTITIONER, url, entry);
-    }
-
-    public void close() throws IOException {
-      if (content != null) { closeReaders(content); }
-      if (parseText != null) { closeReaders(parseText); }
-      if (parseData != null) { closeReaders(parseData); }
-      if (crawl != null) { closeReaders(crawl); }
-    }
-
-    private void closeReaders(MapFile.Reader[] readers) throws IOException {
-      for (int i = 0; i < readers.length; i++) {
-        readers[i].close();
-      }
-    }
-
-  }
-
-  private final ConcurrentMap<String, Segment> segments =
-    new ConcurrentHashMap<String, Segment>();
-  private final FileSystem fs;
-  private final Configuration conf;
-  private final Path segmentsDir;
-  
-  // This must be nullable upon close, so do not declare final.
-  private SegmentUpdater segUpdater;
-  private final Summarizer summarizer;
-
-  /** Construct given a directory containing fetcher output. */
-  public FetchedSegments(Configuration conf, Path segmentsDir)
-  throws IOException {
-    this.conf = conf;
-    this.fs = FileSystem.get(this.conf);
-    final FileStatus[] fstats = fs.listStatus(segmentsDir,
-        HadoopFSUtil.getPassDirectoriesFilter(fs));
-    final Path[] segmentDirs = HadoopFSUtil.getPaths(fstats);
-    this.summarizer = new SummarizerFactory(this.conf).getSummarizer();
-    this.segmentsDir = segmentsDir;
-    this.segUpdater = new SegmentUpdater();
-
-    if (segmentDirs != null) {
-      for (final Path segmentDir : segmentDirs) {
-        segments.put(segmentDir.getName(),
-          new Segment(this.fs, segmentDir, this.conf));
-      }
-    }
-    this.segUpdater.start();
-  }
-
-  public String[] getSegmentNames() {
-    return segments.keySet().toArray(new String[segments.size()]);
-  }
-
-  public byte[] getContent(HitDetails details) throws IOException {
-    return getSegment(details).getContent(getUrl(details));
-  }
-
-  public ParseData getParseData(HitDetails details) throws IOException {
-    return getSegment(details).getParseData(getUrl(details));
-  }
-
-  public long getFetchDate(HitDetails details) throws IOException {
-    return getSegment(details).getCrawlDatum(getUrl(details))
-      .getFetchTime();
-  }
-
-  public ParseText getParseText(HitDetails details) throws IOException {
-    return getSegment(details).getParseText(getUrl(details));
-  }
-
-  public Summary getSummary(HitDetails details, Query query)
-    throws IOException {
-
-    if (this.summarizer == null) { return new Summary(); }
-
-    final Segment segment = getSegment(details);
-    final ParseText parseText = segment.getParseText(getUrl(details));
-    final String text = (parseText != null) ? parseText.getText() : "";
-
-    return this.summarizer.getSummary(text, query);
-  }
-
-  public long getProtocolVersion(String protocol, long clientVersion)
-  throws IOException {
-    return VERSION;
-  }
-
-  public Summary[] getSummary(HitDetails[] details, Query query)
-    throws IOException {
-    final List<Callable<Summary>> tasks =
-      new ArrayList<Callable<Summary>>(details.length);
-    for (int i = 0; i < details.length; i++) {
-      tasks.add(new SummaryTask(details[i], query));
-    }
-
-    List<Future<Summary>> summaries;
-    try {
-      summaries = executor.invokeAll(tasks);
-    } catch (final InterruptedException e) {
-      throw new RuntimeException(e);
-    }
-
-
-    final Summary[] results = new Summary[details.length];
-    for (int i = 0; i < details.length; i++) {
-      final Future<Summary> f = summaries.get(i);
-      Summary summary;
-      try {
-        summary = f.get();
-      } catch (final Exception e) {
-        if (e.getCause() instanceof IOException) {
-          throw (IOException) e.getCause();
-        }
-        throw new RuntimeException(e);
-      }
-      results[i] = summary;
-    }
-    return results;
-  }
-
-
-  private Segment getSegment(HitDetails details) {
-    return segments.get(details.getValue("segment"));
-  }
-
-  private Text getUrl(HitDetails details) {
-    String url = details.getValue("orig");
-    if (StringUtils.isBlank(url)) {
-      url = details.getValue("url");
-    }
-    return new Text(url);
-  }
-
-  public void close() throws IOException {
-    // Interrupt that thread to convince it to stop running.
-    segUpdater.interrupt();
-
-    // Break reference cycle, otherwise this points to segUpdater, and 
-    // segUpdater.$0 points to this.  It appeared to keep the thread from
-    // being GC'ed/reaped.
-    segUpdater = null;
-    final Iterator<Segment> iterator = segments.values().iterator();
-    while (iterator.hasNext()) {
-      iterator.next().close();
-    }
-  }
-
-}
Index: src/java/org/apache/nutch/searcher/HitInlinks.java
===================================================================
--- src/java/org/apache/nutch/searcher/HitInlinks.java	(revision 959954)
+++ src/java/org/apache/nutch/searcher/HitInlinks.java	(working copy)
@@ -1,32 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.searcher;
-
-import java.io.IOException;
-
-import org.apache.hadoop.io.Closeable;
-import org.apache.nutch.crawl.Inlinks;
-
-/** Service that returns information about incoming links to a hit. */
-public interface HitInlinks extends Closeable {
-  /** Returns the anchors of a hit document. */
-  String[] getAnchors(HitDetails details) throws IOException;
-
-  /** Return the inlinks of a hit document. */
-  Inlinks getInlinks(HitDetails details) throws IOException;
-}
Index: src/java/org/apache/nutch/searcher/package.html
===================================================================
--- src/java/org/apache/nutch/searcher/package.html	(revision 959954)
+++ src/java/org/apache/nutch/searcher/package.html	(working copy)
@@ -1,5 +0,0 @@
-<html>
-<body>
-Search API
-</body>
-</html>
Index: src/java/org/apache/nutch/searcher/HitContent.java
===================================================================
--- src/java/org/apache/nutch/searcher/HitContent.java	(revision 959954)
+++ src/java/org/apache/nutch/searcher/HitContent.java	(working copy)
@@ -1,40 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.searcher;
-
-import java.io.IOException;
-
-import org.apache.hadoop.io.Closeable;
-import org.apache.nutch.parse.ParseData;
-import org.apache.nutch.parse.ParseText;
-
-/** Service that returns the content of a hit. */
-public interface HitContent extends Closeable {
-  /** Returns the content of a hit document. */
-  byte[] getContent(HitDetails details) throws IOException;
-
-  /** Returns the ParseData of a hit document. */
-  ParseData getParseData(HitDetails details) throws IOException;
-
-  /** Returns the ParseText of a hit document. */
-  ParseText getParseText(HitDetails details) throws IOException;
-
-  /** Returns the fetch date of a hit document. */
-  long getFetchDate(HitDetails details) throws IOException;
-
-}
Index: src/java/org/apache/nutch/searcher/Hit.java
===================================================================
--- src/java/org/apache/nutch/searcher/Hit.java	(revision 959954)
+++ src/java/org/apache/nutch/searcher/Hit.java	(working copy)
@@ -1,100 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.searcher;
-
-import java.io.DataInput;
-import java.io.DataOutput;
-import java.io.IOException;
-
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.io.Writable;
-import org.apache.hadoop.io.WritableComparable;
-
-/** A document which matched a query in an index. */
-@SuppressWarnings("unchecked")
-public class Hit implements Writable, Comparable<Hit> {
-
-  private int indexNo;                            // index id
-  private String uniqueKey;
-  private WritableComparable sortValue;           // value sorted on
-  private String dedupValue;                      // value to dedup on
-  private boolean moreFromDupExcluded;
-
-  public Hit() {}
-
-  public Hit(int indexNo, String uniqueKey) {
-    this(indexNo, uniqueKey, null, null);
-  }
-  public Hit(int indexNo, String uniqueKey,
-      WritableComparable sortValue,
-             String dedupValue) {
-    this(uniqueKey, sortValue, dedupValue);
-    this.indexNo = indexNo;
-  }
-  public Hit(String uniqueKey, WritableComparable sortValue, String dedupValue) {
-    this.uniqueKey = uniqueKey;
-    this.sortValue = sortValue;
-    this.dedupValue = dedupValue == null ? "" : dedupValue;
-  }
-
-  /** Return the index number that this hit came from. */
-  public int getIndexNo() { return indexNo; }
-  public void setIndexNo(int indexNo) { this.indexNo = indexNo; }
-
-  /** Return the unique identifier of this hit within an index. */
-  public String getUniqueKey() { return uniqueKey; }
-
-  /** Return the value of the field that hits are sorted on. */
-  public WritableComparable getSortValue() { return sortValue; }
-
-  /** Return the value of the field that hits should be deduplicated on. */
-  public String getDedupValue() { return dedupValue; }
-
-  /** True if other, lower-scoring, hits with the same dedup value have been
-   * excluded from the list which contains this hit.. */
-  public boolean moreFromDupExcluded() { return moreFromDupExcluded; }
-
-  /** True if other, lower-scoring, hits with the same dedup value have been
-   * excluded from the list which contains this hit.. */
-  public void setMoreFromDupExcluded(boolean more){moreFromDupExcluded=more;}
-
-  /** Display as a string. */
-  public String toString() {
-    return "#" + uniqueKey;
-  }
-
-  public int compareTo(Hit other) {
-    int compare = sortValue.compareTo(other.sortValue);
-    if (compare != 0) {
-      return compare;                             // use sortValue
-    } else if (other.indexNo != this.indexNo) {
-      return other.indexNo - this.indexNo;        // prefer later indexes
-    } else {
-      return other.uniqueKey.compareTo(this.uniqueKey);  // prefer later doc
-    }
-  }
-
-  public void write(DataOutput out) throws IOException {
-    Text.writeString(out, uniqueKey);
-  }
-
-  public void readFields(DataInput in) throws IOException {
-    uniqueKey = Text.readString(in);
-  }
-
-}
Index: src/java/org/apache/nutch/searcher/IndexSearcher.java
===================================================================
--- src/java/org/apache/nutch/searcher/IndexSearcher.java	(revision 959954)
+++ src/java/org/apache/nutch/searcher/IndexSearcher.java	(working copy)
@@ -1,186 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.searcher;
-
-import java.io.File;
-import java.io.IOException;
-import java.util.List;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.FloatWritable;
-import org.apache.hadoop.io.IntWritable;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.io.WritableComparable;
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
-import org.apache.lucene.document.Fieldable;
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.MultiReader;
-import org.apache.lucene.search.FieldCache;
-import org.apache.lucene.search.FieldDoc;
-import org.apache.lucene.search.ScoreDoc;
-import org.apache.lucene.search.TopDocs;
-import org.apache.lucene.store.Directory;
-import org.apache.lucene.store.FSDirectory;
-import org.apache.nutch.indexer.FsDirectory;
-import org.apache.nutch.indexer.NutchSimilarity;
-
-/** Implements {@link Searcher} and {@link HitDetailer} for either a single
- * merged index, or a set of indexes. */
-public class IndexSearcher implements Searcher, HitDetailer {
-
-  private org.apache.lucene.search.Searcher luceneSearcher;
-  private org.apache.lucene.index.IndexReader reader;
-  private LuceneQueryOptimizer optimizer;
-  private FileSystem fs;
-  private Configuration conf;
-  private QueryFilters queryFilters;
-
-  /** Construct given a number of indexes. */
-  public IndexSearcher(Path[] indexDirs, Configuration conf) throws IOException {
-    IndexReader[] readers = new IndexReader[indexDirs.length];
-    this.conf = conf;
-    this.fs = FileSystem.get(conf);
-    for (int i = 0; i < indexDirs.length; i++) {
-      readers[i] = IndexReader.open(getDirectory(indexDirs[i]));
-    }
-    init(new MultiReader(readers), conf);
-  }
-
-  /** Construct given a single merged index. */
-  public IndexSearcher(Path index,  Configuration conf)
-    throws IOException {
-    this.conf = conf;
-    this.fs = FileSystem.get(conf);
-    init(IndexReader.open(getDirectory(index)), conf);
-  }
-
-  private void init(IndexReader reader, Configuration conf) throws IOException {
-    this.reader = reader;
-    this.luceneSearcher = new org.apache.lucene.search.IndexSearcher(reader);
-    this.luceneSearcher.setSimilarity(new NutchSimilarity());
-    this.optimizer = new LuceneQueryOptimizer(conf);
-    this.queryFilters = new QueryFilters(conf);
-  }
-
-  private Directory getDirectory(Path file) throws IOException {
-    if ("file".equals(this.fs.getUri().getScheme())) {
-      Path qualified = file.makeQualified(FileSystem.getLocal(conf));
-      File fsLocal = new File(qualified.toUri());
-      return FSDirectory.open(new File(fsLocal.getAbsolutePath()));
-    } else {
-      return new FsDirectory(this.fs, file, false, this.conf);
-    }
-  }
-
-  @Override
-  @Deprecated
-  public Hits search(Query query, int numHits,
-                     String dedupField, String sortField, boolean reverse)
-
-    throws IOException {
-    query.setParams(new QueryParams(numHits,
-        QueryParams.DEFAULT_MAX_HITS_PER_DUP, dedupField, sortField, reverse));
-    return search(query);
-  }
-  
-  @Override
-  public Hits search(Query query) throws IOException {
-    org.apache.lucene.search.BooleanQuery luceneQuery =
-      this.queryFilters.filter(query);
-    return translateHits(optimizer.optimize(luceneQuery, luceneSearcher, query
-        .getParams().getNumHits(), query.getParams().getSortField(), query
-        .getParams().isReverse()), query.getParams().getDedupField(), query
-        .getParams().getSortField());
-  }
-
-
-  public String getExplanation(Query query, Hit hit) throws IOException {
-    return luceneSearcher.explain(this.queryFilters.filter(query),
-        Integer.valueOf(hit.getUniqueKey())).toHtml();
-  }
-
-  public HitDetails getDetails(Hit hit) throws IOException {
-
-    Document doc = luceneSearcher.doc(Integer.valueOf(hit.getUniqueKey()));
-
-    List<Fieldable> docFields = doc.getFields();
-    String[] fields = new String[docFields.size()];
-    String[] values = new String[docFields.size()];
-    for (int i = 0; i < docFields.size(); i++) {
-      Fieldable field = docFields.get(i);
-      fields[i] = field.name();
-      values[i] = field.stringValue();
-    }
-
-    return new HitDetails(fields, values);
-  }
-
-  public HitDetails[] getDetails(Hit[] hits) throws IOException {
-    HitDetails[] results = new HitDetails[hits.length];
-    for (int i = 0; i < hits.length; i++)
-      results[i] = getDetails(hits[i]);
-    return results;
-  }
-
-  private Hits translateHits(TopDocs topDocs,
-                             String dedupField, String sortField)
-    throws IOException {
-
-    String[] dedupValues = null;
-    if (dedupField != null) 
-      dedupValues = FieldCache.DEFAULT.getStrings(reader, dedupField);
-
-    ScoreDoc[] scoreDocs = topDocs.scoreDocs;
-    int length = scoreDocs.length;
-    Hit[] hits = new Hit[length];
-    for (int i = 0; i < length; i++) {
-      
-      int doc = scoreDocs[i].doc;
-      
-      WritableComparable sortValue;               // convert value to writable
-      if (sortField == null) {
-        sortValue = new FloatWritable(scoreDocs[i].score);
-      } else {
-        Object raw = ((FieldDoc)scoreDocs[i]).fields[0];
-        if (raw instanceof Integer) {
-          sortValue = new IntWritable(((Integer)raw).intValue());
-        } else if (raw instanceof Float) {
-          sortValue = new FloatWritable(((Float)raw).floatValue());
-        } else if (raw instanceof String) {
-          sortValue = new Text((String)raw);
-        } else {
-          throw new RuntimeException("Unknown sort value type!");
-        }
-      }
-
-      String dedupValue = dedupValues == null ? null : dedupValues[doc];
-
-      hits[i] = new Hit(Integer.toString(doc), sortValue, dedupValue);
-    }
-    return new Hits(topDocs.totalHits, hits);
-  }
-  
-  public void close() throws IOException {
-    if (luceneSearcher != null) { luceneSearcher.close(); }
-    if (reader != null) { reader.close(); }
-  }
-
-}
Index: src/java/org/apache/nutch/searcher/RPCSegmentBean.java
===================================================================
--- src/java/org/apache/nutch/searcher/RPCSegmentBean.java	(revision 959954)
+++ src/java/org/apache/nutch/searcher/RPCSegmentBean.java	(working copy)
@@ -1,23 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.searcher;
-
-import org.apache.hadoop.ipc.VersionedProtocol;
-
-public interface RPCSegmentBean extends SegmentBean, VersionedProtocol {
-
-}
Index: src/java/org/apache/nutch/searcher/Summary.java
===================================================================
--- src/java/org/apache/nutch/searcher/Summary.java	(revision 959954)
+++ src/java/org/apache/nutch/searcher/Summary.java	(working copy)
@@ -1,221 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.searcher;
-
-// JDK imports
-import java.io.DataInput;
-import java.io.DataOutput;
-import java.io.IOException;
-import java.util.ArrayList;
-
-// Hadoop imports
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.io.Writable;
-
-// Nutch imports
-import org.apache.nutch.html.Entities;
-
-
-/** A document summary dynamically generated to match a query. */
-public class Summary implements Writable {
-  
-  private final static int FRAGMENT  = 0;
-  private final static int HIGHLIGHT = 1;
-  private final static int ELLIPSIS  = 2;
-  
-  /** A fragment of text within a summary. */
-  public static class Fragment {
-    private String text;
-
-    /** Constructs a fragment for the given text. */
-    public Fragment(String text) { this.text = text; }
-
-    /** Returns the text of this fragment. */
-    public String getText() { return text; }
-
-    /** Returns true iff this fragment is to be highlighted. */
-    public boolean isHighlight() { return false; }
-
-    /** Returns true iff this fragment is an ellipsis. */
-    public boolean isEllipsis() { return false; }
-
-    /** Returns a textual representation of this fragment. */
-    public String toString() { return getText(); }
-
-    // Inherited Javadoc
-    public boolean equals(Object o) {
-      try {
-        Fragment f = (Fragment) o;
-        return f.getText().equals(getText()) && 
-               f.isHighlight() == isHighlight() &&
-               f.isEllipsis() == isEllipsis();
-      } catch (Exception e) {
-        return false;
-      }
-    }
-  }
-
-  /** A highlighted fragment of text within a summary. */
-  public static class Highlight extends Fragment {
-    /** Constructs a highlighted fragment for the given text. */
-    public Highlight(String text) { super(text); }
-
-    /** Returns true. */
-    public boolean isHighlight() { return true; }
-  }
-
-  /** An ellipsis fragment within a summary. */
-  public static class Ellipsis extends Fragment {
-    /** Constructs an ellipsis fragment for the given text. */
-    public Ellipsis() { super(" ... "); }
-
-    /** Returns true. */
-    public boolean isEllipsis() { return true; }
-  }
-
-  private ArrayList<Fragment> fragments = new ArrayList<Fragment>();
-
-  private static final Fragment[] FRAGMENT_PROTO = new Fragment[0];
-
-  /** Constructs an empty Summary.*/
-  public Summary() {}
-
-  /** Adds a fragment to a summary.*/
-  public void add(Fragment fragment) { fragments.add(fragment); }
-
-  /** Returns an array of all of this summary's fragments.*/
-  public Fragment[] getFragments() {
-    return fragments.toArray(FRAGMENT_PROTO);
-  }
-
-  /** Returns a String representation of this Summary. */
-  public String toString() {
-    StringBuffer buffer = new StringBuffer();
-    for (int i = 0; i < fragments.size(); i++) {
-      buffer.append(fragments.get(i));
-    }
-    return buffer.toString();
-  }
-
-  /**
-   * Returns a HTML representation of this Summary.
-   * HTML output for <b>Highlight</b> fragments is
-   * <code>&lt;span class="highlight"&gt;highlight's text&lt;/span&gt;</code>,
-   * for <b>Ellipsis</b> fragments is
-   * <code>&lt;span class="highlight"&gt; ... &lt;/span&gt;</code>, for generic
-   * <b>Fragment</b> is simply the fragment's text.<br/>
-   *
-   * @param encode specifies if the summary's entities should be encoded.
-   */
-  public String toHtml(boolean encode) {
-    Fragment fragment = null;
-    StringBuffer buf = new StringBuffer();
-    for (int i=0; i<fragments.size(); i++) {
-      fragment = fragments.get(i);
-      if (fragment.isHighlight()) {
-        buf.append("<span class=\"highlight\">")
-           .append(encode ? Entities.encode(fragment.getText())
-                          : fragment.getText())
-           .append("</span>");
-      } else if (fragment.isEllipsis()) {
-        buf.append("<span class=\"ellipsis\"> ... </span>");
-      } else {
-        buf.append(encode ? Entities.encode(fragment.getText())
-                          : fragment.getText());
-      }
-    }
-    return buf.toString();
-  }
-  
-  // Inherited Javadoc
-  public boolean equals(Object o) {
-    if (!(o instanceof Summary)) { return false; }
-    Fragment[] fragments1 = ((Summary) o).getFragments();
-    Fragment[] fragments2 = getFragments();
-    if (fragments1.length != fragments2.length) { return false; }
-    for (int i=0; i<fragments1.length; i++) {
-      if (!fragments1[i].equals(fragments2[i])) {
-        return false;
-      }
-    }
-    return true;
-  }
-  
-  /**
-   * Helper method that return a String representation for each
-   * specified Summary.
-   */ 
-  public static String[] toStrings(Summary[] summaries) {
-    if (summaries == null) { return null; }
-    String[] strs = new String[summaries.length];
-    for (int i=0; i<summaries.length; i++) {
-      strs[i] = summaries[i].toString();
-    }
-    return strs;
-  }
-
-  public static Summary read(DataInput in) throws IOException {
-    Summary summary = new Summary();
-    summary.readFields(in);
-    return summary;
-  }
-
-  
-  /* ------------------------- *
-   * <implementation:Writable> *
-   * ------------------------- */
-
-  // Inherited Javadoc
-  public void write(DataOutput out) throws IOException {
-    out.writeInt(fragments.size());
-    Fragment fragment = null;
-    for (int i=0; i<fragments.size(); i++) {
-      fragment = fragments.get(i);
-      if (fragment.isHighlight()) {
-        out.writeByte(HIGHLIGHT);
-        Text.writeString(out, fragment.getText());
-      } else if (fragment.isEllipsis()) {
-        out.writeByte(ELLIPSIS);
-      } else {
-        out.writeByte(FRAGMENT);
-        Text.writeString(out, fragment.getText());
-      }
-    }
-  }
-
-  // Inherited Javadoc
-  public void readFields(DataInput in) throws IOException {
-    int nbFragments = in.readInt();
-    Fragment fragment = null;
-    for (int i=0; i<nbFragments; i++) {
-      int type = in.readByte();
-      if (type == HIGHLIGHT) {
-        fragment = new Highlight(Text.readString(in));
-      } else if (type == ELLIPSIS) {
-        fragment = new Ellipsis();
-      } else {
-        fragment = new Fragment(Text.readString(in));
-      }
-      fragments.add(fragment);
-    }
-  }
-  
-  /* -------------------------- *
-   * </implementation:Writable> *
-   * -------------------------- */
-
-}
Index: src/java/org/apache/nutch/searcher/Summarizer.java
===================================================================
--- src/java/org/apache/nutch/searcher/Summarizer.java	(revision 959954)
+++ src/java/org/apache/nutch/searcher/Summarizer.java	(working copy)
@@ -1,43 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.searcher;
-
-// Hadoop imports
-import org.apache.hadoop.conf.Configurable;
-
-// Nutch imports
-import org.apache.nutch.plugin.Pluggable;
-
-
-/** 
- * Extension point for summarizer.
- *
- * @author J&eacute;r&ocirc;me Charron
- */
-public interface Summarizer extends Configurable, Pluggable {
-
-  /** The name of the extension point. */
-  public final static String X_POINT_ID = Summarizer.class.getName();
-  
-  /**
-   * Get a summary for a specified text.
-   * @param text is the text to summarize.
-   * @param query is the query for which the text is a hit.
-   */
-  public Summary getSummary(String text, Query query);
-
-}
Index: src/java/org/apache/nutch/searcher/DistributedSearchBean.java
===================================================================
--- src/java/org/apache/nutch/searcher/DistributedSearchBean.java	(revision 959954)
+++ src/java/org/apache/nutch/searcher/DistributedSearchBean.java	(working copy)
@@ -1,357 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.searcher;
-
-import java.io.IOException;
-import java.net.InetSocketAddress;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.Comparator;
-import java.util.List;
-import java.util.PriorityQueue;
-import java.util.concurrent.Callable;
-import java.util.concurrent.ExecutionException;
-import java.util.concurrent.ExecutorService;
-import java.util.concurrent.Executors;
-import java.util.concurrent.Future;
-import java.util.concurrent.ScheduledExecutorService;
-import java.util.concurrent.TimeUnit;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.ipc.RPC;
-import org.apache.hadoop.util.StringUtils;
-
-public class DistributedSearchBean implements SearchBean {
-
-  private static final ExecutorService executor =
-    Executors.newCachedThreadPool();
-
-  private final ScheduledExecutorService pingService;
-
-  private class SearchTask implements Callable<Hits> {
-    private int id;
-
-    private Query query;
-
-    public SearchTask(int id) {
-      this.id = id;
-    }
-
-    public Hits call() throws Exception {
-      if (!liveServers[id]) {
-        return null;
-      }
-      return beans[id].search(query);
-    }
-
-    /**
-     * @deprecated since 1.1, use {@link #setSearchArgs(Query)} instead
-     */
-    public void setSearchArgs(Query query, int numHits, String dedupField,
-                              String sortField, boolean reverse) {
-      this.query = query;
-      query.setParams(new QueryParams(numHits, QueryParams.DEFAULT_MAX_HITS_PER_DUP, dedupField, sortField, reverse));
-    }
-
-    private void setSearchArgs(Query query) {
-      this.query = query;
-    }
-
-  }
-
-  private class DetailTask implements Callable<HitDetails[]> {
-    private int id;
-
-    private Hit[] hits;
-
-    public DetailTask(int id) {
-     this.id = id;
-    }
-
-    public HitDetails[] call() throws Exception {
-      if (hits == null) {
-        return null;
-      }
-      return beans[id].getDetails(hits);
-    }
-
-    public void setHits(Hit[] hits) {
-      this.hits = hits;
-    }
-
-  }
-
-  private class PingWorker implements Runnable {
-    private int id;
-
-    public PingWorker(int id) {
-      this.id = id;
-    }
-
-    public void run()  {
-      try {
-        if (beans[id].ping()) {
-          liveServers[id] = true;
-        } else {
-          liveServers[id] = false;
-        }
-      } catch (IOException e) {
-        liveServers[id] = false;
-      }
-    }
-  }
-
-  private volatile boolean liveServers[];
-
-  private SearchBean[] beans;
-
-  private List<Callable<Hits>> searchTasks;
-
-  private List<Callable<HitDetails[]>> detailTasks;
-
-  private List<PingWorker> pingWorkers;
-
-  private long timeout;
-
-  public DistributedSearchBean(Configuration conf,
-                               Path luceneConfig, Path solrConfig)
-  throws IOException {
-    FileSystem fs = FileSystem.get(conf);
-
-    this.timeout = conf.getLong("ipc.client.timeout", 60000);
-
-    List<SearchBean> beanList = new ArrayList<SearchBean>();
-
-    if (fs.exists(luceneConfig)) {
-      LOG.info("Adding Nutch searchers in " +
-              luceneConfig.makeQualified(fs).toUri());
-      addLuceneBeans(beanList, luceneConfig, conf);
-    }
-
-    if (fs.exists(solrConfig)) {
-      LOG.info("Adding Solr searchers in " +
-              solrConfig.makeQualified(fs).toUri());
-      addSolrBeans(beanList, solrConfig, conf);
-    }
-    LOG.info("Added " + beanList.size() + " remote searchers.");
-
-    beans = beanList.toArray(new SearchBean[beanList.size()]);
-
-    liveServers = new boolean[beans.length];
-    for (int i = 0; i < liveServers.length; i++) {
-      liveServers[i] = true;
-    }
-
-    searchTasks = new ArrayList<Callable<Hits>>();
-    detailTasks = new ArrayList<Callable<HitDetails[]>>();
-    pingWorkers = new ArrayList<PingWorker>();
-
-    for (int i = 0; i < beans.length; i++) {
-      searchTasks.add(new SearchTask(i));
-      detailTasks.add(new DetailTask(i));
-      pingWorkers.add(new PingWorker(i));
-    }
-
-    pingService = Executors.newScheduledThreadPool(beans.length);
-    for (PingWorker worker : pingWorkers) {
-      pingService.scheduleAtFixedRate(worker, 0, 10, TimeUnit.SECONDS);
-    }
-
-  }
-
-  private static void addLuceneBeans(List<SearchBean> beanList,
-                                     Path luceneConfig, Configuration conf)
-  throws IOException {
-    Configuration newConf = new Configuration(conf);
-
-    // do not retry connections
-    newConf.setInt("ipc.client.connect.max.retries", 0);
-
-    List<InetSocketAddress> luceneServers =
-      NutchBean.readAddresses(luceneConfig, conf);
-    for (InetSocketAddress addr : luceneServers) {
-      beanList.add((RPCSearchBean) RPC.getProxy(RPCSearchBean.class,
-          LuceneSearchBean.VERSION, addr, newConf));
-    }
-  }
-
-  private static void addSolrBeans(List<SearchBean> beanList,
-                                   Path solrConfig, Configuration conf)
-  throws IOException {
-    for (String solrServer : NutchBean.readConfig(solrConfig, conf)) {
-      beanList.add(new SolrSearchBean(conf, solrServer));
-    }
-  }
-
-  public String getExplanation(Query query, Hit hit) throws IOException {
-    return beans[hit.getIndexNo()].getExplanation(query, hit);
-  }
-
-  @Override
-  public Hits search(Query query) throws IOException {
-    for (Callable<Hits> task : searchTasks) {
-      ((SearchTask)task).setSearchArgs(query);
-    }
-
-    List<Future<Hits>> allHits;
-    try {
-      allHits =
-        executor.invokeAll(searchTasks, timeout, TimeUnit.MILLISECONDS);
-    } catch (InterruptedException e) {
-      throw new RuntimeException(e);
-    }
-
-    PriorityQueue<Hit> queue;            // cull top hits from results
-    if (query.getParams().getSortField() == null
-        || query.getParams().isReverse()) {
-      queue = new PriorityQueue<Hit>(query.getParams().getNumHits());
-    } else {
-      queue = new PriorityQueue<Hit>(query.getParams().getNumHits(),
-          new Comparator<Hit>() {
-        public int compare(Hit h1, Hit h2) {
-          return h2.compareTo(h1); // reverse natural order
-        }
-      });
-    }
-
-    long totalHits = 0;
-    int allHitsSize = allHits.size();
-    for (int i = 0; i < allHitsSize; i++) {
-      Hits hits = null;
-      try {
-        hits = allHits.get(i).get();
-      } catch (InterruptedException e) {
-        // ignore
-      } catch (ExecutionException e) {
-        LOG.warn("Retrieving hits failed with exception: " +
-                 StringUtils.stringifyException(e.getCause()));
-      }
-
-      if (hits == null) {
-        continue;
-      }
-
-      totalHits += hits.getTotal();
-
-      int hitsLength = hits.getLength();
-      for (int j = 0; j < hitsLength; j++) {
-        Hit hit = hits.getHit(j);
-        Hit newHit = new Hit(i, hit.getUniqueKey(),
-                             hit.getSortValue(), hit.getDedupValue());
-        queue.add(newHit);
-        if (queue.size() > query.getParams().getNumHits()) {
-          // if hit queue overfull
-          queue.remove();
-        }
-      }
-    }
-
-    // we have to sort results since PriorityQueue.toArray
-    // may not return results in sorted order
-    Hit[] culledResults = queue.toArray(new Hit[queue.size()]);
-    Arrays.sort(culledResults, Collections.reverseOrder(queue.comparator()));
-
-    return new Hits(totalHits, culledResults);
-  }
-
-  @Override
-  @Deprecated
-  public Hits search(Query query, int numHits, String dedupField,
-                     String sortField, boolean reverse) throws IOException {
-
-    query.setParams(new QueryParams(numHits, QueryParams.DEFAULT_MAX_HITS_PER_DUP, dedupField, sortField, reverse));
-    return search(query);
-  }
-
-  public void close() throws IOException {
-    executor.shutdown();
-    pingService.shutdown();
-    for (SearchBean bean : beans) {
-      bean.close();
-    }
-  }
-
-  public HitDetails getDetails(Hit hit) throws IOException {
-    return beans[hit.getIndexNo()].getDetails(hit);
-  }
-
-  @SuppressWarnings("unchecked")
-  public HitDetails[] getDetails(Hit[] hits) throws IOException {
-    List<Hit>[] hitList = new ArrayList[detailTasks.size()];
-
-    for (int i = 0; i < hitList.length; i++) {
-      hitList[i] = new ArrayList<Hit>();
-    }
-
-    for (int i = 0; i < hits.length; i++) {
-      Hit hit = hits[i];
-      hitList[hit.getIndexNo()].add(hit);
-    }
-
-    for (int i = 0; i < detailTasks.size(); i++) {
-      DetailTask task = (DetailTask)detailTasks.get(i);
-      if (hitList[i].size() > 0) {
-        task.setHits(hitList[i].toArray(new Hit[hitList[i].size()]));
-      } else {
-        task.setHits(null);
-      }
-    }
-
-    List<Future<HitDetails[]>> allDetails;
-    try {
-      allDetails =
-        executor.invokeAll(detailTasks, timeout, TimeUnit.MILLISECONDS);
-    } catch (InterruptedException e) {
-      throw new RuntimeException(e);
-    }
-
-    /* getDetails(Hit[]) method assumes that HitDetails[i] returned corresponds
-     * to Hit[i] given as parameter. To keep this order, we have to 'merge'
-     * HitDetails[] returned from individual detailTasks.
-     */
-    HitDetails[][] detailsMatrix = new HitDetails[detailTasks.size()][];
-    for (int i = 0; i < detailsMatrix.length; i++) {
-      try {
-        detailsMatrix[i] = allDetails.get(i).get();
-      } catch (InterruptedException e) {
-        throw new RuntimeException(e);
-      } catch (ExecutionException e) {
-        if (e.getCause() instanceof IOException) {
-          throw (IOException) e.getCause();
-        }
-        throw new RuntimeException(e);
-      }
-    }
-
-    int[] hitPos = new int[detailTasks.size()]; // keep track of where we are
-    HitDetails[] detailsArr = new HitDetails[hits.length];
-    for (int i = 0; i < detailsArr.length; i++) {
-      int indexNo = hits[i].getIndexNo();
-      detailsArr[i] = detailsMatrix[indexNo][(hitPos[indexNo]++)];
-    }
-
-    return detailsArr;
-  }
-
-  public boolean ping() {
-    return true; // not used
-  }
-
-}
Index: src/java/org/apache/nutch/indexer/solr/SolrDeleteDuplicates.java
===================================================================
--- src/java/org/apache/nutch/indexer/solr/SolrDeleteDuplicates.java	(revision 959954)
+++ src/java/org/apache/nutch/indexer/solr/SolrDeleteDuplicates.java	(working copy)
@@ -39,7 +39,6 @@
 import org.apache.hadoop.mapred.lib.NullOutputFormat;
 import org.apache.hadoop.util.Tool;
 import org.apache.hadoop.util.ToolRunner;
-import org.apache.nutch.indexer.DeleteDuplicates;
 import org.apache.nutch.util.NutchConfiguration;
 import org.apache.nutch.util.NutchJob;
 import org.apache.solr.client.solrj.SolrQuery;
Index: src/java/org/apache/nutch/indexer/solr/SolrWriter.java
===================================================================
--- src/java/org/apache/nutch/indexer/solr/SolrWriter.java	(revision 959954)
+++ src/java/org/apache/nutch/indexer/solr/SolrWriter.java	(working copy)
@@ -23,6 +23,7 @@
 
 import org.apache.hadoop.mapred.JobConf;
 import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.indexer.NutchField;
 import org.apache.nutch.indexer.NutchIndexWriter;
 import org.apache.solr.client.solrj.SolrServer;
 import org.apache.solr.client.solrj.SolrServerException;
@@ -47,16 +48,16 @@
 
   public void write(NutchDocument doc) throws IOException {
     final SolrInputDocument inputDoc = new SolrInputDocument();
-    for(final Entry<String, List<String>> e : doc) {
-      for (final String val : e.getValue()) {
-        inputDoc.addField(solrMapping.mapKey(e.getKey()), val);
+    for(final Entry<String, NutchField> e : doc) {
+      for (final Object val : e.getValue().getValues()) {
+        inputDoc.addField(solrMapping.mapKey(e.getKey()), val, e.getValue().getWeight());
         String sCopy = solrMapping.mapCopyKey(e.getKey());
         if (sCopy != e.getKey()) {
         	inputDoc.addField(sCopy, val);	
         }
       }
     }
-    inputDoc.setDocumentBoost(doc.getScore());
+    inputDoc.setDocumentBoost(doc.getWeight());
     inputDocs.add(inputDoc);
     if (inputDocs.size() > commitSize) {
       try {
Index: src/java/org/apache/nutch/indexer/field/FieldsWritable.java
===================================================================
--- src/java/org/apache/nutch/indexer/field/FieldsWritable.java	(revision 959954)
+++ src/java/org/apache/nutch/indexer/field/FieldsWritable.java	(working copy)
@@ -1,95 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.indexer.field;
-
-import java.io.DataInput;
-import java.io.DataOutput;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.List;
-
-import org.apache.hadoop.io.Writable;
-
-/**
- * A class that holds a grouping of FieldWritable objects.
- */
-public class FieldsWritable
-  implements Writable {
-
-  private List<FieldWritable> fieldsList = new ArrayList<FieldWritable>();
-
-  public FieldsWritable() {
-
-  }
-  
-  public boolean hasField(String name) {
-    for (FieldWritable field : fieldsList) {
-      if (field.getName().equals(name)) {
-        return true;
-      }
-    }
-    return false;
-  }
-  
-  public FieldWritable getField(String name) {
-    for (FieldWritable field : fieldsList) {
-      if (field.getName().equals(name)) {
-        return field;
-      }
-    }
-    return null;
-  }
-  
-  public List<FieldWritable> getFields(String name) {
-    List<FieldWritable> named = new ArrayList<FieldWritable>();
-    for (FieldWritable field : fieldsList) {
-      if (field.getName().equals(name)) {
-        named.add(field);
-      }
-    }
-    return named.size() > 0 ? named : null;
-  }
-  
-  public List<FieldWritable> getFieldsList() {
-    return fieldsList;
-  }
-
-  public void setFieldsList(List<FieldWritable> fieldsList) {
-    this.fieldsList = fieldsList;
-  }
-
-  public void readFields(DataInput in)
-    throws IOException {
-    fieldsList.clear();
-    int numFields = in.readInt();
-    for (int i = 0; i < numFields; i++) {
-      FieldWritable field = new FieldWritable();
-      field.readFields(in);
-      fieldsList.add(field);
-    }
-  }
-
-  public void write(DataOutput out)
-    throws IOException {
-    int numFields = fieldsList.size();
-    out.writeInt(numFields);
-    for (int i = 0; i < numFields; i++) {
-      fieldsList.get(i).write(out);
-    }
-  }
-
-}
Index: src/java/org/apache/nutch/indexer/field/AnchorFields.java
===================================================================
--- src/java/org/apache/nutch/indexer/field/AnchorFields.java	(revision 959954)
+++ src/java/org/apache/nutch/indexer/field/AnchorFields.java	(working copy)
@@ -1,421 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.indexer.field;
-
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.Comparator;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Random;
-
-import org.apache.commons.cli.CommandLine;
-import org.apache.commons.cli.CommandLineParser;
-import org.apache.commons.cli.GnuParser;
-import org.apache.commons.cli.HelpFormatter;
-import org.apache.commons.cli.Option;
-import org.apache.commons.cli.OptionBuilder;
-import org.apache.commons.cli.Options;
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.conf.Configured;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.ObjectWritable;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.io.Writable;
-import org.apache.hadoop.mapred.FileInputFormat;
-import org.apache.hadoop.mapred.FileOutputFormat;
-import org.apache.hadoop.mapred.JobClient;
-import org.apache.hadoop.mapred.JobConf;
-import org.apache.hadoop.mapred.Mapper;
-import org.apache.hadoop.mapred.OutputCollector;
-import org.apache.hadoop.mapred.Reducer;
-import org.apache.hadoop.mapred.Reporter;
-import org.apache.hadoop.mapred.SequenceFileInputFormat;
-import org.apache.hadoop.mapred.SequenceFileOutputFormat;
-import org.apache.hadoop.util.StringUtils;
-import org.apache.hadoop.util.Tool;
-import org.apache.hadoop.util.ToolRunner;
-import org.apache.nutch.scoring.webgraph.LinkDatum;
-import org.apache.nutch.scoring.webgraph.Node;
-import org.apache.nutch.scoring.webgraph.WebGraph;
-import org.apache.nutch.util.NutchConfiguration;
-import org.apache.nutch.util.NutchJob;
-
-/**
- * Creates FieldWritable objects for inbound anchor text.   These FieldWritable
- * objects are then included in the input to the FieldIndexer to be converted
- * to Lucene Field objects and indexed.
- * 
- * Any empty or null anchor text is ignored. Anchors are sorted in descending
- * order according to the score of their parent pages. There are settings for a
- * maximum number of anchors to index and whether those anchors should be stored
- * and tokenized. With a descending order by score and a maximum anchors index
- * we ensure that only the best anchors are indexed assuming that a higher link
- * analysis score equals a better page and better inbound text.
- */
-public class AnchorFields
-  extends Configured
-  implements Tool {
-
-  public static final Log LOG = LogFactory.getLog(AnchorFields.class);
-
-  /**
-   * Comparator to order the links in descending order by score.
-   */
-  private static class DescendinLinkDatumScoreComparator
-    implements Comparator<LinkDatum> {
-
-    public int compare(LinkDatum one, LinkDatum two) {
-      float scoreOne = one.getScore();
-      float scoreTwo = two.getScore();
-      return (scoreOne == scoreTwo ? 0 : (scoreOne > scoreTwo ? -1 : 1));
-    }
-  }
-
-  /**
-   * Runs the Extractor job.  Get outlinks to be converted while ignoring empty
-   * and null anchors.
-   * 
-   * @param webGraphDb The WebGraphDb to pull from.
-   * @param output The extractor output.
-   * 
-   * @throws IOException If an error occurs while running the extractor.
-   */
-  private void runExtractor(Path webGraphDb, Path output)
-    throws IOException {
-
-    JobConf extractor = new NutchJob(getConf());
-    extractor.setJobName("AnchorFields Extractor");
-    FileInputFormat.addInputPath(extractor, new Path(webGraphDb,
-      WebGraph.OUTLINK_DIR));
-    FileInputFormat.addInputPath(extractor, new Path(webGraphDb,
-      WebGraph.NODE_DIR));
-    FileOutputFormat.setOutputPath(extractor, output);
-    extractor.setInputFormat(SequenceFileInputFormat.class);
-    extractor.setMapperClass(Extractor.class);
-    extractor.setReducerClass(Extractor.class);
-    extractor.setMapOutputKeyClass(Text.class);
-    extractor.setMapOutputValueClass(ObjectWritable.class);
-    extractor.setOutputKeyClass(Text.class);
-    extractor.setOutputValueClass(LinkDatum.class);
-    extractor.setOutputFormat(SequenceFileOutputFormat.class);
-
-    LOG.info("Starting extractor job");
-    try {
-      JobClient.runJob(extractor);
-    }
-    catch (IOException e) {
-      LOG.error(StringUtils.stringifyException(e));
-      throw e;
-    }
-    LOG.info("Finished extractor job.");
-  }
-
-  /**
-   * Runs the collector job.  Aggregates extracted inlinks, sorts and converts
-   * the highest scoring into FieldWritable objects.  Only inlinks for which
-   * basic fields exist will be collected to avoid orphan fields.
-   * 
-   * @param basicFields The BasicFields which must be present to collect anchors
-   * to avoid orphan fields.
-   * @param links The outlinks path.
-   * @param output The collector output.
-   * 
-   * @throws IOException If an error occurs while running the collector.
-   */
-  private void runCollector(Path basicFields, Path links, Path output)
-    throws IOException {
-
-    JobConf collector = new NutchJob(getConf());
-    collector.setJobName("AnchorFields Collector");
-    FileInputFormat.addInputPath(collector, links);
-    FileInputFormat.addInputPath(collector, basicFields);
-    FileOutputFormat.setOutputPath(collector, output);
-    collector.setInputFormat(SequenceFileInputFormat.class);
-    collector.setMapOutputKeyClass(Text.class);
-    collector.setMapOutputValueClass(ObjectWritable.class);
-    collector.setMapperClass(Collector.class);
-    collector.setReducerClass(Collector.class);
-    collector.setOutputKeyClass(Text.class);
-    collector.setOutputValueClass(FieldWritable.class);
-    collector.setOutputFormat(SequenceFileOutputFormat.class);
-
-    LOG.info("Starting collector job");
-    try {
-      JobClient.runJob(collector);
-    }
-    catch (IOException e) {
-      LOG.error(StringUtils.stringifyException(e));
-      throw e;
-    }
-    LOG.info("Finished collector job.");
-  }
-
-  /**
-   * Extracts outlinks to be created as FieldWritable objects.  Ignores empty
-   * and null anchors.
-   */
-  public static class Extractor
-    extends Configured
-    implements Mapper<Text, Writable, Text, ObjectWritable>,
-    Reducer<Text, ObjectWritable, Text, LinkDatum> {
-
-    private boolean ignoreEmptyAnchors = true;
-    private JobConf conf;
-
-    /**
-     * Default constructor.
-     */
-    public Extractor() {
-    }
-
-    /**
-     * Configurable constructor.
-     */
-    public Extractor(Configuration conf) {
-      setConf(conf);
-    }
-
-    /**
-     * Configures the job, sets to ignore empty anchors.
-     */
-    public void configure(JobConf conf) {
-      this.conf = conf;
-      ignoreEmptyAnchors = conf.getBoolean("link.ignore.empty.anchors", true);
-    }
-
-    /**
-     * Wraps values in ObjectWritable
-     */
-    public void map(Text key, Writable value,
-      OutputCollector<Text, ObjectWritable> output, Reporter reporter)
-      throws IOException {
-
-      ObjectWritable objWrite = new ObjectWritable();
-      objWrite.set(value);
-      output.collect(key, objWrite);
-    }
-
-    /**
-     * Extracts and inverts outlinks, ignores empty anchors.
-     */
-    public void reduce(Text key, Iterator<ObjectWritable> values,
-      OutputCollector<Text, LinkDatum> output, Reporter reporter)
-      throws IOException {
-
-      List<LinkDatum> outlinkList = new ArrayList<LinkDatum>();
-      Node node = null;
-
-      // collect the outlinks while ignoring links with empty anchor text, also
-      // assign the node
-      while (values.hasNext()) {
-        ObjectWritable objWrite = values.next();
-        Object obj = objWrite.get();
-        if (obj instanceof LinkDatum) {
-          LinkDatum next = (LinkDatum)obj;
-          String anchor = next.getAnchor();
-          if (anchor != null) {
-            anchor = anchor.trim();
-          }
-          if (ignoreEmptyAnchors && (anchor == null || anchor.length() == 0)) {
-            continue;
-          }
-          outlinkList.add(next);
-        }
-        else if (obj instanceof Node) {
-          node = (Node)obj;
-        }
-      }
-
-      // has to have outlinks to index
-      if (node != null && outlinkList.size() > 0) {
-        String fromUrl = key.toString();
-        float outlinkScore = node.getInlinkScore();
-        for (LinkDatum datum : outlinkList) {
-          String toUrl = datum.getUrl();
-          datum.setUrl(fromUrl);
-          datum.setScore(outlinkScore);
-          datum.setLinkType(LinkDatum.INLINK);
-          output.collect(new Text(toUrl), datum);
-        }
-      }
-    }
-
-    public void close() {
-    }
-  }
-
-  /**
-   * Collects and creates FieldWritable objects from the inlinks. Inlinks are
-   * first sorted by descending score before being collected.
-   */
-  public static class Collector
-    extends Configured
-    implements Mapper<Text, Writable, Text, ObjectWritable>,
-    Reducer<Text, ObjectWritable, Text, FieldWritable> {
-
-    private int maxInlinks = 1000;
-    private boolean tokenize = true;
-    private boolean stored = false;
-    private Comparator<LinkDatum> descLinkComp = new DescendinLinkDatumScoreComparator();
-
-    /**
-     * Configures the jobs. Sets maximum number of inlinks and whether to
-     * tokenize and store.
-     */
-    public void configure(JobConf conf) {
-      this.maxInlinks = conf.getInt("link.max.inlinks", 1000);
-      this.tokenize = conf.getBoolean("indexer.anchor.tokenize", true);
-      this.stored = conf.getBoolean("indexer.anchor.stored", false);
-    }
-
-    public void close() {
-    }
-
-    /**
-     * Wraps values in ObjectWritable
-     */
-    public void map(Text key, Writable value,
-      OutputCollector<Text, ObjectWritable> output, Reporter reporter)
-      throws IOException {
-
-      ObjectWritable objWrite = new ObjectWritable();
-      objWrite.set(value);
-      output.collect(key, objWrite);
-    }
-
-    /**
-     * Aggregates and sorts inlinks. Then converts up to a max number to
-     * FieldWritable objects.
-     */
-    public void reduce(Text key, Iterator<ObjectWritable> values,
-      OutputCollector<Text, FieldWritable> output, Reporter reporter)
-      throws IOException {
-
-      List<LinkDatum> anchors = new ArrayList<LinkDatum>();
-      FieldsWritable basicFields = null;
-
-      // aggregate inlinks assign basic fields
-      while (values.hasNext()) {
-        ObjectWritable objWrite = values.next();
-        Object obj = objWrite.get();
-        if (obj instanceof LinkDatum) {
-          anchors.add((LinkDatum)obj);
-        }
-        else if (obj instanceof FieldsWritable) {
-          basicFields = (FieldsWritable)obj;
-        }
-      }
-
-      // only collect anchors for those urls that have basic fields, otherwise
-      // we get orphan entries indexed only under anchor text
-      if (basicFields != null && anchors.size() > 0) {
-
-        // sort according to score descending
-        Collections.sort(anchors, descLinkComp);
-
-        // collect to maximum number of inlinks
-        int numToCollect = (maxInlinks > anchors.size() ? anchors.size()
-          : maxInlinks);
-        for (int i = 0; i < numToCollect; i++) {
-          LinkDatum datum = anchors.get(i);
-          FieldWritable anchorField = new FieldWritable(Fields.ANCHOR,
-            datum.getAnchor(), FieldType.CONTENT, true, stored, tokenize);
-          output.collect(key, anchorField);
-        }
-      }
-    }
-  }
-
-  /**
-   * Creates the FieldsWritable object from the anchors.
-   * 
-   * @param webGraphDb The WebGraph from which to pull outlinks.
-   * @param basicFields The BasicFields that must be present to avoid orphan
-   * anchor fields.
-   * @param output The AnchorFields output.
-   * 
-   * @throws IOException If an error occurs while creating the fields.
-   */
-  public void createFields(Path webGraphDb, Path basicFields, Path output)
-    throws IOException {
-
-    Configuration conf = getConf();
-    FileSystem fs = FileSystem.get(conf);
-    Path tempLinks = new Path(output + "-"
-      + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
-    runExtractor(webGraphDb, tempLinks);
-    runCollector(basicFields, tempLinks, output);
-    fs.delete(tempLinks, true);
-  }
-
-  public static void main(String[] args)
-    throws Exception {
-    int res = ToolRunner.run(NutchConfiguration.create(), new AnchorFields(),
-      args);
-    System.exit(res);
-  }
-
-  /**
-   * Runs the AnchorFields job.
-   */
-  public int run(String[] args)
-    throws Exception {
-
-    Options options = new Options();
-    Option helpOpts = OptionBuilder.withArgName("help").withDescription(
-      "show this help message").create("help");
-    Option outputOpts = OptionBuilder.withArgName("output").hasArg().withDescription(
-      "the output index directory").create("output");
-    Option webGraphDbOpts = OptionBuilder.withArgName("webgraphdb").hasArg().withDescription(
-      "the webgraphdb to use").create("webgraphdb");
-    Option basicFieldOpts = OptionBuilder.withArgName("basicfields").hasArgs().withDescription(
-      "the basicfields to use").create("basicfields");
-    options.addOption(helpOpts);
-    options.addOption(webGraphDbOpts);
-    options.addOption(basicFieldOpts);
-    options.addOption(outputOpts);
-
-    CommandLineParser parser = new GnuParser();
-    try {
-
-      CommandLine line = parser.parse(options, args);
-      if (line.hasOption("help") || !line.hasOption("webgraphdb")
-        || !line.hasOption("output") || !line.hasOption("basicfields")) {
-        HelpFormatter formatter = new HelpFormatter();
-        formatter.printHelp("AnchorFields", options);
-        return -1;
-      }
-
-      String webGraphDb = line.getOptionValue("webgraphdb");
-      String output = line.getOptionValue("output");
-      String basicFields = line.getOptionValue("basicfields");
-
-      createFields(new Path(webGraphDb), new Path(basicFields),
-        new Path(output));
-      return 0;
-    }
-    catch (Exception e) {
-      LOG.fatal("AnchorFields: " + StringUtils.stringifyException(e));
-      return -2;
-    }
-  }
-}
Index: src/java/org/apache/nutch/indexer/field/CustomFields.java
===================================================================
--- src/java/org/apache/nutch/indexer/field/CustomFields.java	(revision 959954)
+++ src/java/org/apache/nutch/indexer/field/CustomFields.java	(working copy)
@@ -1,441 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.indexer.field;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.ArrayList;
-import java.util.Enumeration;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Map;
-import java.util.Properties;
-import java.util.Random;
-import java.util.Set;
-
-import org.apache.commons.cli.CommandLine;
-import org.apache.commons.cli.CommandLineParser;
-import org.apache.commons.cli.GnuParser;
-import org.apache.commons.cli.HelpFormatter;
-import org.apache.commons.cli.Option;
-import org.apache.commons.cli.OptionBuilder;
-import org.apache.commons.cli.Options;
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.conf.Configured;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.LongWritable;
-import org.apache.hadoop.io.ObjectWritable;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.io.Writable;
-import org.apache.hadoop.mapred.FileInputFormat;
-import org.apache.hadoop.mapred.FileOutputFormat;
-import org.apache.hadoop.mapred.JobClient;
-import org.apache.hadoop.mapred.JobConf;
-import org.apache.hadoop.mapred.Mapper;
-import org.apache.hadoop.mapred.OutputCollector;
-import org.apache.hadoop.mapred.Reducer;
-import org.apache.hadoop.mapred.Reporter;
-import org.apache.hadoop.mapred.SequenceFileInputFormat;
-import org.apache.hadoop.mapred.SequenceFileOutputFormat;
-import org.apache.hadoop.mapred.TextInputFormat;
-import org.apache.hadoop.util.StringUtils;
-import org.apache.hadoop.util.Tool;
-import org.apache.hadoop.util.ToolRunner;
-import org.apache.nutch.util.NutchConfiguration;
-import org.apache.nutch.util.NutchJob;
-
-/**
- * Creates custom FieldWritable objects from a text file containing field
- * information including field name, value, and optional boost and fields type
- * (as needed by FieldWritable objects).
- * 
- * An input text file to CustomFields would be tab separated and would look
- * similar to this:
- * 
- * <pre> 
- * http://www.apache.org\tlang\ten\t5.0\tCONTENT
- * http://lucene.apache.org\tlang\tde
- * </pre>
- * 
- * The only required fields are url, name and value. Custom fields are
- * configured through the custom-fields.xml file in the classpath. The config
- * file allow you to set defaults for whether a field is indexed, stored, and
- * tokenized, boosts on a field, and whether a field can output multiple values
- * under the same key.
- * 
- * The purpose of the CustomFields job is to allow better integration with
- * technologies such as Hadoop Streaming. Streaming jobs can be created in any
- * programming language, can output the text file needed by the CustomFields
- * job, and those fields can then be included in the index.
- * 
- * The concept of custom fields requires two separate pieces. The indexing piece
- * and the query piece. The indexing piece is handled by the CustomFields job.
- * The query piece is handled by the query-custom plugin.
- * 
- * <b>Important:</b><br> <i>Currently, because of the way the query plugin
- * architecture works, custom fields names must be added to the fields parameter
- * in the query-custom plugin plugin.xml file in order to be queried.</i>
- * 
- * The CustomFields tool accepts one or more directories containing text files
- * in the appropriate custom field format. These files are then turned into
- * FieldWritable objects to be included in the index.
- */
-public class CustomFields
-  extends Configured
-  implements Tool {
-
-  public static final Log LOG = LogFactory.getLog(CustomFields.class);
-
-  /**
-   * MapReduce job that converts text values into FieldWritable objects.
-   * 
-   * @param inputs The directories with text files to convert.
-   * @param output The converter output directory.
-   * 
-   * @throws IOException If an error occurs while converting.
-   */
-  private void runConverter(Path[] inputs, Path output)
-    throws IOException {
-
-    JobConf converter = new NutchJob(getConf());
-    converter.setJobName("CustomFields Converter");
-    for (int i = 0; i < inputs.length; i++) {
-      FileInputFormat.addInputPath(converter, inputs[i]);
-    }
-    FileOutputFormat.setOutputPath(converter, output);
-    converter.setInputFormat(TextInputFormat.class);
-    converter.setMapperClass(Converter.class);
-    converter.setReducerClass(Converter.class);
-    converter.setMapOutputKeyClass(Text.class);
-    converter.setMapOutputValueClass(FieldWritable.class);
-    converter.setOutputKeyClass(Text.class);
-    converter.setOutputValueClass(FieldWritable.class);
-    converter.setOutputFormat(SequenceFileOutputFormat.class);
-
-    LOG.info("Starting converter job");
-    try {
-      JobClient.runJob(converter);
-    }
-    catch (IOException e) {
-      LOG.error(StringUtils.stringifyException(e));
-      throw e;
-    }
-    LOG.info("Finished converter job.");
-  }
-
-  /**
-   * Aggregated multiple FieldWritable objects with the same name. Depending on
-   * settings in the custom-fields.xml file, a field may one or more fields.
-   * This jobs aggregates fields and then collects based on the configuration
-   * settings.
-   * 
-   * @param basicFields The basicfields FieldWritable objects.
-   * @param converted The converted custom field objects.
-   * @param output The final output directory for custom field objects.
-   * 
-   * @throws IOException If an error occurs while converting.
-   */
-  private void runCollector(Path basicFields, Path converted, Path output)
-    throws IOException {
-
-    JobConf collector = new NutchJob(getConf());
-    collector.setJobName("CustomFields Collector");
-    FileInputFormat.addInputPath(collector, converted);
-    FileInputFormat.addInputPath(collector, basicFields);
-    FileOutputFormat.setOutputPath(collector, output);
-    collector.setInputFormat(SequenceFileInputFormat.class);
-    collector.setMapOutputKeyClass(Text.class);
-    collector.setMapOutputValueClass(ObjectWritable.class);
-    collector.setMapperClass(Collector.class);
-    collector.setReducerClass(Collector.class);
-    collector.setOutputKeyClass(Text.class);
-    collector.setOutputValueClass(FieldWritable.class);
-    collector.setOutputFormat(SequenceFileOutputFormat.class);
-
-    LOG.info("Starting collector job");
-    try {
-      JobClient.runJob(collector);
-    }
-    catch (IOException e) {
-      LOG.error(StringUtils.stringifyException(e));
-      throw e;
-    }
-    LOG.info("Finished collector job.");
-  }
-
-  /**
-   * Converts text values into FieldWritable objects.
-   */
-  public static class Converter
-    extends Configured
-    implements Mapper<LongWritable, Text, Text, FieldWritable>,
-    Reducer<Text, FieldWritable, Text, FieldWritable> {
-
-    private JobConf conf;
-    private Map<String, boolean[]> flagMap = new HashMap<String, boolean[]>();
-    private Set<String> multiFields = new HashSet<String>();
-
-    public Converter() {
-    }
-
-    public Converter(Configuration conf) {
-      setConf(conf);
-    }
-
-    public void configure(JobConf conf) {
-
-      try {
-
-        // get the file system and the configuration file from the classpath
-        this.conf = conf;
-        FileSystem fs = FileSystem.get(conf);
-        String configFile = conf.get("custom.fields.config",
-          "custom-fields.xml");
-        LOG.info("Reading configuration field configuration from " + configFile);
-        Properties customFieldProps = new Properties();
-        InputStream fis = conf.getConfResourceAsInputStream(configFile);
-        if (fis == null) {
-          throw new IOException("Was unable to open " + configFile);
-        }
-
-        // load the configuration file as properties
-        customFieldProps.loadFromXML(fis);
-
-        // loop through the properties setting field flags
-        Enumeration propKeys = customFieldProps.keys();
-        while (propKeys.hasMoreElements()) {
-          String prop = (String)propKeys.nextElement();
-          if (prop.endsWith(".name")) {
-            String propName = prop.substring(0, prop.length() - 5);
-            String name = customFieldProps.getProperty(prop);
-
-            String indexedProp = customFieldProps.getProperty(propName
-              + ".indexed");
-            String storedProp = customFieldProps.getProperty(propName
-              + ".stored");
-            String tokProp = customFieldProps.getProperty(propName
-              + ".tokenized");
-            boolean indexed = (indexedProp.equalsIgnoreCase("yes")
-              || indexedProp.equalsIgnoreCase("true") || indexedProp.equalsIgnoreCase("on"));
-            boolean stored = (storedProp.equalsIgnoreCase("yes")
-              || storedProp.equalsIgnoreCase("true") || storedProp.equalsIgnoreCase("on"));
-            boolean tokenized = (tokProp.equalsIgnoreCase("yes")
-              || tokProp.equalsIgnoreCase("true") || tokProp.equalsIgnoreCase("on"));
-            boolean[] flags = {indexed, stored, tokenized};
-            flagMap.put(name, flags);
-
-            String multiProp = customFieldProps.getProperty(propName + ".multi");
-            boolean multi = (multiProp.equalsIgnoreCase("yes")
-              || multiProp.equalsIgnoreCase("true") || multiProp.equalsIgnoreCase("on"));
-            if (multi) {
-              multiFields.add(name);
-            }
-          }
-        }
-      }
-      catch (Exception e) {
-        LOG.error("Error loading custom field properties:\n"
-          + StringUtils.stringifyException(e));
-      }
-    }
-
-    public void map(LongWritable key, Text value,
-      OutputCollector<Text, FieldWritable> output, Reporter reporter)
-      throws IOException {
-
-      // split the file on tabs
-      String line = value.toString();
-      String[] fields = line.split("\t");
-      if (fields.length >= 3) {
-
-        // fields must be in a specific order, default values for optional fields
-        String url = fields[0];
-        String fieldName = fields[1];
-        String fieldVal = fields[2];
-        String fieldScore = (fields.length > 3 ? fields[3] : null);
-        String fieldType = (fields.length > 4 ? fields[4] : "CONTENT").toUpperCase();
-
-        // creates the FieldWritable objects and collects
-        boolean[] flags = flagMap.get(fieldName);
-        if (flags != null) {
-          FieldWritable field = null;
-          if (fieldScore != null) {
-            field = new FieldWritable(fieldName, fieldVal,
-              FieldType.valueOf(fieldType), Float.parseFloat(fieldScore),
-              flags[0], flags[1], flags[2]);
-          }
-          else {
-            field = new FieldWritable(fieldName, fieldVal,
-              FieldType.valueOf(fieldType), flags[0], flags[1], flags[2]);
-          }
-          output.collect(new Text(url), field);
-        }
-      }
-    }
-
-    public void reduce(Text key, Iterator<FieldWritable> values,
-      OutputCollector<Text, FieldWritable> output, Reporter reporter)
-      throws IOException {
-
-      // if multiple fields are allowed collect all of them, if not allowed
-      // and multiple fields are present all of the values are ignored
-      Set<String> multiSet = new HashSet<String>();
-      while (values.hasNext()) {
-        FieldWritable field = values.next();
-        String name = field.getName();
-        boolean isMulti = multiFields.contains(name);
-        if (isMulti || (!isMulti && !multiSet.contains(name))) {
-          output.collect(key, field);
-          multiSet.add(name);
-        }
-        else {
-          LOG.info("Ignoring multiple " + name + " fields for "
-            + key.toString());
-        }
-      }
-    }
-
-    public void close() {
-    }
-  }
-
-  /**
-   * Aggregates FieldWritable objects by the same name for the same URL.  These
-   * objects are them filtered for multiple values against configuration 
-   * settings.
-   */
-  public static class Collector
-    extends Configured
-    implements Mapper<Text, Writable, Text, ObjectWritable>,
-    Reducer<Text, ObjectWritable, Text, FieldWritable> {
-
-    private JobConf conf;
-
-    public void configure(JobConf conf) {
-      this.conf = conf;
-    }
-
-    public void close() {
-    }
-
-    public void map(Text key, Writable value,
-      OutputCollector<Text, ObjectWritable> output, Reporter reporter)
-      throws IOException {
-
-      ObjectWritable objWrite = new ObjectWritable();
-      objWrite.set(value);
-      output.collect(key, objWrite);
-    }
-
-    public void reduce(Text key, Iterator<ObjectWritable> values,
-      OutputCollector<Text, FieldWritable> output, Reporter reporter)
-      throws IOException {
-
-      FieldsWritable basicFields = null;
-      List<FieldWritable> customFields = new ArrayList<FieldWritable>();
-
-      while (values.hasNext()) {
-        ObjectWritable objWrite = values.next();
-        Object obj = objWrite.get();
-        if (obj instanceof FieldWritable) {
-          customFields.add((FieldWritable)obj);
-        }
-        else if (obj instanceof FieldsWritable) {
-          basicFields = (FieldsWritable)obj;
-        }
-      }
-
-      if (basicFields != null && customFields.size() > 0) {
-        for (int i = 0; i < customFields.size(); i++) {
-          output.collect(key, customFields.get(i));
-        }
-      }
-    }
-  }
-
-  void createFields(Path basicFields, Path[] inputs, Path output)
-    throws IOException {
-
-    Configuration conf = getConf();
-    FileSystem fs = FileSystem.get(conf);
-    Path tempFields = new Path(output + "-"
-      + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
-    runConverter(inputs, tempFields);
-    runCollector(basicFields, tempFields, output);
-    fs.delete(tempFields, true);
-  }
-
-  public static void main(String[] args)
-    throws Exception {
-    int res = ToolRunner.run(NutchConfiguration.create(), new CustomFields(),
-      args);
-    System.exit(res);
-  }
-
-  /**
-   * Runs the CustomFields job.
-   */
-  public int run(String[] args)
-    throws Exception {
-
-    Options options = new Options();
-    Option helpOpts = OptionBuilder.withArgName("help").withDescription(
-      "show this help message").create("help");
-    Option outputOpts = OptionBuilder.withArgName("output").hasArg().withDescription(
-      "the output index directory").create("output");
-    Option inputOpts = OptionBuilder.withArgName("input").hasArgs().withDescription(
-      "the input directories with text field files").create("input");
-    Option basicFieldOpts = OptionBuilder.withArgName("basicfields").hasArg().withDescription(
-      "the basicfields to use").create("basicfields");
-    options.addOption(helpOpts);
-    options.addOption(inputOpts);
-    options.addOption(basicFieldOpts);
-    options.addOption(outputOpts);
-
-    CommandLineParser parser = new GnuParser();
-    try {
-
-      CommandLine line = parser.parse(options, args);
-      if (line.hasOption("help") || !line.hasOption("output")
-        || !line.hasOption("basicfields")) {
-        HelpFormatter formatter = new HelpFormatter();
-        formatter.printHelp("CustomFields", options);
-        return -1;
-      }
-
-      String[] inputs = line.getOptionValues("input");
-      Path[] inputPaths = new Path[inputs.length];
-      for (int i = 0; i < inputs.length; i++) {
-        inputPaths[i] = new Path(inputs[i]);
-      }
-      String output = line.getOptionValue("output");
-      String basicFields = line.getOptionValue("basicfields");
-
-      createFields(new Path(basicFields), inputPaths, new Path(output));
-      return 0;
-    }
-    catch (Exception e) {
-      LOG.fatal("CustomFields: " + StringUtils.stringifyException(e));
-      return -2;
-    }
-  }
-}
Index: src/java/org/apache/nutch/indexer/field/FieldIndexer.java
===================================================================
--- src/java/org/apache/nutch/indexer/field/FieldIndexer.java	(revision 959954)
+++ src/java/org/apache/nutch/indexer/field/FieldIndexer.java	(working copy)
@@ -1,325 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.indexer.field;
-
-import java.io.DataInput;
-import java.io.DataOutput;
-import java.io.File;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Random;
-
-import org.apache.commons.cli.CommandLine;
-import org.apache.commons.cli.CommandLineParser;
-import org.apache.commons.cli.GnuParser;
-import org.apache.commons.cli.HelpFormatter;
-import org.apache.commons.cli.Option;
-import org.apache.commons.cli.OptionBuilder;
-import org.apache.commons.cli.Options;
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.conf.Configured;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.io.Writable;
-import org.apache.hadoop.io.WritableComparable;
-import org.apache.hadoop.io.WritableUtils;
-import org.apache.hadoop.mapred.FileInputFormat;
-import org.apache.hadoop.mapred.FileOutputFormat;
-import org.apache.hadoop.mapred.JobClient;
-import org.apache.hadoop.mapred.JobConf;
-import org.apache.hadoop.mapred.Mapper;
-import org.apache.hadoop.mapred.OutputCollector;
-import org.apache.hadoop.mapred.RecordWriter;
-import org.apache.hadoop.mapred.Reducer;
-import org.apache.hadoop.mapred.Reporter;
-import org.apache.hadoop.mapred.SequenceFileInputFormat;
-import org.apache.hadoop.util.Progressable;
-import org.apache.hadoop.util.StringUtils;
-import org.apache.hadoop.util.Tool;
-import org.apache.hadoop.util.ToolRunner;
-import org.apache.lucene.document.Document;
-import org.apache.lucene.index.IndexWriter;
-import org.apache.lucene.index.IndexWriter.MaxFieldLength;
-import org.apache.lucene.store.FSDirectory;
-import org.apache.nutch.analysis.AnalyzerFactory;
-import org.apache.nutch.analysis.NutchAnalyzer;
-import org.apache.nutch.analysis.NutchDocumentAnalyzer;
-import org.apache.nutch.indexer.IndexingException;
-import org.apache.nutch.indexer.NutchSimilarity;
-import org.apache.nutch.util.LogUtil;
-import org.apache.nutch.util.NutchConfiguration;
-import org.apache.nutch.util.NutchJob;
-
-public class FieldIndexer
-  extends Configured
-  implements Tool, Mapper<Text, Writable, Text, FieldWritable>,
-  Reducer<Text, FieldWritable, Text, FieldIndexer.LuceneDocumentWrapper> {
-
-  public static final Log LOG = LogFactory.getLog(FieldIndexer.class);
-  public static final String DONE_NAME = "index.done";
-
-  private FieldFilters fieldFilters;
-
-  public static class LuceneDocumentWrapper
-    implements Writable {
-    private Document doc;
-
-    public LuceneDocumentWrapper(Document doc) {
-      this.doc = doc;
-    }
-
-    public Document get() {
-      return doc;
-    }
-
-    public void readFields(DataInput in)
-      throws IOException {
-      // intentionally left blank
-    }
-
-    public void write(DataOutput out)
-      throws IOException {
-      // intentionally left blank
-    }
-
-  }
-
-  public static class OutputFormat
-    extends FileOutputFormat<WritableComparable, LuceneDocumentWrapper> {
-
-    public RecordWriter<WritableComparable, LuceneDocumentWrapper> getRecordWriter(
-      final FileSystem fs, JobConf job, String name, final Progressable progress)
-      throws IOException {
-
-      final Path perm = new Path(FileOutputFormat.getOutputPath(job), name);
-      final Path temp = job.getLocalPath("index/_"
-        + Integer.toString(new Random().nextInt()));
-
-      fs.delete(perm, true); // delete old, if any
-
-      final AnalyzerFactory factory = new AnalyzerFactory(job);
-      final IndexWriter writer = // build locally first
-      new IndexWriter(
-        FSDirectory.open(new File(fs.startLocalOutput(perm, temp).toString())),
-        new NutchDocumentAnalyzer(job), true, 
-        new MaxFieldLength(IndexWriter.DEFAULT_MAX_FIELD_LENGTH));
-
-      writer.setMergeFactor(job.getInt("indexer.mergeFactor", 10));
-      writer.setMaxBufferedDocs(job.getInt("indexer.minMergeDocs", 100));
-      writer.setMaxMergeDocs(job.getInt("indexer.maxMergeDocs",
-        Integer.MAX_VALUE));
-      writer.setTermIndexInterval(job.getInt("indexer.termIndexInterval", 128));
-      writer.setMaxFieldLength(job.getInt("indexer.max.tokens", 10000));
-      writer.setInfoStream(LogUtil.getInfoStream(LOG));
-      writer.setUseCompoundFile(false);
-      writer.setSimilarity(new NutchSimilarity());
-
-      return new RecordWriter<WritableComparable, LuceneDocumentWrapper>() {
-        boolean closed;
-
-        public void write(WritableComparable key, LuceneDocumentWrapper value)
-          throws IOException { // unwrap & index doc
-          Document doc = value.get();
-          NutchAnalyzer analyzer = factory.get(doc.get("lang"));
-          if (LOG.isInfoEnabled()) {
-            LOG.info(" Indexing [" + doc.getField("url").stringValue() + "]"
-              + " with analyzer " + analyzer);
-          }
-          writer.addDocument(doc, analyzer);
-          progress.progress();
-        }
-
-        public void close(final Reporter reporter)
-          throws IOException {
-
-          // spawn a thread to give progress heartbeats
-          Thread prog = new Thread() {
-            public void run() {
-              while (!closed) {
-                try {
-                  reporter.setStatus("closing");
-                  Thread.sleep(1000);
-                }
-                catch (InterruptedException e) {
-                  continue;
-                }
-                catch (Throwable e) {
-                  return;
-                }
-              }
-            }
-          };
-
-          try {
-            prog.start();
-            if (LOG.isInfoEnabled()) {
-              LOG.info("Optimizing index.");
-            }
-            // optimize & close index
-            writer.optimize();
-            writer.close();
-            fs.completeLocalOutput(perm, temp); // copy to dfs
-            fs.createNewFile(new Path(perm, DONE_NAME));
-          }
-          finally {
-            closed = true;
-          }
-        }
-      };
-    }
-  }
-
-  public FieldIndexer() {
-
-  }
-
-  public FieldIndexer(Configuration conf) {
-    setConf(conf);
-  }
-
-  public void configure(JobConf job) {
-    setConf(job);
-    this.fieldFilters = new FieldFilters(job);
-  }
-
-  public void close() {
-  }
-
-  public void map(Text key, Writable value,
-    OutputCollector<Text, FieldWritable> output, Reporter reporter)
-    throws IOException {
-
-    if (value instanceof FieldsWritable) {
-      FieldsWritable fields = (FieldsWritable)value;
-      List<FieldWritable> fieldsList = fields.getFieldsList();
-      for (FieldWritable field : fieldsList) {
-        output.collect(key, field);
-      }
-    }
-    else if (value instanceof FieldWritable) {
-      output.collect(key, (FieldWritable)value);
-    }
-  }
-
-  public void reduce(Text key, Iterator<FieldWritable> values,
-    OutputCollector<Text, LuceneDocumentWrapper> output, Reporter reporter)
-    throws IOException {
-
-    Document doc = new Document();
-    List<FieldWritable> fieldsList = new ArrayList<FieldWritable>();
-    Configuration conf = getConf();
-
-    while (values.hasNext()) {
-      FieldWritable field = values.next();
-      fieldsList.add((FieldWritable)WritableUtils.clone(field, conf));
-    }
-
-    try {
-      doc = fieldFilters.filter(key.toString(), doc, fieldsList);
-    }
-    catch (IndexingException e) {
-      throw new IOException(e);
-    }
-    
-    if (doc != null) {
-      output.collect(key, new LuceneDocumentWrapper(doc));
-    }
-  }
-
-  public void index(Path[] fields, Path indexDir)
-    throws IOException {
-
-    LOG.info("FieldIndexer: starting");
-
-    JobConf job = new NutchJob(getConf());
-    job.setJobName("FieldIndexer: " + indexDir);
-
-    for (int i = 0; i < fields.length; i++) {
-      Path fieldsDb = fields[i];
-      LOG.info("FieldIndexer: adding fields db: " + fieldsDb);
-      FileInputFormat.addInputPath(job, fieldsDb);
-    }
-
-    job.setInputFormat(SequenceFileInputFormat.class);
-    job.setMapperClass(FieldIndexer.class);
-    job.setReducerClass(FieldIndexer.class);
-    FileOutputFormat.setOutputPath(job, indexDir);
-    job.setOutputFormat(OutputFormat.class);
-    job.setMapOutputKeyClass(Text.class);
-    job.setMapOutputValueClass(FieldWritable.class);
-    job.setOutputKeyClass(Text.class);
-    job.setOutputValueClass(LuceneDocumentWrapper.class);
-
-    JobClient.runJob(job);
-    if (LOG.isInfoEnabled()) {
-      LOG.info("FieldIndexer: done");
-    }
-  }
-
-  public static void main(String[] args)
-    throws Exception {
-    int res = ToolRunner.run(NutchConfiguration.create(), new FieldIndexer(),
-      args);
-    System.exit(res);
-  }
-
-  public int run(String[] args)
-    throws Exception {
-
-    Options options = new Options();
-    Option helpOpts = OptionBuilder.withArgName("help").withDescription(
-      "show this help message").create("help");
-    Option outputOpts = OptionBuilder.withArgName("output").hasArg().withDescription(
-      "the output index directory").create("output");
-    Option fieldOpts = OptionBuilder.withArgName("fields").hasArgs().withDescription(
-      "the field database(s) to use").create("fields");
-    options.addOption(helpOpts);
-    options.addOption(fieldOpts);
-    options.addOption(outputOpts);
-
-    CommandLineParser parser = new GnuParser();
-    try {
-
-      CommandLine line = parser.parse(options, args);
-      if (line.hasOption("help") || !line.hasOption("fields")
-        || !line.hasOption("output")) {
-        HelpFormatter formatter = new HelpFormatter();
-        formatter.printHelp("FieldIndexer", options);
-        return -1;
-      }
-
-      Path output = new Path(line.getOptionValue("output"));
-      String[] fields = line.getOptionValues("fields");
-      Path[] fieldPaths = new Path[fields.length];
-      for (int i = 0; i < fields.length; i++) {
-        fieldPaths[i] = new Path(fields[i]);
-      }
-
-      index(fieldPaths, output);
-      return 0;
-    }
-    catch (Exception e) {
-      LOG.fatal("FieldIndexer: " + StringUtils.stringifyException(e));
-      return -2;
-    }
-  }
-}
Index: src/java/org/apache/nutch/indexer/field/FieldType.java
===================================================================
--- src/java/org/apache/nutch/indexer/field/FieldType.java	(revision 959954)
+++ src/java/org/apache/nutch/indexer/field/FieldType.java	(working copy)
@@ -1,30 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.indexer.field;
-
-/**
- * The different types of fields. Different types of fields will be handled by
- * different FieldFilter implementations during indexing.
- */
-public enum FieldType {
-  
-  CONTENT,
-  BOOST,
-  COMPUTATION,
-  ACTION;
-  
-}
Index: src/java/org/apache/nutch/indexer/field/Fields.java
===================================================================
--- src/java/org/apache/nutch/indexer/field/Fields.java	(revision 959954)
+++ src/java/org/apache/nutch/indexer/field/Fields.java	(working copy)
@@ -1,40 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.indexer.field;
-
-public interface Fields {
-
-  // names of common fields
-  public static final String ANCHOR = "anchor";
-  public static final String SEGMENT = "segment";
-  public static final String DIGEST = "digest";
-  public static final String HOST = "host";
-  public static final String SITE = "site";
-  public static final String URL = "url";
-  public static final String ORIG_URL = "orig";
-  public static final String SEG_URL = "segurl";
-  public static final String CONTENT = "content";
-  public static final String TITLE = "title";
-  public static final String CACHE = "cache";
-  public static final String TSTAMP = "tstamp";
-  public static final String BOOSTFACTOR = "boostfactor";
-  
-  // special fields for indexer
-  public static final String BOOST = "boost";
-  public static final String COMPUTATION = "computation";
-  public static final String ACTION = "action";
-}
Index: src/java/org/apache/nutch/indexer/field/BasicFields.java
===================================================================
--- src/java/org/apache/nutch/indexer/field/BasicFields.java	(revision 959954)
+++ src/java/org/apache/nutch/indexer/field/BasicFields.java	(working copy)
@@ -1,781 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.indexer.field;
-
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Random;
-
-import org.apache.commons.cli.CommandLine;
-import org.apache.commons.cli.CommandLineParser;
-import org.apache.commons.cli.GnuParser;
-import org.apache.commons.cli.HelpFormatter;
-import org.apache.commons.cli.Option;
-import org.apache.commons.cli.OptionBuilder;
-import org.apache.commons.cli.Options;
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.conf.Configured;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.ObjectWritable;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.io.Writable;
-import org.apache.hadoop.io.WritableUtils;
-import org.apache.hadoop.mapred.FileInputFormat;
-import org.apache.hadoop.mapred.FileOutputFormat;
-import org.apache.hadoop.mapred.JobClient;
-import org.apache.hadoop.mapred.JobConf;
-import org.apache.hadoop.mapred.Mapper;
-import org.apache.hadoop.mapred.OutputCollector;
-import org.apache.hadoop.mapred.Reducer;
-import org.apache.hadoop.mapred.Reporter;
-import org.apache.hadoop.mapred.SequenceFileInputFormat;
-import org.apache.hadoop.mapred.SequenceFileOutputFormat;
-import org.apache.hadoop.util.StringUtils;
-import org.apache.hadoop.util.Tool;
-import org.apache.hadoop.util.ToolRunner;
-import org.apache.lucene.document.DateTools;
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.metadata.Metadata;
-import org.apache.nutch.metadata.Nutch;
-import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseData;
-import org.apache.nutch.parse.ParseImpl;
-import org.apache.nutch.parse.ParseText;
-import org.apache.nutch.scoring.webgraph.LinkDatum;
-import org.apache.nutch.scoring.webgraph.Node;
-import org.apache.nutch.scoring.webgraph.WebGraph;
-import org.apache.nutch.util.NutchConfiguration;
-import org.apache.nutch.util.NutchJob;
-import org.apache.nutch.util.URLUtil;
-
-/**
- * Creates the basic FieldWritable objects.  The basic fields are the main 
- * fields used in indexing segments.  Many other fields jobs will rely on the
- * urls being present in the basic fields output to create their fields for 
- * indexing.
- * 
- * Basic fields are extracted from segements.  Only urls that were successfully
- * fetched and parsed will be converted.  This job also implements a portion of
- * redirect logic.  If a url contains both a redirect or orig url then both the
- * url and its orig will be measured against their link analysis score with the
- * highest scoring one being the url used for display in the index.  This 
- * ensures that we index content under the best, most popular, url which is most
- * often the one users are expecting.
- * 
- * The BasicFields tool can accept one or more segments to convert to fields.
- * If multiple segments have overlapping content, only the latest successfully
- * fetched content will be converted.
- */
-public class BasicFields
-  extends Configured
-  implements Tool {
-
-  public static final Log LOG = LogFactory.getLog(BasicFields.class);
-
-  /**
-   * Runs the Extractor job. Extracts basic fields from segments.
-   * 
-   * @param nodeDb The node database
-   * @param segment A single segment to process.
-   * @param outputDir The extractor output.
-   * 
-   * @throws IOException If an error occurs while processing the segment.
-   */
-  private void runExtractor(Path nodeDb, Path segment, Path outputDir)
-    throws IOException {
-
-    LOG.info("BasicFields: starting extractor");
-    JobConf job = new NutchJob(getConf());
-    job.setJobName("BasicFields " + outputDir);
-
-    LOG.info("BasicFields: extractor adding segment: " + segment);
-    FileInputFormat.addInputPath(job, new Path(segment,
-      CrawlDatum.FETCH_DIR_NAME));
-    FileInputFormat.addInputPath(job, new Path(segment, ParseData.DIR_NAME));
-    FileInputFormat.addInputPath(job, new Path(segment, ParseText.DIR_NAME));
-    FileInputFormat.addInputPath(job, nodeDb);
-    job.setInputFormat(SequenceFileInputFormat.class);
-    job.setMapperClass(Extractor.class);
-    job.setReducerClass(Extractor.class);
-    FileOutputFormat.setOutputPath(job, outputDir);
-    job.setOutputFormat(SequenceFileOutputFormat.class);
-    job.setMapOutputKeyClass(Text.class);
-    job.setMapOutputValueClass(ObjectWritable.class);
-    job.setOutputKeyClass(Text.class);
-    job.setOutputValueClass(FieldsWritable.class);
-
-    JobClient.runJob(job);
-    if (LOG.isInfoEnabled()) {
-      LOG.info("BasicFields: finished extractor");
-    }
-  }
-
-  /**
-   * Runs the Flipper job. Flipper is the first of a two part job to implement
-   * redirect logic.
-   * 
-   * @param basicFields The basic fields temporary output.
-   * @param nodeDb The node database.
-   * @param outputDir The flipper output.
-   * 
-   * @throws IOException If an error occurs while processing.
-   */
-  private void runFlipper(Path basicFields, Path nodeDb, Path outputDir)
-    throws IOException {
-
-    LOG.info("BasicFields: starting flipper");
-    JobConf job = new NutchJob(getConf());
-    job.setJobName("BasicFields " + outputDir);
-    FileInputFormat.addInputPath(job, nodeDb);
-    FileInputFormat.addInputPath(job, basicFields);
-    job.setInputFormat(SequenceFileInputFormat.class);
-    job.setMapperClass(Flipper.class);
-    job.setReducerClass(Flipper.class);
-    FileOutputFormat.setOutputPath(job, outputDir);
-    job.setOutputFormat(SequenceFileOutputFormat.class);
-    job.setMapOutputKeyClass(Text.class);
-    job.setMapOutputValueClass(ObjectWritable.class);
-    job.setOutputKeyClass(Text.class);
-    job.setOutputValueClass(LinkDatum.class);
-
-    JobClient.runJob(job);
-    if (LOG.isInfoEnabled()) {
-      LOG.info("BasicFields: finished flipper");
-    }
-  }
-
-  /**
-   * Runs the Scorer job. Scorer is the second of a two part job to implement
-   * redirect logic.
-   * 
-   * @param basicFields The basic fields temporary output.
-   * @param links The temporary output holding urls and any redirects.
-   * @param outputDir The scorer output.
-   * 
-   * @throws IOException If an error occurs while processing.
-   */
-  private void runScorer(Path basicFields, Path links, Path outputDir)
-    throws IOException {
-
-    LOG.info("BasicFields: starting scorer");
-    JobConf job = new NutchJob(getConf());
-    job.setJobName("BasicFields " + outputDir);
-    FileInputFormat.addInputPath(job, links);
-    FileInputFormat.addInputPath(job, basicFields);
-    job.setInputFormat(SequenceFileInputFormat.class);
-    job.setMapperClass(Scorer.class);
-    job.setReducerClass(Scorer.class);
-    FileOutputFormat.setOutputPath(job, outputDir);
-    job.setOutputFormat(SequenceFileOutputFormat.class);
-    job.setMapOutputKeyClass(Text.class);
-    job.setMapOutputValueClass(ObjectWritable.class);
-    job.setOutputKeyClass(Text.class);
-    job.setOutputValueClass(FieldsWritable.class);
-
-    JobClient.runJob(job);
-    if (LOG.isInfoEnabled()) {
-      LOG.info("BasicFields: finished scorer");
-    }
-  }
-
-  /**
-   * Runs the Merger job. Merger ensures that the most recent set of fields for
-   * any given url is collected.
-   * 
-   * @param basicFields The basic fields final output.
-   * @param outputDir The merger output.
-   * 
-   * @throws IOException If an error occurs while processing.
-   */
-  private void runMerger(Path[] basicFields, Path outputDir)
-    throws IOException {
-
-    LOG.info("BasicFields: starting merger");
-    JobConf job = new NutchJob(getConf());
-    job.setJobName("BasicFields " + outputDir);
-    for (Path basic : basicFields) {
-      FileInputFormat.addInputPath(job, basic);
-    }
-    job.setInputFormat(SequenceFileInputFormat.class);
-    job.setReducerClass(Merger.class);
-    FileOutputFormat.setOutputPath(job, outputDir);
-    job.setOutputFormat(SequenceFileOutputFormat.class);
-    job.setOutputKeyClass(Text.class);
-    job.setOutputValueClass(FieldsWritable.class);
-
-    JobClient.runJob(job);
-    if (LOG.isInfoEnabled()) {
-      LOG.info("BasicFields: finished merger");
-    }
-  }
-
-  /**
-   * Extracts basic fields from a single segment.
-   */
-  private static class Extractor
-    extends Configured
-    implements Mapper<Text, Writable, Text, ObjectWritable>,
-    Reducer<Text, ObjectWritable, Text, FieldsWritable> {
-
-    private int MAX_TITLE_LENGTH;
-    private Configuration conf;
-
-    /**
-     * Default constructor.
-     */
-    public Extractor() {
-
-    }
-
-    /**
-     * Configurable constructor.
-     */
-    public Extractor(Configuration conf) {
-      setConf(conf);
-    }
-
-    /**
-     * Configures the job.
-     */
-    public void configure(JobConf conf) {
-      this.conf = conf;
-      this.MAX_TITLE_LENGTH = conf.getInt("indexer.max.title.length", 100);
-    }
-
-    public void close() {
-    }
-
-    /**
-     * Wraps values in ObjectWritable.
-     */
-    public void map(Text key, Writable value,
-      OutputCollector<Text, ObjectWritable> output, Reporter reporter)
-      throws IOException {
-      ObjectWritable objWrite = new ObjectWritable();
-      objWrite.set(value);
-      output.collect(key, objWrite);
-    }
-
-    /**
-     * Creates basic fields from a single segment.
-     */
-    public void reduce(Text key, Iterator<ObjectWritable> values,
-      OutputCollector<Text, FieldsWritable> output, Reporter reporter)
-      throws IOException {
-
-      Node nodeDb = null;
-      List<CrawlDatum> fetchDatums = new ArrayList<CrawlDatum>();
-      ParseData parseData = null;
-      ParseText parseText = null;
-      List<FieldWritable> fieldsList = new ArrayList<FieldWritable>();
-
-      // assign values, url must be successfully fetched and parsed
-      while (values.hasNext()) {
-
-        ObjectWritable objWrite = values.next();
-        Object value = objWrite.get();
-        if (value instanceof CrawlDatum) {
-          CrawlDatum datum = (CrawlDatum)value;
-          if (datum.getStatus() == CrawlDatum.STATUS_FETCH_SUCCESS) {
-            fetchDatums.add(datum);
-          }
-        }
-        else if (value instanceof Node) {
-          nodeDb = (Node)value;
-        }
-        else if (value instanceof ParseData
-          && ((ParseData)value).getStatus().isSuccess()) {
-          parseData = (ParseData)value;
-        }
-        else if (value instanceof ParseText) {
-          parseText = (ParseText)value;
-        }
-      }
-
-      // if not successfully fetched and parsed then stop processing
-      int numDatums = fetchDatums.size();
-      if (numDatums == 0 || nodeDb == null || parseText == null
-        || parseData == null) {
-        return;
-      }
-
-      // get the most recent fetch time, this is duplicates inside of a single
-      // segment, usually due to redirects
-      CrawlDatum fetchDatum = null;
-      long mostRecent = 0L;
-      for (CrawlDatum cur : fetchDatums) {
-        long fetchTime = cur.getFetchTime();
-        if (fetchDatum == null || fetchTime > mostRecent) {
-          fetchDatum = cur;
-          mostRecent = fetchTime;
-        }
-      }
-
-      // get parse metadata
-      Metadata metadata = parseData.getContentMeta();
-      Parse parse = new ParseImpl(parseText, parseData);
-
-      // handle redirect urls
-      Text reprUrlText = (Text)fetchDatum.getMetaData().get(
-        Nutch.WRITABLE_REPR_URL_KEY);
-      String reprUrl = reprUrlText != null ? reprUrlText.toString() : null;
-      String url = key.toString();
-      String fieldUrl = (reprUrl != null) ? reprUrl : url;
-      String host = URLUtil.getHost(fieldUrl);
-
-      // add segment, used to map from merged index back to segment files
-      FieldWritable segField = new FieldWritable(Fields.SEGMENT,
-        metadata.get(Nutch.SEGMENT_NAME_KEY), FieldType.CONTENT, false, true,
-        false);
-      fieldsList.add(segField);
-
-      // add digest, used by dedup
-      FieldWritable digestField = new FieldWritable(Fields.DIGEST,
-        metadata.get(Nutch.SIGNATURE_KEY), FieldType.CONTENT, false, true,
-        false);
-      fieldsList.add(digestField);
-
-      // url is both stored and indexed, so it's both searchable and returned
-      fieldsList.add(new FieldWritable(Fields.URL, fieldUrl, FieldType.CONTENT,
-        true, true, true));
-      fieldsList.add(new FieldWritable(Fields.SEG_URL, url, FieldType.CONTENT,
-        false, true, false));
-
-      if (reprUrl != null) {
-        // also store original url as both stored and indexes
-        fieldsList.add(new FieldWritable(Fields.ORIG_URL, url,
-          FieldType.CONTENT, true, true, true));
-      }
-
-      if (host != null) {
-        // add host as un-stored, indexed and tokenized
-        FieldWritable hostField = new FieldWritable(Fields.HOST, host,
-          FieldType.CONTENT, true, false, true);
-        fieldsList.add(hostField);
-
-        // add site as un-stored, indexed and un-tokenized
-        FieldWritable siteField = new FieldWritable(Fields.SITE, host,
-          FieldType.CONTENT, true, false, false);
-        fieldsList.add(siteField);
-      }
-
-      // content is indexed, so that it's searchable, but not stored in index
-      fieldsList.add(new FieldWritable(Fields.CONTENT, parse.getText(),
-        FieldType.CONTENT, true, false, true));
-
-      // title
-      String title = parse.getData().getTitle();
-      if (title.length() > MAX_TITLE_LENGTH) { // truncate title if needed
-        title = title.substring(0, MAX_TITLE_LENGTH);
-      }
-      // add title indexed and stored so that it can be displayed
-      fieldsList.add(new FieldWritable(Fields.TITLE, title, FieldType.CONTENT,
-        true, true, true));
-
-      // add cached content/summary display policy, if available
-      String caching = parse.getData().getMeta(Nutch.CACHING_FORBIDDEN_KEY);
-      if (caching != null && !caching.equals(Nutch.CACHING_FORBIDDEN_NONE)) {
-        fieldsList.add(new FieldWritable(Fields.CACHE, caching,
-          FieldType.CONTENT, false, true, false));
-      }
-
-      // add timestamp when fetched, for deduplication
-      fieldsList.add(new FieldWritable(Fields.TSTAMP, DateTools.timeToString(
-        fetchDatum.getFetchTime(), DateTools.Resolution.MILLISECOND),
-        FieldType.CONTENT, false, true, false));
-
-      FieldsWritable fields = new FieldsWritable();
-      fields.setFieldsList(fieldsList);
-      output.collect(key, fields);
-    }
-  }
-
-  /**
-   * Runs the first part of redirect logic.  Breaks out fields if a page
-   * contains a redirect.
-   */
-  public static class Flipper
-    extends Configured
-    implements Mapper<Text, Writable, Text, ObjectWritable>,
-    Reducer<Text, ObjectWritable, Text, LinkDatum> {
-
-    private JobConf conf;
-
-    /**
-     * Configures the job.
-     */
-    public void configure(JobConf conf) {
-      this.conf = conf;
-    }
-
-    public void close() {
-    }
-
-    /**
-     * Breaks out the collection of fields for url and redirects if necessary.
-     */
-    public void map(Text key, Writable value,
-      OutputCollector<Text, ObjectWritable> output, Reporter reporter)
-      throws IOException {
-
-      ObjectWritable objUrl = new ObjectWritable();
-      objUrl.set(key);
-
-      if (value instanceof FieldsWritable) {
-
-        // collect the fields for the url
-        FieldsWritable fields = (FieldsWritable)value;
-        FieldWritable url = fields.getField(Fields.URL);
-        FieldWritable orig = fields.getField(Fields.ORIG_URL);
-        output.collect(new Text(url.getValue()), objUrl);
-
-        // collect for the orig / redirect url if one exists
-        if (orig != null) {
-          output.collect(new Text(orig.getValue()), objUrl);
-        }
-      }
-      else {
-        
-        // anything else passes through
-        ObjectWritable objWrite = new ObjectWritable();
-        objWrite.set(value);
-        output.collect(key, objWrite);
-      }
-    }
-
-    /**
-     * Collects redirect and original links for a given url key.  This will be
-     * used in the Scorer to handle redirects.
-     */
-    public void reduce(Text key, Iterator<ObjectWritable> values,
-      OutputCollector<Text, LinkDatum> output, Reporter reporter)
-      throws IOException {
-
-      Node node = null;
-      List<String> urls = new ArrayList<String>();
-
-      while (values.hasNext()) {
-        ObjectWritable objWrite = values.next();
-        Object obj = objWrite.get();
-        if (obj instanceof Node) {
-          node = (Node)obj;
-        }
-        else if (obj instanceof Text) {
-          urls.add(obj.toString());
-        }
-      }
-
-      if (urls.size() > 0) {
-        float score = (node != null) ? node.getInlinkScore() : 0.0f;
-        for (String url : urls) {
-          LinkDatum datum = new LinkDatum(key.toString());
-          datum.setScore(score);
-          output.collect(new Text(url), datum);
-        }
-      }
-    }
-  }
-
-  /**
-   * The Scorer job sets the boost field from the NodeDb score.
-   * 
-   * It also runs the second part of redirect logic.  Determining the highest 
-   * scoring url for pages that contain redirects.
-   */
-  public static class Scorer
-    extends Configured
-    implements Mapper<Text, Writable, Text, ObjectWritable>,
-    Reducer<Text, ObjectWritable, Text, FieldsWritable> {
-
-    private JobConf conf;
-
-    /**
-     * Configures the job.
-     */
-    public void configure(JobConf conf) {
-      this.conf = conf;
-    }
-
-    public void close() {
-    }
-
-    /**
-     * Wraps values in ObjectWritable.
-     */
-    public void map(Text key, Writable value,
-      OutputCollector<Text, ObjectWritable> output, Reporter reporter)
-      throws IOException {
-
-      ObjectWritable objWrite = new ObjectWritable();
-      objWrite.set(value);
-      output.collect(key, objWrite);
-    }
-
-    /**
-     * Sets a document boost field from the NodeDb and determines the best 
-     * scoring url for pages that have rediects.  Uses the highest scoring url 
-     * as the display url in the index.
-     */
-    public void reduce(Text key, Iterator<ObjectWritable> values,
-      OutputCollector<Text, FieldsWritable> output, Reporter reporter)
-      throws IOException {
-
-      FieldsWritable fields = null;
-      List<LinkDatum> datums = new ArrayList<LinkDatum>();
-
-      while (values.hasNext()) {
-        ObjectWritable objWrite = values.next();
-        Object obj = objWrite.get();
-        if (obj instanceof FieldsWritable) {
-          fields = (FieldsWritable)obj;
-        }
-        else if (obj instanceof LinkDatum) {
-          datums.add((LinkDatum)obj);
-        }
-      }
-
-      int numDatums = datums.size();
-      if (fields != null && numDatums > 0) {
-
-        // if no redirect for the page just assign the linkrank boost
-        List<FieldWritable> fieldsList = fields.getFieldsList();
-        if (numDatums == 1) {
-          float linkRank = datums.get(0).getScore();
-          fieldsList.add(new FieldWritable(Fields.BOOST, "linkrank",
-            FieldType.BOOST, linkRank));
-          output.collect(new Text(key), fields);
-        }
-        else {
-
-          // get both the url and any rediect url stored
-          FieldWritable url = fields.getField(Fields.URL);
-          FieldWritable orig = fields.getField(Fields.ORIG_URL);
-          float urlScore = 0.0f;
-          float origScore = 0.0f;
-
-          // get the scores for each
-          for (LinkDatum datum : datums) {
-            String curUrl = datum.getUrl();
-            if (curUrl.equals(url.getValue())) {
-              urlScore = datum.getScore();
-            }
-            else if (curUrl.equals(orig.getValue())) {
-              origScore = datum.getScore();
-            }
-          }
-
-          // if the highest scoring url is not the one currently displayed in 
-          // the index under the current basic fields, then switch it
-          String urlKey = url.getValue();
-          float linkRank = urlScore;
-          if (origScore > urlScore) {
-            url.setName(Fields.ORIG_URL);
-            orig.setName(Fields.URL);
-
-            // We also need to fix the host because we are changing urls
-            String host = URLUtil.getHost(orig.getValue());
-            if (host != null) {
-              fieldsList.remove(fields.getField(Fields.SITE));
-              fieldsList.remove(fields.getField(Fields.HOST));
-              fieldsList.add(new FieldWritable(Fields.HOST, host,
-                FieldType.CONTENT, true, false, true));
-              fieldsList.add(new FieldWritable(Fields.SITE, host,
-                FieldType.CONTENT, true, false, false));
-            }
-
-            linkRank = origScore;
-            urlKey = orig.getValue();
-          }
-
-          // create the final document boost field
-          fieldsList.add(new FieldWritable(Fields.BOOST, "linkrank",
-            FieldType.BOOST, linkRank));
-          output.collect(new Text(urlKey), fields);
-        }
-      }
-    }
-  }
-
-  /**
-   * Merges output of all segments fields collecting only the most recent set
-   * of fields for any given url.
-   */
-  public static class Merger
-    extends Configured
-    implements Reducer<Text, FieldsWritable, Text, FieldsWritable> {
-
-    private JobConf conf;
-
-    /**
-     * Configures the job.
-     */
-    public void configure(JobConf conf) {
-      this.conf = conf;
-    }
-
-    public void close() {
-    }
-
-    /**
-     * Collects the most recent set of fields for any url.
-     */
-    public void reduce(Text key, Iterator<FieldsWritable> values,
-      OutputCollector<Text, FieldsWritable> output, Reporter reporter)
-      throws IOException {
-
-      List<FieldsWritable> fields = new ArrayList<FieldsWritable>();
-
-      // collects the various sets of fields
-      while (values.hasNext()) {
-        fields.add((FieldsWritable)WritableUtils.clone(values.next(), conf));
-      }
-
-      // if only one set of fields for a given url passthrough
-      FieldsWritable outFields = null;
-      int numFields = fields.size();
-      if (numFields == 1) {
-        outFields = fields.get(0);
-      }
-      else if (numFields > 1) {
-
-        // more than one set of fields means url has been fetched more than 
-        // once, collect only the most recent set of fields
-        FieldsWritable mostRecent = null;
-        long recentTime = 0L;
-        for (int i = 0; i < numFields; i++) {
-          FieldsWritable cur = fields.get(i);
-          String tStampStr = cur.getField(Fields.TSTAMP).getValue();
-          long timestamp = Long.parseLong(tStampStr);
-          if (mostRecent == null || recentTime < timestamp) {
-            recentTime = timestamp;
-            mostRecent = cur;
-          }
-        }
-
-        outFields = mostRecent;
-      }
-
-      output.collect(key, outFields);
-    }
-  }
-
-  /**
-   * Runs the BasicFields jobs for every segment and aggregates and filters 
-   * the output to create a final database of FieldWritable objects.
-   * 
-   * @param nodeDb The node database.
-   * @param segments The array of segments to process.
-   * @param output The BasicFields output.
-   * 
-   * @throws IOException If an error occurs while processing the segments.
-   */
-  public void createFields(Path nodeDb, Path[] segments, Path output)
-    throws IOException {
-
-    Configuration conf = getConf();
-    FileSystem fs = FileSystem.get(conf);
-    Path tempOutput = new Path(output.toString() + "-temp");
-    fs.mkdirs(tempOutput);
-    int numSegments = segments.length;
-    Path[] basicFields = new Path[numSegments];
-
-    // one pass per segment to extract and create the basic fields
-    for (int i = 0; i < numSegments; i++) {
-
-      Path segment = segments[i];
-      Path segOutput = new Path(tempOutput, String.valueOf(i));
-      Path tempBasic = new Path(tempOutput, "basic-"
-        + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
-      Path tempFlip = new Path(tempOutput, "flip-"
-        + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
-
-      runExtractor(nodeDb, segment, tempBasic);
-      runFlipper(tempBasic, nodeDb, tempFlip);
-      runScorer(tempBasic, tempFlip, segOutput);
-
-      fs.delete(tempBasic, true);
-      fs.delete(tempFlip, true);
-      basicFields[i] = segOutput;
-    }
-
-    // merge all of the segments and delete any temporary output
-    runMerger(basicFields, output);
-    fs.delete(tempOutput, true);
-  }
-
-  public static void main(String[] args)
-    throws Exception {
-    int res = ToolRunner.run(NutchConfiguration.create(), new BasicFields(),
-      args);
-    System.exit(res);
-  }
-
-  /**
-   * Runs the BasicFields tool.
-   */
-  public int run(String[] args)
-    throws Exception {
-
-    Options options = new Options();
-    Option helpOpts = OptionBuilder.withArgName("help").withDescription(
-      "show this help message").create("help");
-    Option outputOpts = OptionBuilder.withArgName("output").hasArg().withDescription(
-      "the output index directory").create("output");
-    Option webGraphOpts = OptionBuilder.withArgName("webgraphdb").hasArg().withDescription(
-      "the webgraphdb to use").create("webgraphdb");
-    Option segOpts = OptionBuilder.withArgName("segment").hasArgs().withDescription(
-      "the segment(s) to use").create("segment");
-    options.addOption(helpOpts);
-    options.addOption(webGraphOpts);
-    options.addOption(segOpts);
-    options.addOption(outputOpts);
-
-    CommandLineParser parser = new GnuParser();
-    try {
-
-      CommandLine line = parser.parse(options, args);
-      if (line.hasOption("help") || !line.hasOption("webgraphdb")
-        || !line.hasOption("output") || !line.hasOption("segment")) {
-        HelpFormatter formatter = new HelpFormatter();
-        formatter.printHelp("BasicFields", options);
-        return -1;
-      }
-
-      // get the command line options and all of the segments
-      String webGraphDb = line.getOptionValue("webgraphdb");
-      String output = line.getOptionValue("output");
-      String[] segments = line.getOptionValues("segment");
-      Path[] segPaths = new Path[segments.length];
-      for (int i = 0; i < segments.length; i++) {
-        segPaths[i] = new Path(segments[i]);
-      }
-
-      createFields(new Path(webGraphDb, WebGraph.NODE_DIR), segPaths, new Path(
-        output));
-      return 0;
-    }
-    catch (Exception e) {
-      LOG.fatal("BasicFields: " + StringUtils.stringifyException(e));
-      return -2;
-    }
-  }
-}
Index: src/java/org/apache/nutch/indexer/field/FieldFilter.java
===================================================================
--- src/java/org/apache/nutch/indexer/field/FieldFilter.java	(revision 959954)
+++ src/java/org/apache/nutch/indexer/field/FieldFilter.java	(working copy)
@@ -1,58 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.indexer.field;
-
-import java.util.List;
-
-import org.apache.hadoop.conf.Configurable;
-import org.apache.lucene.document.Document;
-import org.apache.nutch.indexer.IndexingException;
-import org.apache.nutch.plugin.Pluggable;
-
-/**
- * Filter to manipulate FieldWritable objects for a given url during indexing.
- * 
- * Field filters are responsible for converting FieldWritable objects into 
- * lucene fields and adding those fields to the Lucene document.
- */
-public interface FieldFilter
-  extends Pluggable, Configurable {
-
-  final static String X_POINT_ID = FieldFilter.class.getName();
-
-  /**
-   * Returns the document to which fields are being added or null if we are to
-   * stop processing for this url and not add anything to the index.  All 
-   * FieldWritable objects for a url are aggregated from databases passed into
-   * the FieldIndexer and these fields are then passed into the Field filters.
-   * 
-   * It is therefore possible for fields to be added, removed, and changed 
-   * before being indexed.
-   * 
-   * @param url The url to index.  
-   * @param doc The lucene document
-   * @param fields The list of FieldWritable objects representing fields for 
-   * the index.
-   * @return The lucene Document or null to stop processing and not index any
-   * content for this url.
-   * 
-   * @throws IndexingException If an error occurs during indexing
-   */
-  public Document filter(String url, Document doc, List<FieldWritable> fields)
-    throws IndexingException;
-
-}
Index: src/java/org/apache/nutch/indexer/field/FieldFilters.java
===================================================================
--- src/java/org/apache/nutch/indexer/field/FieldFilters.java	(revision 959954)
+++ src/java/org/apache/nutch/indexer/field/FieldFilters.java	(working copy)
@@ -1,133 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.indexer.field;
-
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.List;
-
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.lucene.document.Document;
-import org.apache.nutch.indexer.IndexingException;
-import org.apache.nutch.plugin.Extension;
-import org.apache.nutch.plugin.ExtensionPoint;
-import org.apache.nutch.plugin.PluginRepository;
-import org.apache.nutch.plugin.PluginRuntimeException;
-import org.apache.nutch.util.ObjectCache;
-
-/**
- * The FieldFilters class provides a standard way to collect, order, and run
- * all FieldFilter implementations that are active in the plugin system.
- */
-public class FieldFilters {
-
-  public static final String FIELD_FILTER_ORDER = "field.filter.order";
-
-  public final static Log LOG = LogFactory.getLog(FieldFilters.class);
-
-  private FieldFilter[] fieldFilters;
-
-  /**
-   * Configurable constructor.
-   */
-  public FieldFilters(Configuration conf) {
-
-    // get the field filter order, the cache, and all field filters
-    String order = conf.get(FIELD_FILTER_ORDER);
-    ObjectCache objectCache = ObjectCache.get(conf);
-    this.fieldFilters = (FieldFilter[])objectCache.getObject(FieldFilter.class.getName());
-    
-    if (this.fieldFilters == null) {
-
-      String[] orderedFilters = null;
-      if (order != null && !order.trim().equals("")) {
-        orderedFilters = order.split("\\s+");
-      }
-      try {
-
-        // get the field filter extension point
-        ExtensionPoint point = PluginRepository.get(conf).getExtensionPoint(
-          FieldFilter.X_POINT_ID);
-        if (point == null) {
-          throw new RuntimeException(FieldFilter.X_POINT_ID + " not found.");
-        }
-        
-        // get all of the field filter plugins
-        Extension[] extensions = point.getExtensions();
-        HashMap<String, FieldFilter> filterMap = new HashMap<String, FieldFilter>();
-        for (int i = 0; i < extensions.length; i++) {
-          Extension extension = extensions[i];
-          FieldFilter filter = (FieldFilter)extension.getExtensionInstance();
-          LOG.info("Adding " + filter.getClass().getName());
-          if (!filterMap.containsKey(filter.getClass().getName())) {
-            filterMap.put(filter.getClass().getName(), filter);
-          }
-        }
-
-        // order the filters if necessary
-        if (orderedFilters == null) {
-          objectCache.setObject(FieldFilter.class.getName(),
-            filterMap.values().toArray(new FieldFilter[0]));
-        }
-        else {
-          ArrayList<FieldFilter> filters = new ArrayList<FieldFilter>();
-          for (int i = 0; i < orderedFilters.length; i++) {
-            FieldFilter filter = filterMap.get(orderedFilters[i]);
-            if (filter != null) {
-              filters.add(filter);
-            }
-          }
-          objectCache.setObject(FieldFilter.class.getName(),
-            filters.toArray(new FieldFilter[filters.size()]));
-        }
-      }
-      catch (PluginRuntimeException e) {
-        throw new RuntimeException(e);
-      }
-      
-      // set the filters in the cache
-      this.fieldFilters = (FieldFilter[])objectCache.getObject(FieldFilter.class.getName());
-    }
-  }
-
-  /**
-   * Runs all FieldFilter extensions.
-   * 
-   * @param url The url to index.
-   * @param doc The lucene index document.
-   * @param fields The lucene fields.
-   * 
-   * @return The document to filter or null to not index this document and url.
-   * 
-   * @throws IndexingException If an error occurs while running filters.
-   */
-  public Document filter(String url, Document doc, List<FieldWritable> fields)
-    throws IndexingException {
-
-    // loop through and run the field filters
-    for (int i = 0; i < this.fieldFilters.length; i++) {
-      doc = this.fieldFilters[i].filter(url, doc, fields);
-      if (doc == null) {
-        return null;
-      }
-    }
-    return doc;
-  }
-
-}
Index: src/java/org/apache/nutch/indexer/field/FieldWritable.java
===================================================================
--- src/java/org/apache/nutch/indexer/field/FieldWritable.java	(revision 959954)
+++ src/java/org/apache/nutch/indexer/field/FieldWritable.java	(working copy)
@@ -1,145 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.indexer.field;
-
-import java.io.DataInput;
-import java.io.DataOutput;
-import java.io.IOException;
-
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.io.Writable;
-
-/** 
- * A class that holds a single field of content to be placed into an index.
- * 
- * This class has options type of content as well as for how the field is to 
- * be indexed.
- */
-public class FieldWritable
-  implements Writable {
-
-  private String name;
-  private String value;
-  private FieldType type = FieldType.CONTENT;
-  private float boost;
-  private boolean indexed = true;
-  private boolean stored = false;
-  private boolean tokenized = true;
-
-  public FieldWritable() {
-
-  }
-
-  public FieldWritable(String name, String value, FieldType type, float boost) {
-    this(name, value, type, boost, true, false, true);
-  }
-
-  public FieldWritable(String name, String value, FieldType type,
-    boolean indexed, boolean stored, boolean tokenized) {
-    this(name, value, type, 0.0f, indexed, stored, tokenized);
-  }
-
-  public FieldWritable(String name, String value, FieldType type, float boost,
-    boolean indexed, boolean stored, boolean tokenized) {
-    this.name = name;
-    this.value = value;
-    this.type = type;
-    this.boost = boost;
-    this.indexed = indexed;
-    this.stored = stored;
-    this.tokenized = tokenized;
-  }
-
-  public String getName() {
-    return name;
-  }
-
-  public void setName(String name) {
-    this.name = name;
-  }
-
-  public String getValue() {
-    return value;
-  }
-
-  public void setValue(String value) {
-    this.value = value;
-  }
-
-  public FieldType getType() {
-    return type;
-  }
-
-  public void setType(FieldType type) {
-    this.type = type;
-  }
-
-  public float getBoost() {
-    return boost;
-  }
-
-  public void setBoost(float boost) {
-    this.boost = boost;
-  }
-
-  public boolean isIndexed() {
-    return indexed;
-  }
-
-  public void setIndexed(boolean indexed) {
-    this.indexed = indexed;
-  }
-
-  public boolean isStored() {
-    return stored;
-  }
-
-  public void setStored(boolean stored) {
-    this.stored = stored;
-  }
-
-  public boolean isTokenized() {
-    return tokenized;
-  }
-
-  public void setTokenized(boolean tokenized) {
-    this.tokenized = tokenized;
-  }
-
-  public void readFields(DataInput in)
-    throws IOException {
-    name = Text.readString(in);
-    value = Text.readString(in);
-    type = FieldType.valueOf(Text.readString(in));
-    boost = in.readFloat();
-    indexed = in.readBoolean();
-    stored = in.readBoolean();
-    tokenized = in.readBoolean();
-  }
-
-  public void write(DataOutput out)
-    throws IOException {
-    Text.writeString(out, name);
-    Text.writeString(out, value);
-    Text.writeString(out, type.toString());
-    out.writeFloat(boost);
-    out.writeBoolean(indexed);
-    out.writeBoolean(stored);
-    out.writeBoolean(tokenized);
-  }
-
-}
Index: src/java/org/apache/nutch/indexer/IndexSorter.java
===================================================================
--- src/java/org/apache/nutch/indexer/IndexSorter.java	(revision 959954)
+++ src/java/org/apache/nutch/indexer/IndexSorter.java	(working copy)
@@ -1,340 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.indexer;
-
-import java.io.File;
-import java.io.IOException;
-import java.util.Date;
-import java.util.Arrays;
-
-import org.apache.lucene.index.*;
-import org.apache.lucene.index.IndexWriter.MaxFieldLength;
-import org.apache.lucene.document.*;
-import org.apache.lucene.store.*;
-import org.apache.lucene.search.*;
-
-import org.apache.nutch.util.NutchConfiguration;
-
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.conf.Configured;
-import org.apache.hadoop.util.*;
-
-/** Sort a Nutch index by page score.  Higher scoring documents are assigned
- * smaller document numbers. */
-public class IndexSorter extends Configured implements Tool {
-  private static final Log LOG = LogFactory.getLog(IndexSorter.class);
-  
-  private static class PostingMap implements Comparable<PostingMap> {
-    private int newDoc;
-    private long offset;
-
-    public int compareTo(PostingMap pm) {              // order by newDoc id
-      return this.newDoc - pm.newDoc;
-    }
-  }
-
-  private static class SortedTermPositions implements TermPositions {
-    private TermPositions original;
-    private int[] oldToNew;
-
-    private int docFreq;
-
-    private PostingMap[] postingMaps = new PostingMap[0];
-    private int pointer;
-
-    private int freq;
-    private int position;
-
-    private static final String TEMP_FILE = "temp";
-    private final RAMDirectory tempDir = new RAMDirectory();
-    private RAMOutputStream out;
-    private IndexInput in;
-
-    public SortedTermPositions(TermPositions original, int[] oldToNew) {
-      this.original = original;
-      this.oldToNew = oldToNew;
-      try {
-        out = (RAMOutputStream)tempDir.createOutput(TEMP_FILE);
-      } catch (IOException ioe) {
-        LOG.warn("Error creating temporary output: " + StringUtils.stringifyException(ioe));
-      }
-    }
-
-    public void seek(Term term) throws IOException {
-      throw new UnsupportedOperationException();
-    }
-
-    public void seek(TermEnum terms) throws IOException {
-      original.seek(terms);
-
-      docFreq = terms.docFreq();
-      pointer = -1;
-
-      if (docFreq > postingMaps.length) {         // grow postingsMap
-        PostingMap[] newMap = new PostingMap[docFreq];
-        System.arraycopy(postingMaps, 0, newMap, 0, postingMaps.length);
-        for (int i = postingMaps.length; i < docFreq; i++) {
-          newMap[i] = new PostingMap();
-        }
-        postingMaps = newMap;
-      }
-
-      out.reset();
-
-      int i = 0;
-      while (original.next()) {
-        PostingMap map = postingMaps[i++];
-        map.newDoc = oldToNew[original.doc()];    // remap the newDoc id
-        map.offset = out.getFilePointer();        // save pointer to buffer
-
-        final int tf = original.freq();           // buffer tf & positions
-        out.writeVInt(tf);
-        int prevPosition = 0;
-        for (int j = tf; j > 0; j--) {            // delta encode positions
-          int p = original.nextPosition();
-          out.writeVInt(p - prevPosition);
-          prevPosition = p;
-        }
-      }
-      out.flush();
-      docFreq = i;                                // allow for deletions
-      
-      Arrays.sort(postingMaps, 0, docFreq);       // resort by mapped doc ids
-
-      // NOTE: this might be substantially faster if RAMInputStream were public
-      // and supported a reset() operation.
-      in = tempDir.openInput(TEMP_FILE);
-    }
-        
-    public boolean next() throws IOException {
-      pointer++;
-      if (pointer < docFreq) {
-        in.seek(postingMaps[pointer].offset);
-        freq = in.readVInt();
-        position = 0;
-        return true;
-      }
-      return false;
-    }
-      
-    public int doc() { return postingMaps[pointer].newDoc; }
-    public int freq() { return freq; }
-
-    public int nextPosition() throws IOException {
-      int positionIncrement = in.readVInt();
-      position += positionIncrement;
-      return position;
-    }
-
-    public int read(int[] docs, int[] freqs) {
-      throw new UnsupportedOperationException();
-    }
-    public boolean skipTo(int target) {
-      throw new UnsupportedOperationException();
-    }
-
-    public byte[] getPayload(byte[] data, int offset) throws IOException {
-      return null;
-    }
-
-    public int getPayloadLength() {
-      return 0;
-    }
-
-    public boolean isPayloadAvailable() {
-      return false;
-    }
-
-    public void close() throws IOException {
-      original.close();
-    }
-
-  }
-
-  private static class SortingReader extends FilterIndexReader {
-    
-    private int[] oldToNew;
-    private int[] newToOld;
-
-    public SortingReader(IndexReader oldReader, int[] oldToNew) {
-      super(oldReader);
-      this.oldToNew = oldToNew;
-      
-      this.newToOld = new int[oldReader.maxDoc()];
-      int oldDoc = 0;
-      while (oldDoc < oldToNew.length) {
-        int newDoc = oldToNew[oldDoc];
-        if (newDoc != -1) {
-          newToOld[newDoc] = oldDoc;
-        }
-        oldDoc++;
-      }
-    }
-
-    public Document document(int n) throws IOException {
-      return document(n, null);
-    }
-
-    public Document document(int n, FieldSelector fieldSelector)
-        throws CorruptIndexException, IOException {
-      return super.document(newToOld[n], fieldSelector);
-    }
-
-    public boolean isDeleted(int n) {
-      return false;
-    }
-
-    public byte[] norms(String f) throws IOException {
-      throw new UnsupportedOperationException();
-    }
-
-    public void norms(String f, byte[] norms, int offset) throws IOException {
-      byte[] oldNorms = super.norms(f);
-      int oldDoc = 0;
-      while (oldDoc < oldNorms.length) {
-        int newDoc = oldToNew[oldDoc];
-        if (newDoc != -1) {
-          norms[newDoc] = oldNorms[oldDoc];
-        }
-        oldDoc++;
-      }
-    }
-
-    protected void doSetNorm(int d, String f, byte b) throws IOException {
-      throw new UnsupportedOperationException();
-    }
-
-    public TermDocs termDocs() throws IOException {
-      throw new UnsupportedOperationException();
-    }
-    
-    public TermPositions termPositions() throws IOException {
-      return new SortedTermPositions(super.termPositions(), oldToNew);
-    }
-
-    protected void doDelete(int n) throws IOException { 
-      throw new UnsupportedOperationException();
-    }
-
-  }
-
-  private static class DocScore implements Comparable<DocScore> {
-    private int oldDoc;
-    private float score;
-
-    public int compareTo(DocScore that) {            // order by score, oldDoc
-      if (this.score == that.score) {
-        return this.oldDoc - that.oldDoc;
-      } else {
-        return this.score < that.score ? 1 : -1 ;
-      }
-    }
-    
-    public String toString() {
-      return "oldDoc=" + oldDoc + ",score=" + score;
-    }
-  }
-
-  public IndexSorter() {
-    
-  }
-  
-  public IndexSorter(Configuration conf) {
-    setConf(conf);
-  }
-  
-  public void sort(File directory) throws IOException {
-    LOG.info("IndexSorter: starting.");
-    Date start = new Date();
-    int termIndexInterval = getConf().getInt("indexer.termIndexInterval", 128);
-    IndexReader reader = IndexReader.open(
-    		FSDirectory.open(new File(directory, "index")));
-
-    SortingReader sorter = new SortingReader(reader, oldToNew(reader));
-    IndexWriter writer = new IndexWriter(
-    		FSDirectory.open(new File(directory, "index-sorted")),
-    			null, true, MaxFieldLength.UNLIMITED);
-    writer.setTermIndexInterval
-      (termIndexInterval);
-    writer.setUseCompoundFile(false);
-    writer.addIndexes(new IndexReader[] { sorter });
-    writer.close();
-    Date end = new Date();
-    LOG.info("IndexSorter: done, " + (end.getTime() - start.getTime())
-        + " total milliseconds");
-  }
-
-  private static int[] oldToNew(IndexReader reader) throws IOException {
-    int readerMax = reader.maxDoc();
-    DocScore[] newToOld = new DocScore[readerMax];
-
-    // use site, an indexed, un-tokenized field to get boost
-    byte[] boosts = reader.norms("site");          
-
-    for (int oldDoc = 0; oldDoc < readerMax; oldDoc++) {
-      float score;
-      if (reader.isDeleted(oldDoc)) {
-        score = 0.0f;
-      } else {
-        score = Similarity.decodeNorm(boosts[oldDoc]);
-      }
-      DocScore docScore = new DocScore();
-      docScore.oldDoc = oldDoc;
-      docScore.score = score;
-      newToOld[oldDoc] = docScore;
-    }
-    Arrays.sort(newToOld);
-
-    int[] oldToNew = new int[readerMax];
-    for (int newDoc = 0; newDoc < readerMax; newDoc++) {
-      DocScore docScore = newToOld[newDoc];
-      oldToNew[docScore.oldDoc] = docScore.score > 0.0f ? newDoc : -1;
-    }    
-    return oldToNew;
-  }
-
-  /** */
-  public static void main(String[] args) throws Exception {
-    int res = ToolRunner.run(NutchConfiguration.create(), new IndexSorter(), args);
-    System.exit(res);
-  }
-  
-  public int run(String[] args) throws Exception {
-    File directory;
-      
-    String usage = "IndexSorter directory";
-
-    if (args.length < 1) {
-      System.err.println("Usage: " + usage);
-      return -1;
-    }
-
-    directory = new File(args[0]);
-
-    try {
-      sort(directory);
-      return 0;
-    } catch (Exception e) {
-      LOG.fatal("IndexSorter: " + StringUtils.stringifyException(e));
-      return -1;
-    }
-  }
-
-}
Index: src/java/org/apache/nutch/indexer/HighFreqTerms.java
===================================================================
--- src/java/org/apache/nutch/indexer/HighFreqTerms.java	(revision 959954)
+++ src/java/org/apache/nutch/indexer/HighFreqTerms.java	(working copy)
@@ -1,103 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.indexer;
-
-import org.apache.lucene.store.FSDirectory;
-import org.apache.lucene.util.PriorityQueue;
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.Term;
-import org.apache.lucene.index.TermEnum;
-
-import java.io.File;
-import java.io.OutputStreamWriter;
-
-/** Lists the most frequent terms in an index. */
-public class HighFreqTerms {
-  public static int numTerms = 100;
-
-  private static class TermFreq {
-    TermFreq(Term t, int df) {
-      term = t;
-      docFreq = df;
-    }
-    int docFreq;
-    Term term;
-  }
-
-  private static class TermFreqQueue extends PriorityQueue<TermFreq> {
-    TermFreqQueue(int size) {
-      initialize(size);
-    }
-
-    protected final boolean lessThan(TermFreq termInfoA, TermFreq termInfoB) {
-      return termInfoA.docFreq < termInfoB.docFreq;
-    }
-  }
-
-  public static void main(String[] args) throws Exception {
-    IndexReader reader = null;
-    boolean noFreqs = false;
-    int count = 100;
-    String usage = "HighFreqTerms [-count <n>] [-nofreqs] <index dir>";
-
-    if (args.length == 0) {
-      System.err.println(usage);
-      System.exit(-1);
-    }
-
-    for (int i = 0; i < args.length; i++) {       // parse command line
-      if (args[i].equals("-count")) {		  // found -count option
-        count = Integer.parseInt(args[++i]);
-      } else if (args[i].equals("-nofreqs")) {    // found -nofreqs option
-        noFreqs = true;
-      } else {
-        reader = IndexReader.open(FSDirectory.open(new File(args[i])));
-      }
-    }
-
-    TermFreqQueue tiq = new TermFreqQueue(count);
-    TermEnum terms = reader.terms();
-      
-    int minFreq = 0;
-    while (terms.next()) {
-      if (terms.docFreq() > minFreq) {
-        TermFreq top = tiq.add(new TermFreq(terms.term(), terms.docFreq()));
-        if (tiq.size() >= count) {                 // if tiq overfull
-          tiq.pop();                              // remove lowest in tiq
-          minFreq = top.docFreq; // reset minFreq
-        }
-      }
-    }
-
-    OutputStreamWriter out = new OutputStreamWriter(System.out, "UTF-8");
-    while (tiq.size() != 0) {
-      TermFreq termInfo = (TermFreq)tiq.pop();
-      out.write(termInfo.term.toString());
-      if (!noFreqs) {
-        out.write(" ");
-        out.write(Integer.toString(termInfo.docFreq));
-      }
-      out.write("\n");
-    }
-
-    out.flush();
-    reader.close();
-  }
-
-}
-
Index: src/java/org/apache/nutch/indexer/IndexMerger.java
===================================================================
--- src/java/org/apache/nutch/indexer/IndexMerger.java	(revision 959954)
+++ src/java/org/apache/nutch/indexer/IndexMerger.java	(working copy)
@@ -1,164 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.indexer;
-
-import java.io.*;
-import java.util.*;
-
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-
-import org.apache.hadoop.fs.*;
-import org.apache.hadoop.mapred.FileAlreadyExistsException;
-import org.apache.hadoop.util.*;
-import org.apache.hadoop.conf.*;
-
-import org.apache.nutch.util.HadoopFSUtil;
-import org.apache.nutch.util.LogUtil;
-import org.apache.nutch.util.NutchConfiguration;
-
-import org.apache.lucene.store.Directory;
-import org.apache.lucene.store.FSDirectory;
-import org.apache.lucene.index.IndexWriter;
-import org.apache.lucene.index.LogMergePolicy;
-import org.apache.lucene.index.IndexWriter.MaxFieldLength;
-
-/*************************************************************************
- * IndexMerger creates an index for the output corresponding to a 
- * single fetcher run.
- * 
- * @author Doug Cutting
- * @author Mike Cafarella
- *************************************************************************/
-public class IndexMerger extends Configured implements Tool {
-  public static final Log LOG = LogFactory.getLog(IndexMerger.class);
-
-  public static final String DONE_NAME = "merge.done";
-
-  public IndexMerger() {
-    
-  }
-  
-  public IndexMerger(Configuration conf) {
-    setConf(conf);
-  }
-  
-  /**
-   * Merge all input indexes to the single output index
-   */
-  public void merge(Path[] indexes, Path outputIndex, Path localWorkingDir) throws IOException {
-    LOG.info("merging indexes to: " + outputIndex);
-
-    FileSystem localFs = FileSystem.getLocal(getConf());  
-    if (localFs.exists(localWorkingDir)) {
-      localFs.delete(localWorkingDir, true);
-    }
-    localFs.mkdirs(localWorkingDir);
-
-    // Get local output target
-    //
-    FileSystem fs = FileSystem.get(getConf());
-    if (fs.exists(outputIndex)) {
-      throw new FileAlreadyExistsException("Output directory " + outputIndex + " already exists!");
-    }
-
-    Path tmpLocalOutput = new Path(localWorkingDir, "merge-output");
-    Path localOutput = fs.startLocalOutput(outputIndex, tmpLocalOutput);
-
-    Directory[] dirs = new Directory[indexes.length];
-    for (int i = 0; i < indexes.length; i++) {
-      if (LOG.isInfoEnabled()) { LOG.info("Adding " + indexes[i]); }
-      dirs[i] = new FsDirectory(fs, indexes[i], false, getConf());
-    }
-
-    //
-    // Merge indices
-    //
-    IndexWriter writer = new IndexWriter(
-    		FSDirectory.open(new File(localOutput.toString())), null, true,
-    				MaxFieldLength.UNLIMITED);
-    writer.setMergeFactor(getConf().getInt("indexer.mergeFactor", LogMergePolicy.DEFAULT_MERGE_FACTOR));
-    writer.setMaxBufferedDocs(getConf().getInt("indexer.minMergeDocs", IndexWriter.DEFAULT_MAX_BUFFERED_DOCS));
-    writer.setMaxMergeDocs(getConf().getInt("indexer.maxMergeDocs", LogMergePolicy.DEFAULT_MAX_MERGE_DOCS));
-    writer.setTermIndexInterval(getConf().getInt("indexer.termIndexInterval", IndexWriter.DEFAULT_TERM_INDEX_INTERVAL));
-    writer.setInfoStream(LogUtil.getDebugStream(LOG));
-    writer.setUseCompoundFile(false);
-    writer.setSimilarity(new NutchSimilarity());
-    writer.addIndexesNoOptimize(dirs);
-    writer.optimize();
-    writer.close();
-
-    //
-    // Put target back
-    //
-    fs.completeLocalOutput(outputIndex, tmpLocalOutput);
-    LOG.info("done merging");
-  }
-
-  /** 
-   * Create an index for the input files in the named directory. 
-   */
-  public static void main(String[] args) throws Exception {
-    int res = ToolRunner.run(NutchConfiguration.create(), new IndexMerger(), args);
-    System.exit(res);
-  }
-  
-  public int run(String[] args) throws Exception {
-    String usage = "IndexMerger [-workingdir <workingdir>] outputIndex indexesDir...";
-    if (args.length < 2) {
-      System.err.println("Usage: " + usage);
-      return -1;
-    }
-
-    //
-    // Parse args, read all index directories to be processed
-    //
-    FileSystem fs = FileSystem.get(getConf());
-    List<Path> indexDirs = new ArrayList<Path>();
-
-    Path workDir = new Path("indexmerger-" + System.currentTimeMillis());  
-    int i = 0;
-    if ("-workingdir".equals(args[i])) {
-      i++;
-      workDir = new Path(args[i++], "indexmerger-" + System.currentTimeMillis());
-    }
-
-    Path outputIndex = new Path(args[i++]);
-
-    for (; i < args.length; i++) {
-      FileStatus[] fstats = fs.listStatus(new Path(args[i]), HadoopFSUtil.getPassDirectoriesFilter(fs));
-      indexDirs.addAll(Arrays.asList(HadoopFSUtil.getPaths(fstats)));
-    }
-
-    //
-    // Merge the indices
-    //
-
-    Path[] indexFiles = (Path[])indexDirs.toArray(new Path[indexDirs.size()]);
-
-    try {
-      merge(indexFiles, outputIndex, workDir);
-      return 0;
-    } catch (Exception e) {
-      LOG.fatal("IndexMerger: " + StringUtils.stringifyException(e));
-      return -1;
-    } finally {
-      FileSystem.getLocal(getConf()).delete(workDir, true);
-    }
-  }
-}
Index: src/java/org/apache/nutch/indexer/IndexingFilter.java
===================================================================
--- src/java/org/apache/nutch/indexer/IndexingFilter.java	(revision 959954)
+++ src/java/org/apache/nutch/indexer/IndexingFilter.java	(working copy)
@@ -52,13 +52,4 @@
    */
   NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
     throws IndexingException;
-
-  /** Adds index-level configuraition options.
-   * Implementations can update given configuration to pass document-independent
-   * information to indexing backends. As a rule of thumb, prefix meta keys
-   * with the name of the backend intended. For example, when
-   * passing information to lucene backend, prefix keys with "lucene.".
-   * @param conf Configuration instance.
-   * */
-  public void addIndexBackendOptions(Configuration conf);
 }
Index: src/java/org/apache/nutch/indexer/DeleteDuplicates.java
===================================================================
--- src/java/org/apache/nutch/indexer/DeleteDuplicates.java	(revision 959954)
+++ src/java/org/apache/nutch/indexer/DeleteDuplicates.java	(working copy)
@@ -1,523 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.indexer;
-
-import java.io.*;
-import java.text.SimpleDateFormat;
-import java.util.*;
-
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-
-import org.apache.hadoop.io.*;
-import org.apache.hadoop.fs.*;
-import org.apache.hadoop.conf.*;
-import org.apache.hadoop.mapred.*;
-import org.apache.hadoop.util.*;
-
-import org.apache.nutch.util.NutchConfiguration;
-import org.apache.nutch.util.NutchJob;
-
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.document.DateTools;
-import org.apache.lucene.document.Document;
-
-/**
- * Delete duplicate documents in a set of Lucene indexes.
- * Duplicates have either the same contents (via MD5 hash) or the same URL.
- * 
- * This tool uses the following algorithm:
- * 
- * <ul>
- * <li><b>Phase 1 - remove URL duplicates:</b><br/>
- * In this phase documents with the same URL
- * are compared, and only the most recent document is retained -
- * all other URL duplicates are scheduled for deletion.</li>
- * <li><b>Phase 2 - remove content duplicates:</b><br/>
- * In this phase documents with the same content hash are compared. If
- * property "dedup.keep.highest.score" is set to true (default) then only
- * the document with the highest score is retained. If this property is set
- * to false, only the document with the shortest URL is retained - all other
- * content duplicates are scheduled for deletion.</li>
- * <li><b>Phase 3 - delete documents:</b><br/>
- * In this phase documents scheduled for deletion are marked as deleted in
- * Lucene index(es).</li>
- * </ul>
- * 
- * @author Andrzej Bialecki
- */
-public class DeleteDuplicates extends Configured
-  implements Tool, Mapper<WritableComparable, Writable, Text, IntWritable>, Reducer<Text, IntWritable, WritableComparable, Writable>, OutputFormat<WritableComparable, Writable> {
-  private static final Log LOG = LogFactory.getLog(DeleteDuplicates.class);
-
-//   Algorithm:
-//      
-//   1. map indexes -> <url, <md5, url, time, urlLen, index,doc>>
-//      reduce, deleting all but most recent
-//
-//   2. map indexes -> <md5, <md5, url, time, urlLen, index,doc>>
-//      partition by md5
-//      reduce, deleting all but with highest score (or shortest url).
-
-  public static class IndexDoc implements WritableComparable {
-    private Text url = new Text();
-    private int urlLen;
-    private float score;
-    private long time;
-    private MD5Hash hash = new MD5Hash();
-    private Text index = new Text();              // the segment index
-    private int doc;                              // within the index
-    private boolean keep = true;                  // keep or discard
-
-    public String toString() {
-      return "[url=" + url + ",score=" + score + ",time=" + time
-        + ",hash=" + hash + ",index=" + index + ",doc=" + doc
-        + ",keep=" + keep + "]";
-    }
-    
-    public void write(DataOutput out) throws IOException {
-      url.write(out);
-      out.writeFloat(score);
-      out.writeLong(time);
-      hash.write(out);
-      index.write(out);
-      out.writeInt(doc);
-      out.writeBoolean(keep);
-    }
-
-    public void readFields(DataInput in) throws IOException {
-      url.readFields(in);
-      urlLen = url.getLength();
-      score = in.readFloat();
-      time = in.readLong();
-      hash.readFields(in);
-      index.readFields(in);
-      doc = in.readInt();
-      keep = in.readBoolean();
-    }
-
-    public int compareTo(Object o) {
-      IndexDoc that = (IndexDoc)o;
-      if (this.keep != that.keep) {
-        return this.keep ? 1 : -1; 
-      } else if (!this.hash.equals(that.hash)) {       // order first by hash
-        return this.hash.compareTo(that.hash);
-      } else if (this.time != that.time) {      // prefer more recent docs
-        return this.time > that.time ? 1 : -1 ;
-      } else if (this.urlLen != that.urlLen) {  // prefer shorter urls
-        return this.urlLen - that.urlLen;
-      } else {
-        return this.score > that.score ? 1 : -1;
-      }
-    }
-
-    public boolean equals(Object o) {
-      IndexDoc that = (IndexDoc)o;
-      return this.keep == that.keep
-        && this.hash.equals(that.hash)
-        && this.time == that.time
-        && this.score == that.score
-        && this.urlLen == that.urlLen
-        && this.index.equals(that.index) 
-        && this.doc == that.doc;
-    }
-
-  }
-
-  public static class InputFormat extends FileInputFormat<Text, IndexDoc> {
-    private static final long INDEX_LENGTH = Integer.MAX_VALUE;
-
-    /** Return each index as a split. */
-    public InputSplit[] getSplits(JobConf job, int numSplits)
-      throws IOException {
-      FileStatus[] files = listStatus(job);
-      InputSplit[] splits = new InputSplit[files.length];
-      for (int i = 0; i < files.length; i++) {
-        FileStatus cur = files[i];
-        splits[i] = new FileSplit(cur.getPath(), 0, INDEX_LENGTH, (String[])null);
-      }
-      return splits;
-    }
-
-    public class DDRecordReader implements RecordReader<Text, IndexDoc> {
-
-      private IndexReader indexReader;
-      private int maxDoc = 0;
-      private int doc = 0;
-      private Text index;
-      
-      public DDRecordReader(FileSplit split, JobConf job,
-          Text index) throws IOException {
-        try {
-          indexReader = IndexReader.open(new FsDirectory(FileSystem.get(job), split.getPath(), false, job));
-          maxDoc = indexReader.maxDoc();
-        } catch (IOException ioe) {
-          LOG.warn("Can't open index at " + split + ", skipping. (" + ioe.getMessage() + ")");
-          indexReader = null;
-        }
-        this.index = index;
-      }
-
-      public boolean next(Text key, IndexDoc indexDoc)
-        throws IOException {
-        
-        // skip empty indexes
-        if (indexReader == null || maxDoc <= 0)
-          return false;
-
-        // skip deleted documents
-        while (doc < maxDoc && indexReader.isDeleted(doc)) doc++;
-        if (doc >= maxDoc)
-          return false;
-
-        Document document = indexReader.document(doc);
-
-        // fill in key
-        key.set(document.get("url"));
-        // fill in value
-        indexDoc.keep = true;
-        indexDoc.url.set(document.get("url"));
-        indexDoc.hash.setDigest(document.get("digest"));
-        indexDoc.score = Float.parseFloat(document.get("boost"));
-        try {
-          indexDoc.time = DateTools.stringToTime(document.get("tstamp"));
-        } catch (Exception e) {
-          // try to figure out the time from segment name
-          try {
-            String segname = document.get("segment");
-            indexDoc.time = new SimpleDateFormat("yyyyMMddHHmmss").parse(segname).getTime();
-            // make it unique
-            indexDoc.time += doc;
-          } catch (Exception e1) {
-            // use current time
-            indexDoc.time = System.currentTimeMillis();
-          }
-        }
-        indexDoc.index = index;
-        indexDoc.doc = doc;
-
-        doc++;
-
-        return true;
-      }
-
-      public long getPos() throws IOException {
-        return maxDoc == 0 ? 0 : (doc*INDEX_LENGTH)/maxDoc;
-      }
-
-      public void close() throws IOException {
-        if (indexReader != null) indexReader.close();
-      }
-      
-      public Text createKey() {
-        return new Text();
-      }
-      
-      public IndexDoc createValue() {
-        return new IndexDoc();
-      }
-
-      public float getProgress() throws IOException {
-        return maxDoc == 0 ? 0.0f : (float)doc / (float)maxDoc;
-      }
-    }
-    
-    /** Return each index as a split. */
-    public RecordReader<Text, IndexDoc> getRecordReader(InputSplit split,
-                                        JobConf job,
-                                        Reporter reporter) throws IOException {
-      FileSplit fsplit = (FileSplit)split;
-      Text index = new Text(fsplit.getPath().toString());
-      reporter.setStatus(index.toString());
-      return new DDRecordReader(fsplit, job, index);
-    }
-  }
-  
-  public static class HashPartitioner implements Partitioner<MD5Hash, Writable> {
-    public void configure(JobConf job) {}
-    public void close() {}
-    public int getPartition(MD5Hash key, Writable value,
-                            int numReduceTasks) {
-      int hashCode = key.hashCode();
-      return (hashCode & Integer.MAX_VALUE) % numReduceTasks;
-    }
-  }
-
-  public static class UrlsReducer implements Reducer<Text, IndexDoc, MD5Hash, IndexDoc> {
-    
-    public void configure(JobConf job) {}
-    
-    public void close() {}
-    
-    private IndexDoc latest = new IndexDoc();
-    
-    public void reduce(Text key, Iterator<IndexDoc> values,
-        OutputCollector<MD5Hash, IndexDoc> output, Reporter reporter) throws IOException {
-      WritableUtils.cloneInto(latest, values.next());
-      while (values.hasNext()) {
-        IndexDoc value = values.next();
-        if (value.time > latest.time) {
-          // discard current and use more recent
-          latest.keep = false;
-          LOG.debug("-discard " + latest + ", keep " + value);
-          output.collect(latest.hash, latest);
-          WritableUtils.cloneInto(latest, value);
-        } else {
-          // discard
-          value.keep = false;
-          LOG.debug("-discard " + value + ", keep " + latest);
-          output.collect(value.hash, value);
-        }
-        
-      }
-      // keep the latest
-      latest.keep = true;
-      output.collect(latest.hash, latest);
-      
-    }
-  }
-  
-  public static class HashReducer implements Reducer<MD5Hash, IndexDoc, Text, IndexDoc> {
-    boolean byScore;
-    
-    public void configure(JobConf job) {
-      byScore = job.getBoolean("dedup.keep.highest.score", true);
-    }
-    
-    public void close() {}
-    
-    private IndexDoc highest = new IndexDoc();
-    
-    public void reduce(MD5Hash key, Iterator<IndexDoc> values,
-                       OutputCollector<Text, IndexDoc> output, Reporter reporter)
-      throws IOException {
-      boolean highestSet = false;
-      while (values.hasNext()) {
-        IndexDoc value = values.next();
-        // skip already deleted
-        if (!value.keep) {
-          LOG.debug("-discard " + value + " (already marked)");
-          output.collect(value.url, value);
-          continue;
-        }
-        if (!highestSet) {
-          WritableUtils.cloneInto(highest, value);
-          highestSet = true;
-          continue;
-        }
-        IndexDoc toDelete = null, toKeep = null;
-        boolean metric = byScore ? (value.score > highest.score) : 
-                                   (value.urlLen < highest.urlLen);
-        if (metric) {
-          toDelete = highest;
-          toKeep = value;
-        } else {
-          toDelete = value;
-          toKeep = highest;
-        }
-        
-        if (LOG.isDebugEnabled()) {
-          LOG.debug("-discard " + toDelete + ", keep " + toKeep);
-        }
-        
-        toDelete.keep = false;
-        output.collect(toDelete.url, toDelete);
-        WritableUtils.cloneInto(highest, toKeep);
-      }    
-      LOG.debug("-keep " + highest);
-      // no need to add this - in phase 2 we only process docs to delete them
-      // highest.keep = true;
-      // output.collect(key, highest);
-    }
-  }
-    
-  private FileSystem fs;
-
-  public void configure(JobConf job) {
-    setConf(job);
-  }
-  
-  public void setConf(Configuration conf) {
-    super.setConf(conf);
-    try {
-      if(conf != null) fs = FileSystem.get(conf);
-    } catch (IOException e) {
-      throw new RuntimeException(e);
-    }
-  }
-
-  public void close() {}
-
-  /** Map [*,IndexDoc] pairs to [index,doc] pairs. */
-  public void map(WritableComparable key, Writable value,
-                  OutputCollector<Text, IntWritable> output, Reporter reporter)
-    throws IOException {
-    IndexDoc indexDoc = (IndexDoc)value;
-    // don't delete these
-    if (indexDoc.keep) return;
-    // delete all others
-    output.collect(indexDoc.index, new IntWritable(indexDoc.doc));
-  }
-
-  /** Delete docs named in values from index named in key. */
-  public void reduce(Text key, Iterator<IntWritable> values,
-                     OutputCollector<WritableComparable, Writable> output, Reporter reporter)
-    throws IOException {
-    Path index = new Path(key.toString());
-    IndexReader reader = IndexReader.open(new FsDirectory(fs, index, false, getConf()), false);
-    try {
-      while (values.hasNext()) {
-        IntWritable value = values.next();
-        LOG.debug("-delete " + index + " doc=" + value);
-        reader.deleteDocument(value.get());
-      }
-    } finally {
-      reader.close();
-    }
-  }
-
-  /** Write nothing. */
-  public RecordWriter<WritableComparable, Writable> getRecordWriter(final FileSystem fs,
-                                      final JobConf job,
-                                      final String name,
-                                      final Progressable progress) throws IOException {
-    return new RecordWriter<WritableComparable, Writable>() {                   
-        public void write(WritableComparable key, Writable value)
-          throws IOException {
-          throw new UnsupportedOperationException();
-        }        
-        public void close(Reporter reporter) throws IOException {}
-      };
-  }
-
-  public DeleteDuplicates() {
-    
-  }
-  
-  public DeleteDuplicates(Configuration conf) {
-    setConf(conf);
-  }
-  
-  public void checkOutputSpecs(FileSystem fs, JobConf job) {}
-
-  public void dedup(Path[] indexDirs)
-    throws IOException {
-
-    if (LOG.isInfoEnabled()) { LOG.info("Dedup: starting"); }
-
-    Path outDir1 =
-      new Path("dedup-urls-"+
-               Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
-
-    JobConf job = new NutchJob(getConf());
-
-    for (int i = 0; i < indexDirs.length; i++) {
-      if (LOG.isInfoEnabled()) {
-        LOG.info("Dedup: adding indexes in: " + indexDirs[i]);
-      }
-      FileInputFormat.addInputPath(job, indexDirs[i]);
-    }
-    job.setJobName("dedup 1: urls by time");
-
-    job.setInputFormat(InputFormat.class);
-    job.setMapOutputKeyClass(Text.class);
-    job.setMapOutputValueClass(IndexDoc.class);
-
-    job.setReducerClass(UrlsReducer.class);
-    FileOutputFormat.setOutputPath(job, outDir1);
-
-    job.setOutputKeyClass(MD5Hash.class);
-    job.setOutputValueClass(IndexDoc.class);
-    job.setOutputFormat(SequenceFileOutputFormat.class);
-
-    JobClient.runJob(job);
-
-    Path outDir2 =
-      new Path("dedup-hash-"+
-               Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
-    job = new NutchJob(getConf());
-    job.setJobName("dedup 2: content by hash");
-
-    FileInputFormat.addInputPath(job, outDir1);
-    job.setInputFormat(SequenceFileInputFormat.class);
-    job.setMapOutputKeyClass(MD5Hash.class);
-    job.setMapOutputValueClass(IndexDoc.class);
-    job.setPartitionerClass(HashPartitioner.class);
-    job.setSpeculativeExecution(false);
-    
-    job.setReducerClass(HashReducer.class);
-    FileOutputFormat.setOutputPath(job, outDir2);
-
-    job.setOutputKeyClass(Text.class);
-    job.setOutputValueClass(IndexDoc.class);
-    job.setOutputFormat(SequenceFileOutputFormat.class);
-
-    JobClient.runJob(job);
-
-    // remove outDir1 - no longer needed
-    fs.delete(outDir1, true);
-    
-    job = new NutchJob(getConf());
-    job.setJobName("dedup 3: delete from index(es)");
-
-    FileInputFormat.addInputPath(job, outDir2);
-    job.setInputFormat(SequenceFileInputFormat.class);
-    //job.setInputKeyClass(Text.class);
-    //job.setInputValueClass(IndexDoc.class);
-
-    job.setInt("io.file.buffer.size", 4096);
-    job.setMapperClass(DeleteDuplicates.class);
-    job.setReducerClass(DeleteDuplicates.class);
-
-    job.setOutputFormat(DeleteDuplicates.class);
-    job.setOutputKeyClass(Text.class);
-    job.setOutputValueClass(IntWritable.class);
-
-    JobClient.runJob(job);
-
-    fs.delete(outDir2, true);
-
-    if (LOG.isInfoEnabled()) { LOG.info("Dedup: done"); }
-  }
-
-  public static void main(String[] args) throws Exception {
-    int res = ToolRunner.run(NutchConfiguration.create(), new DeleteDuplicates(), args);
-    System.exit(res);
-  }
-  
-  public int run(String[] args) throws Exception {
-    
-    if (args.length < 1) {
-      System.err.println("Usage: DeleteDuplicates <indexes> ...");
-      return -1;
-    }
-    
-    Path[] indexes = new Path[args.length];
-    for (int i = 0; i < args.length; i++) {
-      indexes[i] = new Path(args[i]);
-    }
-    try {
-      dedup(indexes);
-      return 0;
-    } catch (Exception e) {
-      LOG.fatal("DeleteDuplicates: " + StringUtils.stringifyException(e));
-      return -1;
-    }
-  }
-
-}
Index: src/java/org/apache/nutch/indexer/NutchDocument.java
===================================================================
--- src/java/org/apache/nutch/indexer/NutchDocument.java	(revision 959954)
+++ src/java/org/apache/nutch/indexer/NutchDocument.java	(working copy)
@@ -19,11 +19,9 @@
 import java.io.DataInput;
 import java.io.DataOutput;
 import java.io.IOException;
-import java.util.ArrayList;
 import java.util.Collection;
 import java.util.HashMap;
 import java.util.Iterator;
-import java.util.List;
 import java.util.Map;
 import java.util.Map.Entry;
 
@@ -35,51 +33,48 @@
 
 /** A {@link NutchDocument} is the unit of indexing.*/
 public class NutchDocument
-implements Writable, Iterable<Entry<String, List<String>>> {
+implements Writable, Iterable<Entry<String, NutchField>> {
 
-  public static final byte VERSION = 1;
+  public static final byte VERSION = 2;
+  
+  private Map<String, NutchField> fields;
 
-  private Map<String, List<String>> fields;
-
   private Metadata documentMeta;
 
-  private float score;
+  private float weight;
 
   public NutchDocument() {
-    fields = new HashMap<String, List<String>>();
+    fields = new HashMap<String, NutchField>();
     documentMeta = new Metadata();
-    score = 0.0f;
+    weight = 1.0f;
   }
 
-  public void add(String name, String value) {
-    List<String> fieldValues = fields.get(name);
-    if (fieldValues == null) {
-      fieldValues = new ArrayList<String>();
+  public void add(String name, Object value) {
+    NutchField field = fields.get(name);
+    if (field == null) {
+      field = new NutchField(value);
+      fields.put(name, field);
+    } else {
+      field.add(value);
     }
-    fieldValues.add(value);
-    fields.put(name, fieldValues);
   }
 
-  private void addFieldUnprotected(String name, String value) {
-    fields.get(name).add(value);
-  }
-
-  public String getFieldValue(String name) {
-    List<String> fieldValues = fields.get(name);
-    if (fieldValues == null) {
+  public Object getFieldValue(String name) {
+    NutchField field = fields.get(name);
+    if (field == null) {
       return null;
     }
-    if (fieldValues.size() == 0) {
+    if (field.getValues().size() == 0) {
       return null;
     }
-    return fieldValues.get(0);
+    return field.getValues().get(0);
   }
 
-  public List<String> getFieldValues(String name) {
+  public NutchField getField(String name) {
     return fields.get(name);
   }
 
-  public List<String> removeField(String name) {
+  public NutchField removeField(String name) {
     return fields.remove(name);
   }
 
@@ -88,16 +83,16 @@
   }
 
   /** Iterate over all fields. */
-  public Iterator<Entry<String, List<String>>> iterator() {
+  public Iterator<Entry<String, NutchField>> iterator() {
     return fields.entrySet().iterator();
   }
 
-  public float getScore() {
-    return score;
+  public float getWeight() {
+    return weight;
   }
 
-  public void setScore(float score) {
-    this.score = score;
+  public void setWeight(float weight) {
+    this.weight = weight;
   }
 
   public Metadata getDocumentMeta() {
@@ -105,6 +100,7 @@
   }
 
   public void readFields(DataInput in) throws IOException {
+    fields.clear();
     byte version = in.readByte();
     if (version != VERSION) {
       throw new VersionMismatchException(VERSION, version);
@@ -112,30 +108,23 @@
     int size = WritableUtils.readVInt(in);
     for (int i = 0; i < size; i++) {
       String name = Text.readString(in);
-      int numValues = WritableUtils.readVInt(in);
-      fields.put(name, new ArrayList<String>());
-      for (int j = 0; j < numValues; j++) {
-        String value = Text.readString(in);
-        addFieldUnprotected(name, value);
-      }
+      NutchField field = new NutchField();
+      field.readFields(in);
+      fields.put(name, field);
     }
-    score = in.readFloat();
+    weight = in.readFloat();
     documentMeta.readFields(in);
   }
 
   public void write(DataOutput out) throws IOException {
     out.writeByte(VERSION);
     WritableUtils.writeVInt(out, fields.size());
-    for (Map.Entry<String, List<String>> entry : fields.entrySet()) {
+    for (Map.Entry<String, NutchField> entry : fields.entrySet()) {
       Text.writeString(out, entry.getKey());
-      List<String> values = entry.getValue();
-      WritableUtils.writeVInt(out, values.size());
-      for (String value : values) {
-        Text.writeString(out, value);
-      }
+      NutchField field = entry.getValue();
+      field.write(out);
     }
-    out.writeFloat(score);
+    out.writeFloat(weight);
     documentMeta.write(out);
   }
-
 }
Index: src/java/org/apache/nutch/indexer/IndexerMapReduce.java
===================================================================
--- src/java/org/apache/nutch/indexer/IndexerMapReduce.java	(revision 959954)
+++ src/java/org/apache/nutch/indexer/IndexerMapReduce.java	(working copy)
@@ -152,7 +152,7 @@
       return;
     }
     // apply boost to all indexed fields.
-    doc.setScore(boost);
+    doc.setWeight(boost);
     // store boost for use by explain and dedup
     doc.add("boost", Float.toString(boost));
 
Index: src/java/org/apache/nutch/indexer/NutchField.java
===================================================================
--- src/java/org/apache/nutch/indexer/NutchField.java	(revision 0)
+++ src/java/org/apache/nutch/indexer/NutchField.java	(revision 0)
@@ -0,0 +1,64 @@
+package org.apache.nutch.indexer;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.List;
+
+import org.apache.hadoop.io.Writable;
+
+/**
+ * This class represents a multi-valued field with a weight. Values are arbitrary
+ * objects.
+ */
+public class NutchField implements Writable {
+  private float weight;
+  private List<Object> values = new ArrayList<Object>();
+  
+  public NutchField() {
+    
+  }
+  
+  public NutchField(Object value) {
+    this(value, 1.0f);
+  }
+  
+  public NutchField(Object value, float weight) {
+    this.weight = weight;
+    if (value instanceof Collection) {
+      values.addAll((Collection<Object>)value);
+    } else {
+      values.add(value);
+    }
+  }
+  
+  public void add(Object value) {
+    values.add(value);
+  }
+  
+  public float getWeight() {
+    return weight;
+  }
+
+  public void setWeight(float weight) {
+    this.weight = weight;
+  }
+
+  public List<Object> getValues() {
+    return values;
+  }
+  
+  public void reset() {
+    weight = 1.0f;
+    values.clear();
+  }
+
+  public void readFields(DataInput in) throws IOException {
+  }
+
+  public void write(DataOutput out) throws IOException {
+  }
+
+}

Property changes on: src/java/org/apache/nutch/indexer/NutchField.java
___________________________________________________________________
Added: svn:eol-style
   + native

Index: src/java/org/apache/nutch/indexer/NutchSimilarity.java
===================================================================
--- src/java/org/apache/nutch/indexer/NutchSimilarity.java	(revision 959954)
+++ src/java/org/apache/nutch/indexer/NutchSimilarity.java	(working copy)
@@ -1,47 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.indexer;
-
-import org.apache.lucene.search.DefaultSimilarity;
-
-/** Similarity implementatation used by Nutch indexing and search. */
-@SuppressWarnings("serial")
-public class NutchSimilarity extends DefaultSimilarity  {
-  private static final int MIN_CONTENT_LENGTH = 1000;
-
-  /** Normalize field by length.  Called at index time. */
-  public float lengthNorm(String fieldName, int numTokens) {
-    if ("url".equals(fieldName)) {                // URL: prefer short
-      return 1.0f / numTokens;                    // use linear normalization
-      
-    } else if ("anchor".equals(fieldName)) {      // Anchor: prefer more
-      return (float)(1.0/Math.log(Math.E+numTokens)); // use log
-
-    } else if ("content".equals(fieldName)) {     // Content: penalize short
-      return super.lengthNorm(fieldName,          // treat short as longer
-                              Math.max(numTokens, MIN_CONTENT_LENGTH));
-
-    } else {                                      // use default
-      return super.lengthNorm(fieldName, numTokens);
-    }
-  }
-
-  public float coord(int overlap, int maxOverlap) {
-    return 1.0f;
-  }
-
-}
Index: src/java/org/apache/nutch/indexer/lucene/LuceneConstants.java
===================================================================
--- src/java/org/apache/nutch/indexer/lucene/LuceneConstants.java	(revision 959954)
+++ src/java/org/apache/nutch/indexer/lucene/LuceneConstants.java	(working copy)
@@ -1,57 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.indexer.lucene;
-
-public interface LuceneConstants {
-  public static final String LUCENE_PREFIX = "lucene.";
-
-  public static final String FIELD_PREFIX = LUCENE_PREFIX + "field.";
-
-  public static final String FIELD_STORE_PREFIX = FIELD_PREFIX + "store.";
-
-  public static final String FIELD_INDEX_PREFIX = FIELD_PREFIX + "index.";
-
-  public static final String FIELD_VECTOR_PREFIX = FIELD_PREFIX + "vector.";
-
-  public static final String STORE_YES = "store.yes";
-
-  public static final String STORE_NO = "store.no";
-
-  public static final String STORE_COMPRESS = "store.compress";
-
-  public static final String INDEX_NO = "index.no";
-
-  // TODO: -> ANALYZED_NO_NORMS
-  public static final String INDEX_NO_NORMS = "index.no_norms";
-
-  // TODO: -> ANALYZED
-  public static final String INDEX_TOKENIZED = "index.tokenized";
-
-  // TODO: -> NOT_ANALYZED
-  public static final String INDEX_UNTOKENIZED = "index.untokenized";
-
-  public static final String VECTOR_NO = "vector.no";
-
-  public static final String VECTOR_POS = "vector.pos";
-
-  public static final String VECTOR_OFFSET = "vector.offset";
-
-  public static final String VECTOR_POS_OFFSET = "vector.pos_offset";
-
-  public static final String VECTOR_YES = "vector.yes";
-
-}
Index: src/java/org/apache/nutch/indexer/lucene/LuceneWriter.java
===================================================================
--- src/java/org/apache/nutch/indexer/lucene/LuceneWriter.java	(revision 959954)
+++ src/java/org/apache/nutch/indexer/lucene/LuceneWriter.java	(working copy)
@@ -1,312 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.indexer.lucene;
-
-import java.io.File;
-import java.io.IOException;
-import java.util.HashMap;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Map;
-import java.util.Random;
-import java.util.Map.Entry;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.mapred.FileOutputFormat;
-import org.apache.hadoop.mapred.JobConf;
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
-import org.apache.lucene.index.IndexWriter;
-import org.apache.lucene.index.IndexWriter.MaxFieldLength;
-import org.apache.lucene.store.FSDirectory;
-import org.apache.nutch.analysis.AnalyzerFactory;
-import org.apache.nutch.analysis.NutchAnalyzer;
-import org.apache.nutch.analysis.NutchDocumentAnalyzer;
-import org.apache.nutch.indexer.Indexer;
-import org.apache.nutch.indexer.NutchDocument;
-import org.apache.nutch.indexer.NutchIndexWriter;
-import org.apache.nutch.indexer.NutchSimilarity;
-import org.apache.nutch.metadata.Metadata;
-import org.apache.nutch.util.LogUtil;
-
-public class LuceneWriter implements NutchIndexWriter {
-
-  public static enum STORE { YES, NO, COMPRESS }
-
-  public static enum INDEX { NO, NO_NORMS, TOKENIZED, UNTOKENIZED }
-
-  public static enum VECTOR { NO, OFFSET, POS, POS_OFFSET, YES }
-
-  private IndexWriter writer;
-
-  private AnalyzerFactory analyzerFactory;
-
-  private Path perm;
-
-  private Path temp;
-
-  private FileSystem fs;
-
-  private final Map<String, Field.Store> fieldStore;
-
-  private final Map<String, Field.Index> fieldIndex;
-
-  private final Map<String, Field.TermVector> fieldVector;
-
-  public LuceneWriter() {
-    fieldStore = new HashMap<String, Field.Store>();
-    fieldIndex = new HashMap<String, Field.Index>();
-    fieldVector = new HashMap<String, Field.TermVector>();
-  }
-
-  private Document createLuceneDoc(NutchDocument doc) {
-    final Document out = new Document();
-
-    out.setBoost(doc.getScore());
-
-    final Metadata documentMeta = doc.getDocumentMeta();
-    for (final Entry<String, List<String>> entry : doc) {
-      final String fieldName = entry.getKey();
-
-      Field.Store store = fieldStore.get(fieldName);
-      Field.Index index = fieldIndex.get(fieldName);
-      Field.TermVector vector = fieldVector.get(fieldName);
-
-      // default values
-      if (store == null) {
-        store = Field.Store.NO;
-      }
-
-      if (index == null) {
-        index = Field.Index.NO;
-      }
-
-      if (vector == null) {
-        vector = Field.TermVector.NO;
-      }
-
-      // read document-level field information
-      final String[] fieldMetas =
-        documentMeta.getValues(LuceneConstants.FIELD_PREFIX + fieldName);
-      if (fieldMetas.length != 0) {
-        for (final String val : fieldMetas) {
-          if (LuceneConstants.STORE_YES.equals(val)) {
-            store = Field.Store.YES;
-          } else if (LuceneConstants.STORE_NO.equals(val)) {
-            store = Field.Store.NO;
-          } else if (LuceneConstants.INDEX_TOKENIZED.equals(val)) {
-            index = Field.Index.ANALYZED;
-          } else if (LuceneConstants.INDEX_NO.equals(val)) {
-            index = Field.Index.NO;
-          } else if (LuceneConstants.INDEX_UNTOKENIZED.equals(val)) {
-            index = Field.Index.NOT_ANALYZED;
-          } else if (LuceneConstants.INDEX_NO_NORMS.equals(val)) {
-            index = Field.Index.ANALYZED_NO_NORMS;
-          } else if (LuceneConstants.VECTOR_NO.equals(val)) {
-            vector = Field.TermVector.NO;
-          } else if (LuceneConstants.VECTOR_YES.equals(val)) {
-            vector = Field.TermVector.YES;
-          } else if (LuceneConstants.VECTOR_POS.equals(val)) {
-            vector = Field.TermVector.WITH_POSITIONS;
-          } else if (LuceneConstants.VECTOR_POS_OFFSET.equals(val)) {
-            vector = Field.TermVector.WITH_POSITIONS_OFFSETS;
-          } else if (LuceneConstants.VECTOR_OFFSET.equals(val)) {
-            vector = Field.TermVector.WITH_OFFSETS;
-          }
-        }
-      }
-
-      for (final String fieldValue : entry.getValue()) {
-        out.add(new Field(fieldName, fieldValue, store, index, vector));
-      }
-    }
-
-    return out;
-  }
-
-  @SuppressWarnings("unchecked")
-  private void processOptions(Configuration conf) {
-    final Iterator iterator = conf.iterator();
-    while (iterator.hasNext()) {
-      final String key = (String) ((Map.Entry)iterator.next()).getKey();
-      if (!key.startsWith(LuceneConstants.LUCENE_PREFIX)) {
-        continue;
-      }
-      if (key.startsWith(LuceneConstants.FIELD_STORE_PREFIX)) {
-        final String field =
-          key.substring(LuceneConstants.FIELD_STORE_PREFIX.length());
-        final LuceneWriter.STORE store = LuceneWriter.STORE.valueOf(conf.get(key));
-        switch (store) {
-        case YES:
-        case COMPRESS:
-          fieldStore.put(field, Field.Store.YES);
-          break;
-        case NO:
-          fieldStore.put(field, Field.Store.NO);
-          break;
-        }
-      } else if (key.startsWith(LuceneConstants.FIELD_INDEX_PREFIX)) {
-        final String field =
-          key.substring(LuceneConstants.FIELD_INDEX_PREFIX.length());
-        final LuceneWriter.INDEX index = LuceneWriter.INDEX.valueOf(conf.get(key));
-        switch (index) {
-        case NO:
-          fieldIndex.put(field, Field.Index.NO);
-          break;
-        case NO_NORMS:
-          fieldIndex.put(field, Field.Index.NOT_ANALYZED_NO_NORMS);
-          break;
-        case TOKENIZED:
-          fieldIndex.put(field, Field.Index.ANALYZED);
-          break;
-        case UNTOKENIZED:
-          fieldIndex.put(field, Field.Index.NOT_ANALYZED);
-          break;
-        }
-      } else if (key.startsWith(LuceneConstants.FIELD_VECTOR_PREFIX)) {
-        final String field =
-          key.substring(LuceneConstants.FIELD_VECTOR_PREFIX.length());
-        final LuceneWriter.VECTOR vector = LuceneWriter.VECTOR.valueOf(conf.get(key));
-        switch (vector) {
-        case NO:
-          fieldVector.put(field, Field.TermVector.NO);
-          break;
-        case OFFSET:
-          fieldVector.put(field, Field.TermVector.WITH_OFFSETS);
-          break;
-        case POS:
-          fieldVector.put(field, Field.TermVector.WITH_POSITIONS);
-          break;
-        case POS_OFFSET:
-          fieldVector.put(field, Field.TermVector.WITH_POSITIONS_OFFSETS);
-          break;
-        case YES:
-          fieldVector.put(field, Field.TermVector.YES);
-          break;
-        }
-      }
-    }
-  }
-
-  public void open(JobConf job, String name)
-  throws IOException {
-    this.fs = FileSystem.get(job);
-    perm = new Path(FileOutputFormat.getOutputPath(job), name);
-    temp = job.getLocalPath("index/_"  +
-                      Integer.toString(new Random().nextInt()));
-
-    fs.delete(perm, true); // delete old, if any
-    analyzerFactory = new AnalyzerFactory(job);
-    writer = new IndexWriter(
-        FSDirectory.open(new File(fs.startLocalOutput(perm, temp).toString())),
-        new NutchDocumentAnalyzer(job), true, MaxFieldLength.UNLIMITED);
-
-    writer.setMergeFactor(job.getInt("indexer.mergeFactor", 10));
-    writer.setMaxBufferedDocs(job.getInt("indexer.minMergeDocs", 100));
-    writer.setMaxMergeDocs(job
-        .getInt("indexer.maxMergeDocs", Integer.MAX_VALUE));
-    writer.setTermIndexInterval(job.getInt("indexer.termIndexInterval", 128));
-    writer.setMaxFieldLength(job.getInt("indexer.max.tokens", 10000));
-    writer.setInfoStream(LogUtil.getDebugStream(Indexer.LOG));
-    writer.setUseCompoundFile(false);
-    writer.setSimilarity(new NutchSimilarity());
-
-    processOptions(job);
-  }
-
-  public void close() throws IOException {
-    writer.optimize();
-    writer.close();
-    fs.completeLocalOutput(perm, temp); // copy to dfs
-    fs.createNewFile(new Path(perm, Indexer.DONE_NAME));
-  }
-
-  public void write(NutchDocument doc) throws IOException {
-    final Document luceneDoc = createLuceneDoc(doc);
-    final NutchAnalyzer analyzer = analyzerFactory.get(luceneDoc.get("lang"));
-    if (Indexer.LOG.isDebugEnabled()) {
-      Indexer.LOG.debug("Indexing [" + luceneDoc.get("url")
-          + "] with analyzer " + analyzer + " (" + luceneDoc.get("lang")
-          + ")");
-    }
-    writer.addDocument(luceneDoc, analyzer);
-
-  }
-
-  /** Adds a lucene field.
-   * <p>
-   * This method is provided for backward-compatibility with
-   * older indexing filters. This should not be used by newer
-   * implementations since this is slower than
-   * {@link NutchDocument#add(String, String)} and will be removed
-   * in a future release.
-   * </p>
-   * @param f Lucene field to be added.
-   * @deprecated Use {@link NutchDocument#add(String, String)} instead and
-   * set index-level metadata for field information.
-   * */
-  @Deprecated
-  public static void add(NutchDocument doc, Field f) {
-    final String fieldName = f.name();
-    final String key = LuceneConstants.FIELD_PREFIX + fieldName;
-    final Metadata documentMeta = doc.getDocumentMeta();
-    if (f.isStored()) {
-      documentMeta.add(key, LuceneConstants.STORE_YES);
-    } else {
-      documentMeta.add(key, LuceneConstants.STORE_NO);
-    }
-
-    if (f.isIndexed()) {
-      if (f.isTokenized()) {
-        documentMeta.add(key, LuceneConstants.INDEX_TOKENIZED);
-      } else if (f.getOmitNorms()) {
-        documentMeta.add(key, LuceneConstants.INDEX_NO_NORMS);
-      } else {
-        documentMeta.add(key, LuceneConstants.INDEX_UNTOKENIZED);
-      }
-    } else {
-      documentMeta.add(key, LuceneConstants.INDEX_NO);
-    }
-
-    if (f.isStoreOffsetWithTermVector() && f.isStorePositionWithTermVector()) {
-      documentMeta.add(key, LuceneConstants.VECTOR_POS_OFFSET);
-    } else if (f.isStoreOffsetWithTermVector()) {
-      documentMeta.add(key, LuceneConstants.VECTOR_OFFSET);
-    } else if (f.isStorePositionWithTermVector()) {
-      documentMeta.add(key, LuceneConstants.VECTOR_POS);
-    } else if (f.isTermVectorStored()) {
-      documentMeta.add(key, LuceneConstants.VECTOR_YES);
-    } else {
-      documentMeta.add(key, LuceneConstants.VECTOR_NO);
-    }
-  }
-
-  public static void addFieldOptions(String field, LuceneWriter.STORE store,
-      LuceneWriter.INDEX index, LuceneWriter.VECTOR vector, Configuration conf) {
-
-    conf.set(LuceneConstants.FIELD_STORE_PREFIX + field, store.toString());
-    conf.set(LuceneConstants.FIELD_INDEX_PREFIX + field, index.toString());
-    conf.set(LuceneConstants.FIELD_VECTOR_PREFIX + field, vector.toString());
-  }
-
-  public static void addFieldOptions(String field, LuceneWriter.STORE store,
-      LuceneWriter.INDEX index, Configuration conf) {
-    LuceneWriter.addFieldOptions(field, store, index, LuceneWriter.VECTOR.NO, conf);
-  }
-}
Index: src/java/org/apache/nutch/indexer/FsDirectory.java
===================================================================
--- src/java/org/apache/nutch/indexer/FsDirectory.java	(revision 959954)
+++ src/java/org/apache/nutch/indexer/FsDirectory.java	(working copy)
@@ -1,255 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.indexer;
-
-import java.io.*;
-import java.util.Random;
-
-import org.apache.lucene.store.*;
-import org.apache.nutch.util.HadoopFSUtil;
-import org.apache.hadoop.fs.*;
-import org.apache.hadoop.conf.Configuration;
-
-/** Reads a Lucene index stored in DFS. */
-public class FsDirectory extends Directory {
-
-  private FileSystem fs;
-  private Path directory;
-  private int ioFileBufferSize;
-
-  public FsDirectory(FileSystem fs, Path directory, boolean create, Configuration conf)
-    throws IOException {
-
-    this.fs = fs;
-    this.directory = directory;
-    this.ioFileBufferSize = conf.getInt("io.file.buffer.size", 4096);
-    
-    if (create) {
-      create();
-    }
-
-    if (!fs.getFileStatus(directory).isDir())
-      throw new IOException(directory + " not a directory");
-  }
-
-  private void create() throws IOException {
-    if (!fs.exists(directory)) {
-      fs.mkdirs(directory);
-    }
-
-    if (!fs.getFileStatus(directory).isDir())
-      throw new IOException(directory + " not a directory");
-
-    // clear old files
-    FileStatus[] fstats = fs.listStatus(directory, HadoopFSUtil.getPassAllFilter());
-    Path[] files = HadoopFSUtil.getPaths(fstats);
-    for (int i = 0; i < files.length; i++) {
-      if (!fs.delete(files[i], false))
-        throw new IOException("Cannot delete " + files[i]);
-    }
-  }
-
-  public String[] listAll() throws IOException {
-    FileStatus[] fstats = fs.listStatus(directory, HadoopFSUtil.getPassAllFilter());
-    Path[] files = HadoopFSUtil.getPaths(fstats);
-    if (files == null) return null;
-
-    String[] result = new String[files.length];
-    for (int i = 0; i < files.length; i++) {
-      result[i] = files[i].getName();
-    }
-    return result;
-  }
-
-  public boolean fileExists(String name) throws IOException {
-    return fs.exists(new Path(directory, name));
-  }
-
-  public long fileModified(String name) {
-    throw new UnsupportedOperationException();
-  }
-
-  public void touchFile(String name) {
-    throw new UnsupportedOperationException();
-  }
-
-  public long fileLength(String name) throws IOException {
-    return fs.getFileStatus(new Path(directory, name)).getLen();
-  }
-
-  public void deleteFile(String name) throws IOException {
-    if (!fs.delete(new Path(directory, name), false))
-      throw new IOException("Cannot delete " + name);
-  }
-
-  public void renameFile(String from, String to) throws IOException {
-    // DFS is currently broken when target already exists,
-    // so we explicitly delete the target first.
-    Path target = new Path(directory, to);
-    if (fs.exists(target)) {
-      fs.delete(target, false);
-    }
-    fs.rename(new Path(directory, from), target);
-  }
-
-  public IndexOutput createOutput(String name) throws IOException {
-    Path file = new Path(directory, name);
-    if (fs.exists(file) && !fs.delete(file, false))      // delete existing, if any
-      throw new IOException("Cannot overwrite: " + file);
-
-    return new DfsIndexOutput(file, this.ioFileBufferSize);
-  }
-
-
-  public IndexInput openInput(String name) throws IOException {
-    return new DfsIndexInput(new Path(directory, name), this.ioFileBufferSize);
-  }
-
-  public Lock makeLock(final String name) {
-    return new Lock() {
-      public boolean obtain() {
-        return true;
-      }
-      public void release() {
-      }
-      public boolean isLocked() {
-        throw new UnsupportedOperationException();
-      }
-      public String toString() {
-        return "Lock@" + new Path(directory, name);
-      }
-    };
-  }
-
-  public synchronized void close() throws IOException {
-    fs.close();
-  }
-
-  public String toString() {
-    return this.getClass().getName() + "@" + directory;
-  }
-
-
-  private class DfsIndexInput extends BufferedIndexInput {
-
-    /** Shared by clones. */
-    private class Descriptor {
-      public FSDataInputStream in;
-      public long position;                       // cache of in.getPos()
-      public Descriptor(Path file, int ioFileBufferSize) throws IOException {
-        this.in = fs.open(file);
-      }
-    }
-
-    private final Descriptor descriptor;
-    private final long length;
-    private boolean isClone;
-
-    public DfsIndexInput(Path path, int ioFileBufferSize) throws IOException {
-      descriptor = new Descriptor(path,ioFileBufferSize);
-      length = fs.getFileStatus(path).getLen();
-    }
-
-    protected void readInternal(byte[] b, int offset, int len)
-      throws IOException {
-      synchronized (descriptor) {
-        long position = getFilePointer();
-        if (position != descriptor.position) {
-          descriptor.in.seek(position);
-          descriptor.position = position;
-        }
-        int total = 0;
-        do {
-          int i = descriptor.in.read(b, offset+total, len-total);
-          if (i == -1)
-            throw new IOException("read past EOF");
-          descriptor.position += i;
-          total += i;
-        } while (total < len);
-      }
-    }
-
-    public void close() throws IOException {
-      if (!isClone) {
-        descriptor.in.close();
-      }
-    }
-
-    protected void seekInternal(long position) {} // handled in readInternal()
-
-    public long length() {
-      return length;
-    }
-
-    protected void finalize() throws IOException {
-      close();                                      // close the file
-    }
-
-    public Object clone() {
-      DfsIndexInput clone = (DfsIndexInput)super.clone();
-      clone.isClone = true;
-      return clone;
-    }
-  }
-
-  private class DfsIndexOutput extends BufferedIndexOutput {
-    private FSDataOutputStream out;
-    private RandomAccessFile local;
-    private File localFile;
-
-    public DfsIndexOutput(Path path, int ioFileBufferSize) throws IOException {
-      
-      // create a temporary local file and set it to delete on exit
-      String randStr = Integer.toString(new Random().nextInt(Integer.MAX_VALUE));
-      localFile = File.createTempFile("index_" + randStr, ".tmp");
-      localFile.deleteOnExit();
-      local = new RandomAccessFile(localFile, "rw");
-
-      out = fs.create(path);
-    }
-
-    public void flushBuffer(byte[] b, int offset, int size) throws IOException {
-      local.write(b, offset, size);
-    }
-
-    public void close() throws IOException {
-      super.close();
-      
-      // transfer to dfs from local
-      byte[] buffer = new byte[4096];
-      local.seek(0);
-      int read = -1;
-      while ((read = local.read(buffer)) != -1) {
-        out.write(buffer, 0, read);
-      }
-      out.close();
-      local.close();
-    }
-
-    public void seek(long pos) throws IOException {
-      super.seek(pos);
-      local.seek(pos);
-    }
-
-    public long length() throws IOException {
-      return local.length();
-    }
-
-  }
-
-}
Index: src/java/org/apache/nutch/indexer/IndexingFilters.java
===================================================================
--- src/java/org/apache/nutch/indexer/IndexingFilters.java	(revision 959954)
+++ src/java/org/apache/nutch/indexer/IndexingFilters.java	(working copy)
@@ -70,7 +70,6 @@
               .getExtensionInstance();
           LOG.info("Adding " + filter.getClass().getName());
           if (!filterMap.containsKey(filter.getClass().getName())) {
-            filter.addIndexBackendOptions(conf);
             filterMap.put(filter.getClass().getName(), filter);
           }
         }
@@ -89,7 +88,6 @@
             IndexingFilter filter = filterMap
                 .get(orderedFilters[i]);
             if (filter != null) {
-              filter.addIndexBackendOptions(conf);
               filters.add(filter);
             }
           }
Index: src/java/org/apache/nutch/indexer/Indexer.java
===================================================================
--- src/java/org/apache/nutch/indexer/Indexer.java	(revision 959954)
+++ src/java/org/apache/nutch/indexer/Indexer.java	(working copy)
@@ -1,104 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.indexer;
-
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.List;
-
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.conf.Configured;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.mapred.FileOutputFormat;
-import org.apache.hadoop.mapred.JobClient;
-import org.apache.hadoop.mapred.JobConf;
-import org.apache.hadoop.util.StringUtils;
-import org.apache.hadoop.util.Tool;
-import org.apache.hadoop.util.ToolRunner;
-import org.apache.nutch.indexer.lucene.LuceneWriter;
-import org.apache.nutch.util.NutchConfiguration;
-import org.apache.nutch.util.NutchJob;
-
-/** Create indexes for segments. */
-public class Indexer extends Configured implements Tool {
-
-  public static final String DONE_NAME = "index.done";
-
-  public static final Log LOG = LogFactory.getLog(Indexer.class);
-
-  public Indexer() {
-    super(null);
-  }
-
-  public Indexer(Configuration conf) {
-    super(conf);
-  }
-
-  public void index(Path luceneDir, Path crawlDb,
-                    Path linkDb, List<Path> segments)
-  throws IOException {
-    LOG.info("Indexer: starting");
-
-    final JobConf job = new NutchJob(getConf());
-    job.setJobName("index-lucene " + luceneDir);
-
-    IndexerMapReduce.initMRJob(crawlDb, linkDb, segments, job);
-
-    FileOutputFormat.setOutputPath(job, luceneDir);
-
-    LuceneWriter.addFieldOptions("segment", LuceneWriter.STORE.YES, LuceneWriter.INDEX.NO, job);
-    LuceneWriter.addFieldOptions("digest", LuceneWriter.STORE.YES, LuceneWriter.INDEX.NO, job);
-    LuceneWriter.addFieldOptions("boost", LuceneWriter.STORE.YES, LuceneWriter.INDEX.NO, job);
-
-    NutchIndexWriterFactory.addClassToConf(job, LuceneWriter.class);
-
-    JobClient.runJob(job);
-    LOG.info("Indexer: done");
-  }
-
-  public int run(String[] args) throws Exception {
-    if (args.length < 4) {
-      System.err.println("Usage: Indexer <index> <crawldb> <linkdb> <segment> ...");
-      return -1;
-    }
-
-    final Path luceneDir = new Path(args[0]);
-    final Path crawlDb = new Path(args[1]);
-    final Path linkDb = new Path(args[2]);
-
-    final List<Path> segments = new ArrayList<Path>();
-    for (int i = 3; i < args.length; i++) {
-      segments.add(new Path(args[i]));
-    }
-
-    try {
-      index(luceneDir, crawlDb, linkDb, segments);
-      return 0;
-    } catch (final Exception e) {
-      LOG.fatal("Indexer: " + StringUtils.stringifyException(e));
-      return -1;
-    }
-  }
-
-  public static void main(String[] args) throws Exception {
-    final int res = ToolRunner.run(NutchConfiguration.create(), new Indexer(), args);
-    System.exit(res);
-  }
-}
Index: src/java/org/apache/nutch/tools/PruneIndexTool.java
===================================================================
--- src/java/org/apache/nutch/tools/PruneIndexTool.java	(revision 959954)
+++ src/java/org/apache/nutch/tools/PruneIndexTool.java	(working copy)
@@ -1,561 +0,0 @@
-/*
- * Created on Nov 2, 2004
- * Author: Andrzej Bialecki <ab@getopt.org>
- *
- */
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.tools;
-
-import java.io.*;
-import java.util.*;
-
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.lucene.analysis.WhitespaceAnalyzer;
-import org.apache.lucene.document.Document;
-import org.apache.lucene.index.*;
-import org.apache.lucene.queryParser.QueryParser;
-import org.apache.lucene.search.*;
-import org.apache.lucene.store.Directory;
-import org.apache.lucene.store.FSDirectory;
-import org.apache.lucene.util.Version;
-import org.apache.nutch.util.NutchConfiguration;
-
-/**
- * This tool prunes existing Nutch indexes of unwanted content. The main method
- * accepts a list of segment directories (containing indexes). These indexes will
- * be pruned of any content that matches one or more query from a list of Lucene
- * queries read from a file (defined in standard config file, or explicitly
- * overridden from command-line). Segments should already be indexed, if some
- * of them are missing indexes then these segments will be skipped.
- * 
- * <p>NOTE 1: Queries are expressed in Lucene's QueryParser syntax, so a knowledge
- * of available Lucene document fields is required. This can be obtained by reading sources
- * of <code>index-basic</code> and <code>index-more</code> plugins, or using tools
- * like <a href="http://www.getopt.org/luke">Luke</a>. During query parsing a
- * WhitespaceAnalyzer is used - this choice has been made to minimize side effects of
- * Analyzer on the final set of query terms. You can use {@link org.apache.nutch.searcher.Query#main(String[])}
- * method to translate queries in Nutch syntax to queries in Lucene syntax.<br>
- * If additional level of control is required, an instance of {@link PruneChecker} can
- * be provided to check each document before it's deleted. The results of all
- * checkers are logically AND-ed, which means that any checker in the chain
- * can veto the deletion of the current document. Two example checker implementations
- * are provided - PrintFieldsChecker prints the values of selected index fields,
- * StoreUrlsChecker stores the URLs of deleted documents to a file. Any of them can
- * be activated by providing respective command-line options.
- * </p>
- * <p>The typical command-line usage is as follows:<br>
- * <blockquote>
- * <code>PruneIndexTool index_dir -dryrun -queries queries.txt -showfields url,title</code><br>
- * This command will just print out fields of matching documents.<br>
- * <code>PruneIndexTool index_dir -queries queries.txt</code><br>
- * This command will actually remove all matching entries, according to the
- * queries read from <code>queries.txt</code> file.
- * </blockquote></p>
- * <p>NOTE 2: This tool removes matching documents ONLY from segment indexes (or
- * from a merged index). In particular it does NOT remove the pages and links
- * from WebDB. This means that unwanted URLs may pop up again when new segments
- * are created. To prevent this, use your own {@link org.apache.nutch.net.URLFilter},
- * or PruneDBTool (under construction...).</p>
- * <p>NOTE 3: This tool uses a low-level Lucene interface to collect all matching
- * documents. For large indexes and broad queries this may result in high memory
- * consumption. If you encounter OutOfMemory exceptions, try to narrow down your
- * queries, or increase the heap size.</p>
- * 
- * @author Andrzej Bialecki &lt;ab@getopt.org&gt;
- */
-public class PruneIndexTool implements Runnable {
-  public static final Log LOG = LogFactory.getLog(PruneIndexTool.class);
-  /** Log the progress every LOG_STEP number of processed documents. */
-  public static int LOG_STEP = 50000;
-  
-  /**
-   * This interface can be used to implement additional checking on matching
-   * documents.
-   * @author Andrzej Bialecki &lt;ab@getopt.org&gt;
-   */
-  public static interface PruneChecker {
-    /**
-     * Check whether this document should be pruned. NOTE: this method
-     * MUST NOT modify the IndexReader.
-     * @param reader index reader to read documents from
-     * @param docNum document ID
-     * @return true if the document should be deleted, false otherwise.
-     */
-    public boolean isPrunable(Query q, IndexReader reader, int docNum) throws Exception;
-    /**
-     * Close the checker - this could involve flushing output files or somesuch.
-     */
-    public void close();
-  }
-
-  /**
-   * This checker's main function is just to print out
-   * selected field values from each document, just before
-   * they are deleted.
-   * 
-   * @author Andrzej Bialecki &lt;ab@getopt.org&gt;
-   */
-  public static class PrintFieldsChecker implements PruneChecker {
-    private PrintStream ps = null;
-    private String[] fields = null;
-    
-    /**
-     * 
-     * @param ps an instance of PrintStream to print the information to
-     * @param fields a list of Lucene index field names. Values from these
-     * fields will be printed for every matching document.
-     */
-    public PrintFieldsChecker(PrintStream ps, String[] fields) {
-      this.ps = ps;
-      this.fields = fields;
-    }
-
-    public void close() {
-      ps.flush();
-    }
-    
-    public boolean isPrunable(Query q, IndexReader reader, int docNum) throws Exception {
-      Document doc = reader.document(docNum);
-      StringBuffer sb = new StringBuffer("#" + docNum + ":");
-      for (int i = 0; i < fields.length; i++) {
-        String[] values = doc.getValues(fields[i]);
-        sb.append(" " + fields[i] + "=");
-        if (values != null) {
-          for (int k = 0; k < values.length; k++) {
-            sb.append("[" + values[k] + "]");
-          }
-        } else sb.append("[null]");
-      }
-      ps.println(sb.toString());
-      return true;
-    }
-  }
-
-  /**
-   * This checker's main function is just to store
-   * the URLs of each document to be deleted in a text file.
-   * 
-   * @author Andrzej Bialecki &lt;ab@getopt.org&gt;
-   */
-  public static class StoreUrlsChecker implements PruneChecker {
-    private BufferedWriter output = null;
-    private boolean storeHomeUrl = false;
-    
-    /**
-     * Store the list in a file
-     * @param out name of the output file
-     */
-    public StoreUrlsChecker(File out, boolean storeHomeUrl) throws Exception {
-      this.output = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(out), "UTF-8"));
-      this.storeHomeUrl = storeHomeUrl;
-    }
-    
-    public void close() {
-      try {
-        output.flush();
-        output.close();
-      } catch (Exception e) {
-        if (LOG.isWarnEnabled()) {
-          LOG.warn("Error closing: " + e.getMessage());
-        }
-      }
-    }
-    
-    public boolean isPrunable(Query q, IndexReader reader, int docNum) throws Exception {
-      Document doc = reader.document(docNum);
-      String url = doc.get("url");
-      output.write(url); output.write('\n');
-      if (storeHomeUrl) {
-        // store also the main url
-        int idx = url.indexOf("://");
-        if (idx != -1) {
-          idx = url.indexOf('/', idx + 3);
-          if (idx != -1) {
-            output.write(url.substring(0, idx + 1) + "\n");
-          }
-        }
-      }
-      return true;
-    }
-  }
-
-  private Query[] queries = null;
-  private IndexReader reader = null;
-  private IndexSearcher searcher = null;
-  private PruneChecker[] checkers = null;
-  private boolean dryrun = false;
-  private String dr = "";
-  
-  /**
-   * Create an instance of the tool, and open all input indexes.
-   * @param indexDirs directories with input indexes. At least one valid index must
-   * exist, otherwise an Exception is thrown.
-   * @param queries pruning queries. Each query will be processed in turn, and the
-   * length of the array must be at least one, otherwise an Exception is thrown.
-   * @param checkers if not null, they will be used to perform additional
-   * checks on matching documents - each checker's method {@link PruneChecker#isPrunable(Query, IndexReader, int)}
-   * will be called in turn, for each matching document, and if it returns true this means that
-   * the document should be deleted. A logical AND is performed on the results returned
-   * by all checkers (which means that if one of them returns false, the document will
-   * not be deleted).
-   * @param unlock if true, and if any of the input indexes is locked, forcibly
-   * unlock it. Use with care, only when you are sure that other processes don't
-   * modify the index at the same time.
-   * @param dryrun if set to true, don't change the index, just show what would be done.
-   * If false, perform all actions, changing indexes as needed. Note: dryrun doesn't prevent
-   * PruneCheckers from performing changes or causing any other side-effects.
-   * @throws Exception
-   */
-  public PruneIndexTool(File[] indexDirs, Query[] queries, PruneChecker[] checkers,
-          boolean unlock, boolean dryrun) throws Exception {
-    if (indexDirs == null || queries == null)
-      throw new Exception("Invalid arguments.");
-    if (indexDirs.length == 0 || queries.length == 0)
-      throw new Exception("Nothing to do.");
-    this.queries = queries;
-    this.checkers = checkers;
-    this.dryrun = dryrun;
-    if (dryrun) dr = "[DRY RUN] ";
-    int numIdx = 0;
-    if (indexDirs.length == 1) {
-      Directory dir = FSDirectory.open(indexDirs[0]);
-      if (IndexWriter.isLocked(dir)) {
-        if (!unlock) {
-          throw new Exception("Index " + indexDirs[0] + " is locked.");
-        }
-        if (!dryrun) {
-          IndexWriter.unlock(dir);
-          if (LOG.isDebugEnabled()) {
-            LOG.debug(" - had to unlock index in " + dir);
-          }
-        }
-      }
-      reader = IndexReader.open(dir);
-      numIdx = 1;
-    } else {
-      Directory dir;
-      Vector<IndexReader> indexes = new Vector<IndexReader>(indexDirs.length);
-      for (int i = 0; i < indexDirs.length; i++) {
-        try {
-          dir = FSDirectory.open(indexDirs[i]);
-          if (IndexWriter.isLocked(dir)) {
-            if (!unlock) {
-              if (LOG.isWarnEnabled()) {
-                LOG.warn(dr + "Index " + indexDirs[i] + " is locked. Skipping...");
-              }
-              continue;
-            }
-            if (!dryrun) {
-              IndexWriter.unlock(dir);
-              if (LOG.isDebugEnabled()) {
-                LOG.debug(" - had to unlock index in " + dir);
-              }
-            }
-          }
-          IndexReader r = IndexReader.open(dir);
-          indexes.add(r);
-          numIdx++;
-        } catch (Exception e) {
-          if (LOG.isWarnEnabled()) {
-            LOG.warn(dr + "Invalid index in " + indexDirs[i] + " - skipping...");
-          }
-        }
-      }
-      if (indexes.size() == 0) throw new Exception("No input indexes.");
-      IndexReader[] readers = indexes.toArray(new IndexReader[0]);
-      reader = new MultiReader(readers);
-    }
-    if (LOG.isInfoEnabled()) {
-      LOG.info(dr + "Opened " + numIdx + " index(es) with total " +
-               reader.numDocs() + " documents.");
-    }
-    searcher = new IndexSearcher(reader);
-  }
-  
-  /**
-   * This class collects all matching document IDs in a BitSet.
-   * <p>NOTE: the reason to use this API is that the most common way of
-   * performing Lucene queries (Searcher.search(Query)::Hits) does NOT
-   * return all matching documents, because it skips very low scoring hits.</p>
-   * 
-   * @author Andrzej Bialecki &lt;ab@getopt.org&gt;
-   */
-  private static class AllHitsCollector extends Collector {
-    private BitSet bits;
-    
-    public AllHitsCollector(BitSet bits) {
-      this.bits = bits;
-    }
-
-    public void collect(int doc) {
-      bits.set(doc);
-    }
-
-    @Override
-    public boolean acceptsDocsOutOfOrder() {
-      return false;
-    }
-
-    @Override
-    public void setNextReader(IndexReader paramIndexReader, int paramInt) throws IOException {
-      // Do nothing.
-    }
-
-    @Override
-    public void setScorer(Scorer paramScorer) throws IOException {
-      // Do nothing.
-    }
-  }
-  
-  /**
-   * For each query, find all matching documents and delete them from all input
-   * indexes. Optionally, an additional check can be performed by using {@link PruneChecker}
-   * implementations.
-   */
-  public void run() {
-    BitSet bits = new BitSet(reader.maxDoc());
-    AllHitsCollector ahc = new AllHitsCollector(bits);
-    boolean doDelete = false;
-    for (int i = 0; i < queries.length; i++) {
-      if (LOG.isInfoEnabled()) {
-        LOG.info(dr + "Processing query: " + queries[i].toString());
-      }
-      bits.clear();
-      try {
-        searcher.search(queries[i], ahc);
-      } catch (IOException e) {
-        if (LOG.isWarnEnabled()) {
-          LOG.warn(dr + " - failed: " + e.getMessage());
-        }
-        continue;
-      }
-      if (bits.cardinality() == 0) {
-        if (LOG.isInfoEnabled()) {
-          LOG.info(dr + " - no matching documents.");
-        }
-        continue;
-      }
-      if (LOG.isInfoEnabled()) {
-        LOG.info(dr + " - found " + bits.cardinality() + " document(s).");
-      }
-      // Now delete all matching documents
-      int docNum = -1, start = 0, cnt = 0;
-      // probably faster than looping sequentially through all index values?
-      while ((docNum = bits.nextSetBit(start)) != -1) {
-        // don't delete the same document multiple times
-        if (reader.isDeleted(docNum)) continue;
-        try {
-          if (checkers != null && checkers.length > 0) {
-            boolean check = true;
-            for (int k = 0; k < checkers.length; k++) {
-              // fail if any checker returns false
-              check &= checkers[k].isPrunable(queries[i], reader, docNum);
-            }
-            doDelete = check;
-          } else doDelete = true;
-          if (doDelete) {
-            if (!dryrun) reader.deleteDocument(docNum);
-            cnt++;
-          }
-        } catch (Exception e) {
-          if (LOG.isWarnEnabled()) {
-            LOG.warn(dr + " - failed to delete doc #" + docNum);
-          }
-        }
-        start = docNum + 1;
-      }
-      if (LOG.isInfoEnabled()) {
-        LOG.info(dr + " - deleted " + cnt + " document(s).");
-      }
-    }
-    // close checkers
-    if (checkers != null) {
-      for (int i = 0; i < checkers.length; i++) {
-        checkers[i].close();
-      }
-    }
-    try {
-      reader.close();
-    } catch (IOException e) {
-      if (LOG.isWarnEnabled()) {
-        LOG.warn(dr + "Exception when closing reader(s): " + e.getMessage());
-      }
-    }
-  }
-  
-  public static void main(String[] args) throws Exception {
-    if (args.length == 0) {
-      usage();
-      if (LOG.isFatalEnabled()) { LOG.fatal("Missing arguments"); }
-      return;
-    }
-    File idx = new File(args[0]);
-    if (!idx.isDirectory()) {
-      usage();
-      if (LOG.isFatalEnabled()) { LOG.fatal("Not a directory: " + idx); }
-      return;
-    }
-    Vector<File> paths = new Vector<File>();
-    if (IndexReader.indexExists(FSDirectory.open(idx))) {
-      paths.add(idx);
-    } else {
-      // try and see if there are segments inside, with index dirs
-      File[] dirs = idx.listFiles(new FileFilter() {
-        public boolean accept(File f) {
-          return f.isDirectory();
-        }
-      });
-      if (dirs == null || dirs.length == 0) {
-        usage();
-        if (LOG.isFatalEnabled()) { LOG.fatal("No indexes in " + idx); }
-        return;
-      }
-      for (int i = 0; i < dirs.length; i++) {
-        File sidx = new File(dirs[i], "index");
-        if (sidx.exists() && sidx.isDirectory() 
-            && IndexReader.indexExists(FSDirectory.open(sidx))) {
-          paths.add(sidx);
-        }
-      }
-      if (paths.size() == 0) {
-        usage();
-        if (LOG.isFatalEnabled()) {
-          LOG.fatal("No indexes in " + idx + " or its subdirs.");
-        }
-        return;
-      }
-    }
-    File[] indexes = paths.toArray(new File[0]);
-    boolean force = false;
-    boolean dryrun = false;
-    String qPath = null;
-    String outPath = null;
-    String fList = null;
-    for (int i = 1; i < args.length; i++) {
-      if (args[i].equals("-force")) {
-        force = true;
-      } else if (args[i].equals("-queries")) {
-        qPath = args[++i];
-      } else if (args[i].equals("-output")) {
-        outPath = args[++i];
-      } else if (args[i].equals("-showfields")) {
-        fList = args[++i];
-      } else if (args[i].equals("-dryrun")) {
-        dryrun = true;
-      } else {
-        usage();
-        if (LOG.isFatalEnabled()) {
-          LOG.fatal("Unrecognized option: " + args[i]);
-        }
-        return;
-      }
-    }
-    Vector<PruneChecker> cv = new Vector<PruneChecker>();
-    if (fList != null) {
-      StringTokenizer st = new StringTokenizer(fList, ",");
-      Vector<String> tokens = new Vector<String>();
-      while (st.hasMoreTokens()) tokens.add(st.nextToken());
-      String[] fields = tokens.toArray(new String[0]);
-      PruneChecker pc = new PrintFieldsChecker(System.out, fields);
-      cv.add(pc);
-    }
-    
-    if (outPath != null) {
-      StoreUrlsChecker luc = new StoreUrlsChecker(new File(outPath), false);
-      cv.add(luc);
-    }
-
-    PruneChecker[] checkers = null;
-    if (cv.size() > 0) {
-      checkers = cv.toArray(new PruneChecker[0]);
-    }
-    Query[] queries = null;
-    InputStream is = null;
-    if (qPath != null) {
-      is = new FileInputStream(qPath);
-    } else {
-        Configuration conf = NutchConfiguration.create();
-        qPath = conf.get("prune.index.tool.queries");
-        is = conf.getConfResourceAsInputStream(qPath);
-    }
-    if (is == null) {
-      if (LOG.isFatalEnabled()) {
-        LOG.fatal("Can't load queries from " + qPath);
-      }
-      return;
-    }
-    try {
-      queries = parseQueries(is);
-    } catch (Exception e) {
-      if (LOG.isFatalEnabled()) {
-        LOG.fatal("Error parsing queries: " + e.getMessage());
-      }
-      return;
-    }
-    try {
-      PruneIndexTool pit = new PruneIndexTool(indexes, queries, checkers, force, dryrun);
-      pit.run();
-    } catch (Exception e) {
-      if (LOG.isFatalEnabled()) {
-        LOG.fatal("Error running PruneIndexTool: " + e.getMessage());
-      }
-      return;
-    }
-  }
-  
-  /**
-   * Read a list of Lucene queries from the stream (UTF-8 encoding is assumed).
-   * There should be a single Lucene query per line. Blank lines and comments
-   * starting with '#' are allowed.
-   * <p>NOTE: you may wish to use {@link org.apache.nutch.searcher.Query#main(String[])}
-   * method to translate queries from Nutch format to Lucene format.</p>
-   * @param is InputStream to read from
-   * @return array of Lucene queries
-   * @throws Exception
-   */
-  public static Query[] parseQueries(InputStream is) throws Exception {
-    BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"));
-    String line = null;
-    QueryParser qp = new QueryParser(Version.LUCENE_CURRENT, "url", new WhitespaceAnalyzer());
-    Vector<Query> queries = new Vector<Query>();
-    while ((line = br.readLine()) != null) {
-      line = line.trim();
-      //skip blanks and comments
-      if (line.length() == 0 || line.charAt(0) == '#') continue;
-      Query q = qp.parse(line);
-      queries.add(q);
-    }
-    return queries.toArray(new Query[0]);
-  }
-  
-  private static void usage() {
-    System.err.println("PruneIndexTool <indexDir | segmentsDir> [-dryrun] [-force] [-queries filename] [-output filename] [-showfields field1,field2,field3...]");
-    System.err.println("\tNOTE: exactly one of <indexDir> or <segmentsDir> MUST be provided!\n");
-    System.err.println("\t-dryrun\t\t\tdon't do anything, just show what would be done.");
-    System.err.println("\t-force\t\t\tforce index unlock, if locked. Use with caution!");
-    System.err.println("\t-queries filename\tread pruning queries from this file, instead of the");
-    System.err.println("\t\t\t\tdefault defined in Nutch config files under 'prune.index.tool.queries' key.\n");
-    System.err.println("\t-output filename\tstore pruned URLs in a text file");
-    System.err.println("\t-showfields field1,field2...\tfor each deleted document show the values of the selected fields.");
-    System.err.println("\t\t\t\tNOTE 1: this will slow down processing by orders of magnitude.");
-    System.err.println("\t\t\t\tNOTE 2: only values of stored fields will be shown.");
-  }
-}
Index: src/java/org/apache/nutch/tools/compat/CrawlDbConverter.java
===================================================================
--- src/java/org/apache/nutch/tools/compat/CrawlDbConverter.java	(revision 959954)
+++ src/java/org/apache/nutch/tools/compat/CrawlDbConverter.java	(working copy)
@@ -1,149 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.tools.compat;
-
-import java.io.IOException;
-import java.util.Iterator;
-import java.util.Random;
-
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-import org.apache.hadoop.conf.Configured;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.MapWritable;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.io.UTF8;
-import org.apache.hadoop.io.Writable;
-import org.apache.hadoop.io.WritableComparable;
-import org.apache.hadoop.mapred.FileInputFormat;
-import org.apache.hadoop.mapred.FileOutputFormat;
-import org.apache.hadoop.mapred.JobClient;
-import org.apache.hadoop.mapred.JobConf;
-import org.apache.hadoop.mapred.MapFileOutputFormat;
-import org.apache.hadoop.mapred.Mapper;
-import org.apache.hadoop.mapred.OutputCollector;
-import org.apache.hadoop.mapred.Reporter;
-import org.apache.hadoop.mapred.SequenceFileInputFormat;
-import org.apache.hadoop.util.StringUtils;
-import org.apache.hadoop.util.Tool;
-import org.apache.hadoop.util.ToolRunner;
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.crawl.CrawlDb;
-import org.apache.nutch.util.NutchConfiguration;
-import org.apache.nutch.util.NutchJob;
-
-/**
- * This tool converts CrawlDb created in old &lt;UTF8, CrawlDatum&gt; format
- * (Nutch versions < 0.9.0) to the new &lt;Text, CrawlDatum&gt; format.
- * Optionally {@link org.apache.nutch.crawl.CrawlDatum#metaData} can be converted
- * too from using UTF8 keys to using Text keys.
- * 
- * @author Andrzej Bialecki
- */
-public class CrawlDbConverter extends Configured implements Tool,
-    Mapper<WritableComparable, CrawlDatum, Text, CrawlDatum> {
-  private static final Log LOG = LogFactory.getLog(CrawlDbConverter.class);
-  
-  private static final String CONVERT_META_KEY = "db.converter.with.metadata";
-
-  private boolean withMetadata;
-  private Text newKey;
-  
-  public void configure(JobConf job) {
-    setConf(job);
-    withMetadata = job.getBoolean(CONVERT_META_KEY, false);
-    newKey = new Text();
-  }
-
-  public void map(WritableComparable key, CrawlDatum value,
-      OutputCollector<Text, CrawlDatum> output,
-      Reporter reporter) throws IOException {
-    newKey.set(key.toString());
-    if (withMetadata) {
-      CrawlDatum datum = (CrawlDatum)value;
-      MapWritable meta = datum.getMetaData();
-      if (meta.size() > 0) {
-        MapWritable newMeta = new MapWritable();
-        Iterator it = meta.keySet().iterator();
-        while (it.hasNext()) {
-          WritableComparable k = (WritableComparable)it.next();
-          Writable v = meta.get(k);
-          if (k instanceof UTF8) {
-            Text t = new Text(k.toString());
-            k = t;
-          }
-          newMeta.put(k, v);
-        }
-        datum.setMetaData(newMeta);
-      }
-    }
-    output.collect(newKey, value);
-  }
-
-  public void close() throws IOException {
-  }
-
-  /**
-   * @param args
-   */
-  public static void main(String[] args) throws Exception {
-    int res = ToolRunner.run(NutchConfiguration.create(), new CrawlDbConverter(), args);
-    System.exit(res);
-  }
-
-  public int run(String[] args) throws Exception {
-    if (args.length == 0) {
-      System.err.println("Usage: CrawlDbConverter <oldDb> <newDb> [-withMetadata]");
-      System.err.println("\toldDb\tname of the crawldb that uses UTF8 class.");
-      System.err.println("\tnewDb\tname of the output crawldb that will use Text class.");
-      System.err.println("\twithMetadata\tconvert also all metadata keys that use UTF8 to Text.");
-      return -1;
-    }
-    JobConf job = new NutchJob(getConf());
-    FileSystem fs = FileSystem.get(getConf());
-    Path oldDb = new Path(args[0], CrawlDb.CURRENT_NAME);
-    Path newDb =
-      new Path(oldDb,
-               Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
-    if (!fs.exists(oldDb)) {
-      LOG.fatal("Old db doesn't exist in '" + args[0] + "'");
-      return -1;
-    }
-    boolean withMetadata = false;
-    if (args.length > 2 && args[2].equalsIgnoreCase("-withMetadata"))
-      withMetadata = true;
-    
-    job.setBoolean(CONVERT_META_KEY, withMetadata);
-    FileInputFormat.addInputPath(job, oldDb);
-    job.setInputFormat(SequenceFileInputFormat.class);
-    job.setMapperClass(CrawlDbConverter.class);
-    job.setOutputFormat(MapFileOutputFormat.class);
-    job.setOutputKeyClass(Text.class);
-    job.setOutputValueClass(CrawlDatum.class);
-    FileOutputFormat.setOutputPath(job, newDb);
-    try {
-      JobClient.runJob(job);
-      CrawlDb.install(job, new Path(args[1]));
-      return 0;
-    } catch (Exception e) {
-      LOG.fatal("Error: " + StringUtils.stringifyException(e));
-      return -1;
-    }
-  }
-}
Index: src/java/org/apache/nutch/tools/compat/ReprUrlFixer.java
===================================================================
--- src/java/org/apache/nutch/tools/compat/ReprUrlFixer.java	(revision 959954)
+++ src/java/org/apache/nutch/tools/compat/ReprUrlFixer.java	(working copy)
@@ -1,312 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.tools.compat;
-
-import java.io.IOException;
-import java.net.MalformedURLException;
-import java.net.URL;
-import java.util.ArrayList;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Random;
-
-import org.apache.commons.cli.CommandLine;
-import org.apache.commons.cli.CommandLineParser;
-import org.apache.commons.cli.GnuParser;
-import org.apache.commons.cli.HelpFormatter;
-import org.apache.commons.cli.Option;
-import org.apache.commons.cli.OptionBuilder;
-import org.apache.commons.cli.Options;
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.conf.Configured;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.MapWritable;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.io.WritableUtils;
-import org.apache.hadoop.mapred.FileInputFormat;
-import org.apache.hadoop.mapred.FileOutputFormat;
-import org.apache.hadoop.mapred.JobClient;
-import org.apache.hadoop.mapred.JobConf;
-import org.apache.hadoop.mapred.MapFileOutputFormat;
-import org.apache.hadoop.mapred.OutputCollector;
-import org.apache.hadoop.mapred.Reducer;
-import org.apache.hadoop.mapred.Reporter;
-import org.apache.hadoop.mapred.SequenceFileInputFormat;
-import org.apache.hadoop.util.StringUtils;
-import org.apache.hadoop.util.Tool;
-import org.apache.hadoop.util.ToolRunner;
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.crawl.CrawlDb;
-import org.apache.nutch.metadata.Nutch;
-import org.apache.nutch.scoring.webgraph.Node;
-import org.apache.nutch.util.FSUtils;
-import org.apache.nutch.util.NutchConfiguration;
-import org.apache.nutch.util.NutchJob;
-import org.apache.nutch.util.URLUtil;
-
-/**
- * <p>
- * Significant changes were made to representative url logic used for redirects.
- * This tool will fix representative urls stored in current segments and crawl
- * databases. Any new fetches will use the new representative url logic.
- * </p>
- * 
- * <p>
- * All crawl datums are assumed to be temp url redirects. While this may cause
- * some urls to be incorrectly removed, this tool is a temporary measure to be
- * used until fetches can be rerun. This reduce logic is the same for segments
- * fetch and parse directory as well as for existing crawl databases.
- * </p>
- */
-public class ReprUrlFixer
-  extends Configured
-  implements Tool, Reducer<Text, CrawlDatum, Text, CrawlDatum> {
-
-  public static final Log LOG = LogFactory.getLog(ReprUrlFixer.class);
-  private JobConf conf;
-
-  public void configure(JobConf conf) {
-    this.conf = conf;
-  }
-
-  /**
-   * Runs the new ReprUrl logic on all crawldatums.
-   */
-  public void reduce(Text key, Iterator<CrawlDatum> values,
-    OutputCollector<Text, CrawlDatum> output, Reporter reporter)
-    throws IOException {
-
-    String url = key.toString();
-    Node node = null;
-    List<CrawlDatum> datums = new ArrayList<CrawlDatum>();
-
-    // get all crawl datums for a given url key, fetch for instance can have
-    // more than one under a given key if there are multiple redirects to a
-    // given url
-    while (values.hasNext()) {
-      CrawlDatum datum = values.next();
-      datums.add((CrawlDatum)WritableUtils.clone(datum, conf));
-    }
-
-    // apply redirect repr url logic for each datum
-    for (CrawlDatum datum : datums) {
-
-      MapWritable metadata = datum.getMetaData();
-      Text reprUrl = (Text)metadata.get(Nutch.WRITABLE_REPR_URL_KEY);
-      byte status = datum.getStatus();
-      boolean isCrawlDb = (CrawlDatum.hasDbStatus(datum));
-      boolean segFetched = (status == CrawlDatum.STATUS_FETCH_SUCCESS);
-
-      // only if the crawl datum is from the crawldb or is a successfully
-      // fetched page from the segments
-      if ((isCrawlDb || segFetched) && reprUrl != null) {
-
-        String src = reprUrl.toString();
-        String dest = url;
-        URL srcUrl = null;
-        URL dstUrl = null;
-
-        // both need to be well formed urls
-        try {
-          srcUrl = new URL(src);
-          dstUrl = new URL(url);
-        }
-        catch (MalformedURLException e) {
-        }
-
-        // if the src and repr urls are the same after the new logic then
-        // remove the repr url from the metadata as it is no longer needed
-        if (srcUrl != null && dstUrl != null) {
-          String reprOut = URLUtil.chooseRepr(src, dest, true);
-          if (reprOut.equals(dest)) {
-            LOG.info("Removing " + reprOut + " from " + dest);
-            metadata.remove(Nutch.WRITABLE_REPR_URL_KEY);
-          }
-        }
-      }
-
-      // collect each datum
-      output.collect(key, datum);
-    }
-
-  }
-
-  public void close() {
-  }
-
-  /**
-   * Run the fixer on any crawl database and segments specified.
-   */
-  public void update(Path crawlDb, Path[] segments)
-    throws IOException {
-
-    Configuration conf = getConf();
-    FileSystem fs = FileSystem.get(conf);
-
-    // run the crawl database through the repr fixer
-    if (crawlDb != null) {
-
-      LOG.info("Running ReprUtilFixer " + crawlDb);
-      Path crawlDbCurrent = new Path(crawlDb, CrawlDb.CURRENT_NAME);
-      Path newCrawlDb = new Path(crawlDb,
-        Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
-
-      JobConf updater = new NutchJob(conf);
-      updater.setJobName("ReprUtilFixer: " + crawlDb.toString());
-      FileInputFormat.addInputPath(updater, crawlDbCurrent);
-      FileOutputFormat.setOutputPath(updater, newCrawlDb);
-      updater.setInputFormat(SequenceFileInputFormat.class);
-      updater.setReducerClass(ReprUrlFixer.class);
-      updater.setOutputKeyClass(Text.class);
-      updater.setOutputValueClass(CrawlDatum.class);
-      updater.setOutputFormat(MapFileOutputFormat.class);
-
-      try {
-        JobClient.runJob(updater);
-        LOG.info("Installing new crawldb " + crawlDb);
-        CrawlDb.install(updater, crawlDb);
-      }
-      catch (IOException e) {
-        LOG.error(StringUtils.stringifyException(e));
-        throw e;
-      }
-    }
-
-    // run the segments through the repr fixer, logic will be run on both the
-    // crawl_parse and the crawl_fetch directories for every segment specified
-    if (segments != null) {
-
-      for (int i = 0; i < segments.length; i++) {
-
-        Path segment = segments[i];
-        LOG.info("Running ReprUtilFixer " + segment + " fetch");
-        Path segFetch = new Path(segment, CrawlDatum.FETCH_DIR_NAME);
-        Path newSegFetch = new Path(segment, CrawlDatum.FETCH_DIR_NAME + "-"
-          + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
-
-        JobConf fetch = new NutchJob(conf);
-        fetch.setJobName("ReprUtilFixer: " + segment.toString());
-        FileInputFormat.addInputPath(fetch, segFetch);
-        FileOutputFormat.setOutputPath(fetch, newSegFetch);
-        fetch.setInputFormat(SequenceFileInputFormat.class);
-        fetch.setReducerClass(ReprUrlFixer.class);
-        fetch.setOutputKeyClass(Text.class);
-        fetch.setOutputValueClass(CrawlDatum.class);
-        fetch.setOutputFormat(MapFileOutputFormat.class);
-
-        try {
-          JobClient.runJob(fetch);
-          LOG.info("Installing new segment fetch directory " + newSegFetch);
-          FSUtils.replace(fs, segFetch, newSegFetch, true);
-          LOG.info("ReprUrlFixer: finished installing segment fetch directory");
-        }
-        catch (IOException e) {
-          LOG.error(StringUtils.stringifyException(e));
-          throw e;
-        }
-
-        LOG.info("Running ReprUtilFixer " + segment + " parse");
-        Path segParse = new Path(segment, CrawlDatum.PARSE_DIR_NAME);
-        Path newSegParse = new Path(segment, CrawlDatum.PARSE_DIR_NAME + "-"
-          + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
-
-        JobConf parse = new NutchJob(conf);
-        parse.setJobName("ReprUtilFixer: " + segment.toString());
-        FileInputFormat.addInputPath(parse, segParse);
-        FileOutputFormat.setOutputPath(parse, newSegParse);
-        parse.setInputFormat(SequenceFileInputFormat.class);
-        parse.setReducerClass(ReprUrlFixer.class);
-        parse.setOutputKeyClass(Text.class);
-        parse.setOutputValueClass(CrawlDatum.class);
-        parse.setOutputFormat(MapFileOutputFormat.class);
-
-        try {
-          JobClient.runJob(parse);
-          LOG.info("Installing new segment parse directry " + newSegParse);
-          FSUtils.replace(fs, segParse, newSegParse, true);
-          LOG.info("ReprUrlFixer: finished installing segment parse directory");
-        }
-        catch (IOException e) {
-          LOG.error(StringUtils.stringifyException(e));
-          throw e;
-        }
-      }
-    }
-  }
-
-  /**
-   * Runs The ReprUrlFixer.
-   */
-  public static void main(String[] args)
-    throws Exception {
-    int res = ToolRunner.run(NutchConfiguration.create(), new ReprUrlFixer(),
-      args);
-    System.exit(res);
-  }
-
-  /**
-   * Parse command line options and execute the main update logic.
-   */
-  public int run(String[] args)
-    throws Exception {
-
-    Options options = new Options();
-    Option helpOpts = OptionBuilder.withArgName("help").withDescription(
-      "show this help message").create("help");
-    Option crawlDbOpts = OptionBuilder.withArgName("crawldb").hasArg().withDescription(
-      "the crawldb to use").create("crawldb");
-    Option segOpts = OptionBuilder.withArgName("segment").hasArgs().withDescription(
-      "the segment(s) to use").create("segment");
-    options.addOption(helpOpts);
-    options.addOption(crawlDbOpts);
-    options.addOption(segOpts);
-
-    CommandLineParser parser = new GnuParser();
-    try {
-
-      // parse out common line arguments and make sure either a crawldb or a
-      // segment are specified
-      CommandLine line = parser.parse(options, args);
-      if (line.hasOption("help")
-        || (!line.hasOption("crawldb") && !line.hasOption("segment"))) {
-        HelpFormatter formatter = new HelpFormatter();
-        formatter.printHelp("ReprUtilFixer", options);
-        return -1;
-      }
-
-      // create paths for all of the segments specified, multiple segments may
-      // be run at once
-      String crawlDb = line.getOptionValue("crawldb");
-      String[] segments = line.getOptionValues("segment");
-      Path[] segPaths = new Path[segments != null ? segments.length : 0];
-      if (segments != null) {
-        for (int i = 0; i < segments.length; i++) {
-          segPaths[i] = new Path(segments[i]);
-        }
-      }
-      update(new Path(crawlDb), segPaths);
-      return 0;
-    }
-    catch (Exception e) {
-      LOG.fatal("ReprUtilFixer: " + StringUtils.stringifyException(e));
-      return -1;
-    }
-  }
-}
Index: src/java/org/apache/nutch/tools/SearchLoadTester.java
===================================================================
--- src/java/org/apache/nutch/tools/SearchLoadTester.java	(revision 959954)
+++ src/java/org/apache/nutch/tools/SearchLoadTester.java	(working copy)
@@ -1,198 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.tools;
-
-import java.io.BufferedReader;
-import java.io.File;
-import java.io.FileReader;
-import java.io.IOException;
-import java.util.concurrent.ExecutorService;
-import java.util.concurrent.Executors;
-import java.util.concurrent.TimeUnit;
-import java.util.concurrent.atomic.AtomicInteger;
-import java.util.concurrent.atomic.AtomicLong;
-
-import org.apache.commons.cli.CommandLine;
-import org.apache.commons.cli.CommandLineParser;
-import org.apache.commons.cli.GnuParser;
-import org.apache.commons.cli.HelpFormatter;
-import org.apache.commons.cli.Option;
-import org.apache.commons.cli.OptionBuilder;
-import org.apache.commons.cli.Options;
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.util.StringUtils;
-import org.apache.nutch.searcher.Hits;
-import org.apache.nutch.searcher.NutchBean;
-import org.apache.nutch.searcher.Query;
-import org.apache.nutch.util.NutchConfiguration;
-
-/**
- * <p>A simple tool to perform load testing on configured search servers.  A 
- * queries file can be specified with a list of different queries to run against
- * the search servers.  The number of threads used to perform concurrent
- * searches is also configurable.</p>
- * 
- * <p>This tool will output approximate times for running all queries in the 
- * queries file.  If configured it will also print out individual queries times
- * to the log.</p>
- */
-public class SearchLoadTester {
-
-  public static final Log LOG = LogFactory.getLog(SearchLoadTester.class);
-
-  private String queriesFile = null;
-  private int numThreads = 100;
-  private boolean showTimes = false;
-  private ExecutorService pool = null;
-  private static AtomicInteger numTotal = new AtomicInteger(0);
-  private static AtomicInteger numErrored = new AtomicInteger(0);
-  private static AtomicInteger numResolved = new AtomicInteger(0);
-  private static AtomicLong totalTime = new AtomicLong(0L);
-
-  private static Configuration conf = null;
-  private static NutchBean bean = null;
-
-  private static class SearchThread
-    extends Thread {
-
-    private String query = null;
-    private boolean showTimes = false;
-
-    public SearchThread(String query, boolean showTimes) {
-      this.query = query;
-      this.showTimes = showTimes;
-    }
-
-    public void run() {
-
-      numTotal.incrementAndGet();
-
-      try {
-        Query runner = Query.parse(query, conf);
-        long start = System.currentTimeMillis();
-        Hits hits = bean.search(runner, 10);
-        long end = System.currentTimeMillis();
-        numResolved.incrementAndGet();
-        long total = (end - start);
-        if (showTimes) {
-          System.out.println("Query for " + query + " numhits "
-            + hits.getTotal() + " in " + total + "ms");
-        }
-        totalTime.addAndGet(total);
-      }
-      catch (Exception uhe) {
-        LOG.info("Error executing search for " + query);
-        numErrored.incrementAndGet();
-      }
-    }
-  }
-
-  public void testSearch() {
-
-    try {
-
-      // create a thread pool with a fixed number of threads
-      pool = Executors.newFixedThreadPool(numThreads);
-
-      // read in the queries file and loop through each line, one query per line
-      BufferedReader buffRead = new BufferedReader(new FileReader(new File(
-        queriesFile)));
-      String queryStr = null;
-      while ((queryStr = buffRead.readLine()) != null) {
-        pool.execute(new SearchThread(queryStr, showTimes));
-      }
-
-      // close the file and wait for up to 60 seconds before shutting down
-      // the thread pool to give urls time to finish resolving
-      buffRead.close();
-      pool.shutdown();
-      pool.awaitTermination(60, TimeUnit.SECONDS);
-
-      LOG.info("Total Queries: " + numTotal.get() + ", Errored: "
-        + numErrored.get() + ", Total Time: " + totalTime.get()
-        + ", Average Time: " + totalTime.get() / numTotal.get()
-        + " with " + numThreads + " threads");
-    }
-    catch (Exception e) {
-      e.printStackTrace();
-      // on error shutdown the thread pool immediately
-      pool.shutdownNow();
-      LOG.info(StringUtils.stringifyException(e));
-    }
-
-  }
-
-  public SearchLoadTester(String queriesFile)
-    throws IOException {
-    this(queriesFile, 100, false);
-  }
-
-  public SearchLoadTester(String queriesFile, int numThreads, boolean showTimes)
-    throws IOException {
-    this.queriesFile = queriesFile;
-    this.numThreads = numThreads;
-    this.showTimes = showTimes;
-    this.conf = NutchConfiguration.create();
-    this.bean = new NutchBean(conf);
-  }
-
-  public static void main(String[] args) {
-
-    Options options = new Options();
-    Option helpOpts = OptionBuilder.withArgName("help").withDescription(
-      "show this help message").create("help");
-    Option queriesOpts = OptionBuilder.withArgName("queries").hasArg().withDescription(
-      "the queries file to test").create("queries");
-    Option numThreadOpts = OptionBuilder.withArgName("numThreads").hasArgs().withDescription(
-      "the number of threads to use").create("numThreads");
-    Option showTimesOpts = OptionBuilder.withArgName("showTimes").withDescription(
-      "show individual query times").create("showTimes");
-    options.addOption(helpOpts);
-    options.addOption(queriesOpts);
-    options.addOption(numThreadOpts);
-    options.addOption(showTimesOpts);
-
-    CommandLineParser parser = new GnuParser();
-    try {
-
-      // parse out common line arguments
-      CommandLine line = parser.parse(options, args);
-      if (line.hasOption("help") || !line.hasOption("queries")) {
-        HelpFormatter formatter = new HelpFormatter();
-        formatter.printHelp("SearchTester", options);
-        return;
-      }
-
-      // get the urls and the number of threads and start the resolver
-      boolean showTimes = line.hasOption("showTimes");
-      String queries = line.getOptionValue("queries");
-      int numThreads = 10;
-      String numThreadsStr = line.getOptionValue("numThreads");
-      if (numThreadsStr != null) {
-        numThreads = Integer.parseInt(numThreadsStr);
-      }
-      SearchLoadTester tester = new SearchLoadTester(queries, numThreads, showTimes);
-      tester.testSearch();
-    }
-    catch (Exception e) {
-      LOG.fatal("SearchTester: " + StringUtils.stringifyException(e));
-    }
-  }
-
-}
Index: src/java/org/apache/nutch/html/Entities.java
===================================================================
--- src/java/org/apache/nutch/html/Entities.java	(revision 959954)
+++ src/java/org/apache/nutch/html/Entities.java	(working copy)
@@ -1,329 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.html;
-
-import java.util.*;
-
-public class Entities {
-  static final Hashtable<String, String> decoder =
-    new Hashtable<String, String>(300);
-  static final String[]  encoder = new String[0x100];
-
-  static final String decode(String entity) {
-    if (entity.charAt(entity.length()-1) == ';')  // remove trailing semicolon
-      entity = entity.substring(0, entity.length()-1);
-    if (entity.charAt(1) == '#') {
-      int start = 2;
-      int radix = 10;
-      if (entity.charAt(2) == 'X' || entity.charAt(2) == 'x') {
-	start++;
-	radix = 16;
-      }
-      Character c =
-	new Character((char)Integer.parseInt(entity.substring(start), radix));
-      return c.toString();
-    } else {
-      String s = decoder.get(entity);
-      if (s != null)
-	return s;
-      else return "";
-    }
-  }
-
-  static final public String encode(String s) {
-    int length = s.length();
-    StringBuffer buffer = new StringBuffer(length * 2);
-    for (int i = 0; i < length; i++) {
-      char c = s.charAt(i);
-      int j = (int)c;
-      if (j < 0x100 && encoder[j] != null) {
-	buffer.append(encoder[j]);		  // have a named encoding
-	buffer.append(';');
-      } else if (j < 0x80) {
-	buffer.append(c);			  // use ASCII value
-      } else {
-	buffer.append("&#");			  // use numeric encoding
-	buffer.append((int)c);
-	buffer.append(';');
-      }
-    }
-    return buffer.toString();
-  }
-
-  static final void add(String entity, int value) {
-    decoder.put(entity, (new Character((char)value)).toString());
-    if (value < 0x100)
-      encoder[value] = entity;
-  }
-
-  static {
-    add("&nbsp",   160);
-    add("&iexcl",  161);
-    add("&cent",   162);
-    add("&pound",  163);
-    add("&curren", 164);
-    add("&yen",    165);
-    add("&brvbar", 166);
-    add("&sect",   167);
-    add("&uml",    168);
-    add("&copy",   169);
-    add("&ordf",   170);
-    add("&laquo",  171);
-    add("&not",    172);
-    add("&shy",    173);
-    add("&reg",    174);
-    add("&macr",   175);
-    add("&deg",    176);
-    add("&plusmn", 177);
-    add("&sup2",   178);
-    add("&sup3",   179);
-    add("&acute",  180);
-    add("&micro",  181);
-    add("&para",   182);
-    add("&middot", 183);
-    add("&cedil",  184);
-    add("&sup1",   185);
-    add("&ordm",   186);
-    add("&raquo",  187);
-    add("&frac14", 188);
-    add("&frac12", 189);
-    add("&frac34", 190);
-    add("&iquest", 191);
-    add("&Agrave", 192);
-    add("&Aacute", 193);
-    add("&Acirc",  194);
-    add("&Atilde", 195);
-    add("&Auml",   196);
-    add("&Aring",  197);
-    add("&AElig",  198);
-    add("&Ccedil", 199);
-    add("&Egrave", 200);
-    add("&Eacute", 201);
-    add("&Ecirc",  202);
-    add("&Euml",   203);
-    add("&Igrave", 204);
-    add("&Iacute", 205);
-    add("&Icirc",  206);
-    add("&Iuml",   207);
-    add("&ETH",    208);
-    add("&Ntilde", 209);
-    add("&Ograve", 210);
-    add("&Oacute", 211);
-    add("&Ocirc",  212);
-    add("&Otilde", 213);
-    add("&Ouml",   214);
-    add("&times",  215);
-    add("&Oslash", 216);
-    add("&Ugrave", 217);
-    add("&Uacute", 218);
-    add("&Ucirc",  219);
-    add("&Uuml",   220);
-    add("&Yacute", 221);
-    add("&THORN",  222);
-    add("&szlig",  223);
-    add("&agrave", 224);
-    add("&aacute", 225);
-    add("&acirc",  226);
-    add("&atilde", 227);
-    add("&auml",   228);
-    add("&aring",  229);
-    add("&aelig",  230);
-    add("&ccedil", 231);
-    add("&egrave", 232);
-    add("&eacute", 233);
-    add("&ecirc",  234);
-    add("&euml",   235);
-    add("&igrave", 236);
-    add("&iacute", 237);
-    add("&icirc",  238);
-    add("&iuml",   239);
-    add("&eth",    240);
-    add("&ntilde", 241);
-    add("&ograve", 242);
-    add("&oacute", 243);
-    add("&ocirc",  244);
-    add("&otilde", 245);
-    add("&ouml",   246);
-    add("&divide", 247);
-    add("&oslash", 248);
-    add("&ugrave", 249);
-    add("&uacute", 250);
-    add("&ucirc",  251);
-    add("&uuml",   252);
-    add("&yacute", 253);
-    add("&thorn",  254);
-    add("&yuml",   255);
-    add("&fnof",   402);
-    add("&Alpha",  913);
-    add("&Beta",   914);
-    add("&Gamma",  915);
-    add("&Delta",  916);
-    add("&Epsilon",917);
-    add("&Zeta",   918);
-    add("&Eta",    919);
-    add("&Theta",  920);
-    add("&Iota",   921);
-    add("&Kappa",  922);
-    add("&Lambda", 923);
-    add("&Mu",     924);
-    add("&Nu",     925);
-    add("&Xi",     926);
-    add("&Omicron",927);
-    add("&Pi",     928);
-    add("&Rho",    929);
-    add("&Sigma",  931);
-    add("&Tau",    932);
-    add("&Upsilon",933);
-    add("&Phi",    934);
-    add("&Chi",    935);
-    add("&Psi",    936);
-    add("&Omega",  937);
-    add("&alpha",  945);
-    add("&beta",   946);
-    add("&gamma",  947);
-    add("&delta",  948);
-    add("&epsilon",949);
-    add("&zeta",   950);
-    add("&eta",    951);
-    add("&theta",  952);
-    add("&iota",   953);
-    add("&kappa",  954);
-    add("&lambda", 955);
-    add("&mu",     956);
-    add("&nu",     957);
-    add("&xi",     958);
-    add("&omicron",959);
-    add("&pi",     960);
-    add("&rho",    961);
-    add("&sigmaf", 962);
-    add("&sigma",  963);
-    add("&tau",    964);
-    add("&upsilon",965);
-    add("&phi",    966);
-    add("&chi",    967);
-    add("&psi",    968);
-    add("&omega",  969);
-    add("&thetasym",977);
-    add("&upsih",  978);
-    add("&piv",    982);
-    add("&bull",   8226);
-    add("&hellip", 8230);
-    add("&prime",  8242);
-    add("&Prime",  8243);
-    add("&oline",  8254);
-    add("&frasl",  8260);
-    add("&weierp", 8472);
-    add("&image",  8465);
-    add("&real",   8476);
-    add("&trade",  8482);
-    add("&alefsym",8501);
-    add("&larr",   8592);
-    add("&uarr",   8593);
-    add("&rarr",   8594);
-    add("&darr",   8595);
-    add("&harr",   8596);
-    add("&crarr",  8629);
-    add("&lArr",   8656);
-    add("&uArr",   8657);
-    add("&rArr",   8658);
-    add("&dArr",   8659);
-    add("&hArr",   8660);
-    add("&forall", 8704);
-    add("&part",   8706);
-    add("&exist",  8707);
-    add("&empty",  8709);
-    add("&nabla",  8711);
-    add("&isin",   8712);
-    add("&notin",  8713);
-    add("&ni",     8715);
-    add("&prod",   8719);
-    add("&sum",    8721);
-    add("&minus",  8722);
-    add("&lowast", 8727);
-    add("&radic",  8730);
-    add("&prop",   8733);
-    add("&infin",  8734);
-    add("&ang",    8736);
-    add("&and",    8743);
-    add("&or",     8744);
-    add("&cap",    8745);
-    add("&cup",    8746);
-    add("&int",    8747);
-    add("&there4", 8756);
-    add("&sim",    8764);
-    add("&cong",   8773);
-    add("&asymp",  8776);
-    add("&ne",     8800);
-    add("&equiv",  8801);
-    add("&le",     8804);
-    add("&ge",     8805);
-    add("&sub",    8834);
-    add("&sup",    8835);
-    add("&nsub",   8836);
-    add("&sube",   8838);
-    add("&supe",   8839);
-    add("&oplus",  8853);
-    add("&otimes", 8855);
-    add("&perp",   8869);
-    add("&sdot",   8901);
-    add("&lceil",  8968);
-    add("&rceil",  8969);
-    add("&lfloor", 8970);
-    add("&rfloor", 8971);
-    add("&lang",   9001);
-    add("&rang",   9002);
-    add("&loz",    9674);
-    add("&spades", 9824);
-    add("&clubs",  9827);
-    add("&hearts", 9829);
-    add("&diams",  9830);
-    add("&quot",   34);
-    add("&amp",    38);
-    add("&lt",     60);
-    add("&gt",     62);
-    add("&OElig",  338);
-    add("&oelig",  339);
-    add("&Scaron", 352);
-    add("&scaron", 353);
-    add("&Yuml",   376);
-    add("&circ",   710);
-    add("&tilde",  732);
-    add("&ensp",   8194);
-    add("&emsp",   8195);
-    add("&thinsp", 8201);
-    add("&zwnj",   8204);
-    add("&zwj",    8205);
-    add("&lrm",    8206);
-    add("&rlm",    8207);
-    add("&ndash",  8211);
-    add("&mdash",  8212);
-    add("&lsquo",  8216);
-    add("&rsquo",  8217);
-    add("&sbquo",  8218);
-    add("&ldquo",  8220);
-    add("&rdquo",  8221);
-    add("&bdquo",  8222);
-    add("&dagger", 8224);
-    add("&Dagger", 8225);
-    add("&permil", 8240);
-    add("&lsaquo", 8249);
-    add("&rsaquo", 8250);
-    add("&euro",   8364);
-
-  }
-}
Index: src/java/org/apache/nutch/crawl/Crawl.java
===================================================================
--- src/java/org/apache/nutch/crawl/Crawl.java	(revision 959954)
+++ src/java/org/apache/nutch/crawl/Crawl.java	(working copy)
@@ -29,9 +29,7 @@
 import org.apache.hadoop.conf.*;
 import org.apache.hadoop.mapred.*;
 import org.apache.nutch.parse.ParseSegment;
-import org.apache.nutch.indexer.DeleteDuplicates;
-import org.apache.nutch.indexer.IndexMerger;
-import org.apache.nutch.indexer.Indexer;
+import org.apache.nutch.indexer.solr.SolrDeleteDuplicates;
 import org.apache.nutch.indexer.solr.SolrIndexer;
 import org.apache.nutch.util.HadoopFSUtil;
 import org.apache.nutch.util.NutchConfiguration;
@@ -52,8 +50,7 @@
   public static void main(String args[]) throws Exception {
     if (args.length < 1) {
       System.out.println
-      ("Usage: Crawl <urlDir> [-dir d] [-threads n] [-depth i] [-topN N]" +
-        " [-solr solrURL]");
+      ("Usage: Crawl <urlDir> -solr <solrURL> [-dir d] [-threads n] [-depth i] [-topN N]");
       return;
     }
 
@@ -65,7 +62,6 @@
     int threads = job.getInt("fetcher.threads.fetch", 10);
     int depth = 5;
     long topN = Long.MAX_VALUE;
-    String indexerName = "lucene";
     String solrUrl = null;
     
     for (int i = 0; i < args.length; i++) {
@@ -82,15 +78,17 @@
           topN = Integer.parseInt(args[i+1]);
           i++;
       } else if ("-solr".equals(args[i])) {
-        indexerName = "solr";
         solrUrl = StringUtils.lowerCase(args[i + 1]);
         i++;
       } else if (args[i] != null) {
         rootUrlDir = new Path(args[i]);
       }
     }
+    
+    if (solrUrl == null) {
+      LOG.warn("solrUrl is not set, indexing will be skipped...");
+    }
 
-    boolean isSolrIndex = StringUtils.equalsIgnoreCase(indexerName, "solr");
     FileSystem fs = FileSystem.get(job);
 
     if (LOG.isInfoEnabled()) {
@@ -98,10 +96,7 @@
       LOG.info("rootUrlDir = " + rootUrlDir);
       LOG.info("threads = " + threads);
       LOG.info("depth = " + depth);      
-      LOG.info("indexer=" + indexerName);
-      if (isSolrIndex) {
-        LOG.info("solrUrl=" + solrUrl);
-      }
+      LOG.info("solrUrl=" + solrUrl);
       if (topN != Long.MAX_VALUE)
         LOG.info("topN = " + topN);
     }
@@ -139,41 +134,16 @@
     if (i > 0) {
       linkDbTool.invert(linkDb, segments, true, true, false); // invert links
 
-      // index, dedup & merge
-      FileStatus[] fstats = fs.listStatus(segments, HadoopFSUtil.getPassDirectoriesFilter(fs));
-      if (isSolrIndex) {
+      if (solrUrl != null) {
+        // index, dedup & merge
+        FileStatus[] fstats = fs.listStatus(segments, HadoopFSUtil.getPassDirectoriesFilter(fs));
         SolrIndexer indexer = new SolrIndexer(conf);
         indexer.indexSolr(solrUrl, crawlDb, linkDb, 
-            Arrays.asList(HadoopFSUtil.getPaths(fstats)));
+          Arrays.asList(HadoopFSUtil.getPaths(fstats)));
+        SolrDeleteDuplicates dedup = new SolrDeleteDuplicates();
+        dedup.setConf(conf);
+        dedup.dedup(solrUrl);
       }
-      else {
-        
-        DeleteDuplicates dedup = new DeleteDuplicates(conf);        
-        if(indexes != null) {
-          // Delete old indexes
-          if (fs.exists(indexes)) {
-            LOG.info("Deleting old indexes: " + indexes);
-            fs.delete(indexes, true);
-          }
-
-          // Delete old index
-          if (fs.exists(index)) {
-            LOG.info("Deleting old merged index: " + index);
-            fs.delete(index, true);
-          }
-        }
-        
-        Indexer indexer = new Indexer(conf);
-        indexer.index(indexes, crawlDb, linkDb, 
-            Arrays.asList(HadoopFSUtil.getPaths(fstats)));
-        
-        IndexMerger merger = new IndexMerger(conf);
-        if(indexes != null) {
-          dedup.dedup(new Path[] { indexes });
-          fstats = fs.listStatus(indexes, HadoopFSUtil.getPassDirectoriesFilter(fs));
-          merger.merge(HadoopFSUtil.getPaths(fstats), index, tmpDir);
-        }
-      }    
       
     } else {
       LOG.warn("No URLs to fetch - check your seed list and URL filters.");
Index: src/java/org/apache/nutch/crawl/NutchWritable.java
===================================================================
--- src/java/org/apache/nutch/crawl/NutchWritable.java	(revision 959954)
+++ src/java/org/apache/nutch/crawl/NutchWritable.java	(working copy)
@@ -45,9 +45,6 @@
       org.apache.nutch.parse.ParseStatus.class,
       org.apache.nutch.protocol.Content.class,
       org.apache.nutch.protocol.ProtocolStatus.class,
-      org.apache.nutch.searcher.Hit.class,
-      org.apache.nutch.searcher.HitDetails.class,
-      org.apache.nutch.searcher.Hits.class
     };
   }
 
Index: src/java/org/apache/nutch/ontology/OntologyFactory.java
===================================================================
--- src/java/org/apache/nutch/ontology/OntologyFactory.java	(revision 959954)
+++ src/java/org/apache/nutch/ontology/OntologyFactory.java	(working copy)
@@ -1,102 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.ontology;
-
-// Commons Logging imports
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-
-import org.apache.nutch.plugin.*;
-import org.apache.hadoop.conf.Configuration;
-
-/**
- * A factory for retrieving {@link Ontology} extensions.
- *
- * @author Michael Pan
- * @version $Id$
- */
-public class OntologyFactory {
-  public final static Log LOG = LogFactory.getLog(OntologyFactory.class);
-
-  private ExtensionPoint extensionPoint;
-  private Configuration conf;
-  
-  public OntologyFactory(Configuration conf) {
-    this.conf = conf;
-    this.extensionPoint = PluginRepository.get(conf).getExtensionPoint(Ontology.X_POINT_ID);  
-  }
-
-  /**
-  * @return Returns the online ontology extension specified
-  * in nutch configuration's key
-  * <code>extension.ontology.extension-name</code>.
-  * If the name is  empty (no preference),
-  * the first available ontology extension is returned.
-  */
-  public Ontology getOntology() throws PluginRuntimeException {
-     
-    if (this.extensionPoint == null) {
-      // not even an extension point defined.
-      return null;
-    }
-
-    String extensionName = this.conf.get("extension.ontology.extension-name");
-    if (extensionName != null) {
-      Extension extension = findExtension(extensionName);
-      if (extension != null) {
-        if (LOG.isInfoEnabled()) {
-          LOG.info("Using ontology extension: " + extensionName);
-        }
-        return (Ontology) extension.getExtensionInstance();
-      }
-      if (LOG.isWarnEnabled()) {
-        LOG.warn("Ontology extension not found: '" + extensionName +
-                 "', trying the default");
-      }
-      // not found, fallback to the default, if available.
-    }
-
-    Extension[] extensions = this.extensionPoint.getExtensions();
-    if (extensions.length > 0) {
-      if (LOG.isInfoEnabled()) {
-        LOG.info("Using the first ontology extension found: " +
-                 extensions[0].getId());
-      }
-      return (Ontology) extensions[0].getExtensionInstance();
-    } else {
-      return null;
-    }
-
-  }
-
-  private Extension findExtension(String name)
-    throws PluginRuntimeException {
-
-    Extension[] extensions = this.extensionPoint.getExtensions();
-
-    for (int i = 0; i < extensions.length; i++) {
-      Extension extension = extensions[i];
-
-      if (name.equals(extension.getId()))
-        return extension;
-    }
-
-    return null;
-  }
-
-} 
Index: src/java/org/apache/nutch/ontology/Ontology.java
===================================================================
--- src/java/org/apache/nutch/ontology/Ontology.java	(revision 959954)
+++ src/java/org/apache/nutch/ontology/Ontology.java	(working copy)
@@ -1,39 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.ontology;
-
-// JDK imports
-import java.util.Iterator;
-
-// Nutch imports
-import org.apache.nutch.plugin.Pluggable;
-
-
-public interface Ontology extends Pluggable {
-  /** The name of the extension point. */
-  public final static String X_POINT_ID = Ontology.class.getName();
-
-  public void load(String[] urls);
-
-  // not yet implemented
-  //public void merge(Ontology o);
-
-  public Iterator subclasses(String entitySearchTerm);
-
-  public Iterator synonyms(String queryKeyPhrase);
-}
Index: src/engines/Inktomi.src
===================================================================
--- src/engines/Inktomi.src	(revision 959954)
+++ src/engines/Inktomi.src	(working copy)
@@ -1,25 +0,0 @@
-# Inktomi/Hotbot plugin
-
-<search 
-   name="InktomiHotbot"
-   description="Inktomi Search via Hotbot"
-   method="GET"
-   action="http://www.hotbot.com/default.asp"
-   update="http://www.hotbot.com/mozilla/hotbot.src"
-   updateCheckDays=1   
->
-
-<input name="query" user>
-<input name="prov" value="Inktomi">
-<input name="tab" value="web">
-<inputnext name="start" factor="10">
-<inputprev name="start" factor="10">
-
-<interpret 
-    resultListStart="<!-- RESULTS -->" 
-    resultListEnd="<!-- /RESULTS -->" 
-
-    resultItemStart="<!-- IS -->" 
-    resultItemEnd="<!-- /IS -->"
->
-</search>
Index: src/engines/Google.src
===================================================================
--- src/engines/Google.src	(revision 959954)
+++ src/engines/Google.src	(working copy)
@@ -1,24 +0,0 @@
-# Google plugin
-
-<search 
-   name="Google"
-   description="Google Search"
-   method="GET"
-   action="http://www.google.com/search"
-   update="http://www.google.com/mozilla/google.src"
-   updateCheckDays=1   
->
-
-<input name="q" user>
-<input name="sourceid" value="mozilla-search">
-<inputnext name="start" factor="10">
-<inputprev name="start" factor="10">
-
-<interpret 
-    resultListStart="<body" 
-    resultListEnd="</body>" 
-
-    resultItemStart="<p class=g>" 
-    resultItemEnd="<br>"
->
-</search>
Index: src/engines/FAST.src
===================================================================
--- src/engines/FAST.src	(revision 959954)
+++ src/engines/FAST.src	(working copy)
@@ -1,25 +0,0 @@
-# FAST/Hotbot plugin
-
-<search 
-   name="FASTHotbot"
-   description="FAST Search via Hotbot"
-   method="GET"
-   action="http://www.hotbot.com/default.asp"
-   update="http://www.hotbot.com/mozilla/hotbot.src"
-   updateCheckDays=1   
->
-
-<input name="query" user>
-<input name="prov" value="FAST">
-<input name="tab" value="web">
-<inputnext name="start" factor="10">
-<inputprev name="start" factor="10">
-
-<interpret 
-    resultListStart="<!-- RESULTS -->" 
-    resultListEnd="<!-- /RESULTS -->" 
-
-    resultItemStart="<!-- IS -->" 
-    resultItemEnd="<!-- /IS -->"
->
-</search>
Index: src/engines/Altavista.src
===================================================================
--- src/engines/Altavista.src	(revision 959954)
+++ src/engines/Altavista.src	(working copy)
@@ -1,24 +0,0 @@
-# Altavista plugin
-
-<search 
-   name="Altavista"
-   description="Altavista web search"
-   method="GET"
-   action="http://www.altavista.com/web/results"
-   updateCheckDays=1   
->
-
-<input name="q" user>
-<input name="sourceid" value="mozilla-search">
-<inputnext name="start" factor="10">
-<inputprev name="start" factor="10">
-
-<interpret 
-    resultListStart="AltaVista found" 
-    resultListEnd="</body>" 
-
-    resultItemStart="<td valign=top  width=99" 
-    resultItemEnd="<span class=rgy>"
-    extractarg="r"
->
-</search>
Index: src/xmlcatalog/xhtml-special.ent
===================================================================
--- src/xmlcatalog/xhtml-special.ent	(revision 959954)
+++ src/xmlcatalog/xhtml-special.ent	(working copy)
@@ -1,80 +0,0 @@
-<!-- Special characters for XHTML -->
-
-<!-- Character entity set. Typical invocation:
-     <!ENTITY % HTMLspecial PUBLIC
-        "-//W3C//ENTITIES Special for XHTML//EN"
-        "http://www.w3.org/TR/xhtml1/DTD/xhtml-special.ent">
-     %HTMLspecial;
--->
-
-<!-- Portions (C) International Organization for Standardization 1986:
-     Permission to copy in any form is granted for use with
-     conforming SGML systems and applications as defined in
-     ISO 8879, provided this notice is included in all copies.
--->
-
-<!-- Relevant ISO entity set is given unless names are newly introduced.
-     New names (i.e., not in ISO 8879 list) do not clash with any
-     existing ISO 8879 entity names. ISO 10646 character numbers
-     are given for each character, in hex. values are decimal
-     conversions of the ISO 10646 values and refer to the document
-     character set. Names are Unicode names. 
--->
-
-<!-- C0 Controls and Basic Latin -->
-<!ENTITY quot    "&#34;"> <!--  quotation mark, U+0022 ISOnum -->
-<!ENTITY amp     "&#38;#38;"> <!--  ampersand, U+0026 ISOnum -->
-<!ENTITY lt      "&#38;#60;"> <!--  less-than sign, U+003C ISOnum -->
-<!ENTITY gt      "&#62;"> <!--  greater-than sign, U+003E ISOnum -->
-<!ENTITY apos	 "&#39;"> <!--  apostrophe = APL quote, U+0027 ISOnum -->
-
-<!-- Latin Extended-A -->
-<!ENTITY OElig   "&#338;"> <!--  latin capital ligature OE,
-                                    U+0152 ISOlat2 -->
-<!ENTITY oelig   "&#339;"> <!--  latin small ligature oe, U+0153 ISOlat2 -->
-<!-- ligature is a misnomer, this is a separate character in some languages -->
-<!ENTITY Scaron  "&#352;"> <!--  latin capital letter S with caron,
-                                    U+0160 ISOlat2 -->
-<!ENTITY scaron  "&#353;"> <!--  latin small letter s with caron,
-                                    U+0161 ISOlat2 -->
-<!ENTITY Yuml    "&#376;"> <!--  latin capital letter Y with diaeresis,
-                                    U+0178 ISOlat2 -->
-
-<!-- Spacing Modifier Letters -->
-<!ENTITY circ    "&#710;"> <!--  modifier letter circumflex accent,
-                                    U+02C6 ISOpub -->
-<!ENTITY tilde   "&#732;"> <!--  small tilde, U+02DC ISOdia -->
-
-<!-- General Punctuation -->
-<!ENTITY ensp    "&#8194;"> <!-- en space, U+2002 ISOpub -->
-<!ENTITY emsp    "&#8195;"> <!-- em space, U+2003 ISOpub -->
-<!ENTITY thinsp  "&#8201;"> <!-- thin space, U+2009 ISOpub -->
-<!ENTITY zwnj    "&#8204;"> <!-- zero width non-joiner,
-                                    U+200C NEW RFC 2070 -->
-<!ENTITY zwj     "&#8205;"> <!-- zero width joiner, U+200D NEW RFC 2070 -->
-<!ENTITY lrm     "&#8206;"> <!-- left-to-right mark, U+200E NEW RFC 2070 -->
-<!ENTITY rlm     "&#8207;"> <!-- right-to-left mark, U+200F NEW RFC 2070 -->
-<!ENTITY ndash   "&#8211;"> <!-- en dash, U+2013 ISOpub -->
-<!ENTITY mdash   "&#8212;"> <!-- em dash, U+2014 ISOpub -->
-<!ENTITY lsquo   "&#8216;"> <!-- left single quotation mark,
-                                    U+2018 ISOnum -->
-<!ENTITY rsquo   "&#8217;"> <!-- right single quotation mark,
-                                    U+2019 ISOnum -->
-<!ENTITY sbquo   "&#8218;"> <!-- single low-9 quotation mark, U+201A NEW -->
-<!ENTITY ldquo   "&#8220;"> <!-- left double quotation mark,
-                                    U+201C ISOnum -->
-<!ENTITY rdquo   "&#8221;"> <!-- right double quotation mark,
-                                    U+201D ISOnum -->
-<!ENTITY bdquo   "&#8222;"> <!-- double low-9 quotation mark, U+201E NEW -->
-<!ENTITY dagger  "&#8224;"> <!-- dagger, U+2020 ISOpub -->
-<!ENTITY Dagger  "&#8225;"> <!-- double dagger, U+2021 ISOpub -->
-<!ENTITY permil  "&#8240;"> <!-- per mille sign, U+2030 ISOtech -->
-<!ENTITY lsaquo  "&#8249;"> <!-- single left-pointing angle quotation mark,
-                                    U+2039 ISO proposed -->
-<!-- lsaquo is proposed but not yet ISO standardized -->
-<!ENTITY rsaquo  "&#8250;"> <!-- single right-pointing angle quotation mark,
-                                    U+203A ISO proposed -->
-<!-- rsaquo is proposed but not yet ISO standardized -->
-
-<!-- Currency Symbols -->
-<!ENTITY euro   "&#8364;"> <!--  euro sign, U+20AC NEW -->
Index: src/xmlcatalog/xhtml-lat1.ent
===================================================================
--- src/xmlcatalog/xhtml-lat1.ent	(revision 959954)
+++ src/xmlcatalog/xhtml-lat1.ent	(working copy)
@@ -1,196 +0,0 @@
-<!-- Portions (C) International Organization for Standardization 1986
-     Permission to copy in any form is granted for use with
-     conforming SGML systems and applications as defined in
-     ISO 8879, provided this notice is included in all copies.
--->
-<!-- Character entity set. Typical invocation:
-    <!ENTITY % HTMLlat1 PUBLIC
-       "-//W3C//ENTITIES Latin 1 for XHTML//EN"
-       "http://www.w3.org/TR/xhtml1/DTD/xhtml-lat1.ent">
-    %HTMLlat1;
--->
-
-<!ENTITY nbsp   "&#160;"> <!-- no-break space = non-breaking space,
-                                  U+00A0 ISOnum -->
-<!ENTITY iexcl  "&#161;"> <!-- inverted exclamation mark, U+00A1 ISOnum -->
-<!ENTITY cent   "&#162;"> <!-- cent sign, U+00A2 ISOnum -->
-<!ENTITY pound  "&#163;"> <!-- pound sign, U+00A3 ISOnum -->
-<!ENTITY curren "&#164;"> <!-- currency sign, U+00A4 ISOnum -->
-<!ENTITY yen    "&#165;"> <!-- yen sign = yuan sign, U+00A5 ISOnum -->
-<!ENTITY brvbar "&#166;"> <!-- broken bar = broken vertical bar,
-                                  U+00A6 ISOnum -->
-<!ENTITY sect   "&#167;"> <!-- section sign, U+00A7 ISOnum -->
-<!ENTITY uml    "&#168;"> <!-- diaeresis = spacing diaeresis,
-                                  U+00A8 ISOdia -->
-<!ENTITY copy   "&#169;"> <!-- copyright sign, U+00A9 ISOnum -->
-<!ENTITY ordf   "&#170;"> <!-- feminine ordinal indicator, U+00AA ISOnum -->
-<!ENTITY laquo  "&#171;"> <!-- left-pointing double angle quotation mark
-                                  = left pointing guillemet, U+00AB ISOnum -->
-<!ENTITY not    "&#172;"> <!-- not sign = angled dash,
-                                  U+00AC ISOnum -->
-<!ENTITY shy    "&#173;"> <!-- soft hyphen = discretionary hyphen,
-                                  U+00AD ISOnum -->
-<!ENTITY reg    "&#174;"> <!-- registered sign = registered trade mark sign,
-                                  U+00AE ISOnum -->
-<!ENTITY macr   "&#175;"> <!-- macron = spacing macron = overline
-                                  = APL overbar, U+00AF ISOdia -->
-<!ENTITY deg    "&#176;"> <!-- degree sign, U+00B0 ISOnum -->
-<!ENTITY plusmn "&#177;"> <!-- plus-minus sign = plus-or-minus sign,
-                                  U+00B1 ISOnum -->
-<!ENTITY sup2   "&#178;"> <!-- superscript two = superscript digit two
-                                  = squared, U+00B2 ISOnum -->
-<!ENTITY sup3   "&#179;"> <!-- superscript three = superscript digit three
-                                  = cubed, U+00B3 ISOnum -->
-<!ENTITY acute  "&#180;"> <!-- acute accent = spacing acute,
-                                  U+00B4 ISOdia -->
-<!ENTITY micro  "&#181;"> <!-- micro sign, U+00B5 ISOnum -->
-<!ENTITY para   "&#182;"> <!-- pilcrow sign = paragraph sign,
-                                  U+00B6 ISOnum -->
-<!ENTITY middot "&#183;"> <!-- middle dot = Georgian comma
-                                  = Greek middle dot, U+00B7 ISOnum -->
-<!ENTITY cedil  "&#184;"> <!-- cedilla = spacing cedilla, U+00B8 ISOdia -->
-<!ENTITY sup1   "&#185;"> <!-- superscript one = superscript digit one,
-                                  U+00B9 ISOnum -->
-<!ENTITY ordm   "&#186;"> <!-- masculine ordinal indicator,
-                                  U+00BA ISOnum -->
-<!ENTITY raquo  "&#187;"> <!-- right-pointing double angle quotation mark
-                                  = right pointing guillemet, U+00BB ISOnum -->
-<!ENTITY frac14 "&#188;"> <!-- vulgar fraction one quarter
-                                  = fraction one quarter, U+00BC ISOnum -->
-<!ENTITY frac12 "&#189;"> <!-- vulgar fraction one half
-                                  = fraction one half, U+00BD ISOnum -->
-<!ENTITY frac34 "&#190;"> <!-- vulgar fraction three quarters
-                                  = fraction three quarters, U+00BE ISOnum -->
-<!ENTITY iquest "&#191;"> <!-- inverted question mark
-                                  = turned question mark, U+00BF ISOnum -->
-<!ENTITY Agrave "&#192;"> <!-- latin capital letter A with grave
-                                  = latin capital letter A grave,
-                                  U+00C0 ISOlat1 -->
-<!ENTITY Aacute "&#193;"> <!-- latin capital letter A with acute,
-                                  U+00C1 ISOlat1 -->
-<!ENTITY Acirc  "&#194;"> <!-- latin capital letter A with circumflex,
-                                  U+00C2 ISOlat1 -->
-<!ENTITY Atilde "&#195;"> <!-- latin capital letter A with tilde,
-                                  U+00C3 ISOlat1 -->
-<!ENTITY Auml   "&#196;"> <!-- latin capital letter A with diaeresis,
-                                  U+00C4 ISOlat1 -->
-<!ENTITY Aring  "&#197;"> <!-- latin capital letter A with ring above
-                                  = latin capital letter A ring,
-                                  U+00C5 ISOlat1 -->
-<!ENTITY AElig  "&#198;"> <!-- latin capital letter AE
-                                  = latin capital ligature AE,
-                                  U+00C6 ISOlat1 -->
-<!ENTITY Ccedil "&#199;"> <!-- latin capital letter C with cedilla,
-                                  U+00C7 ISOlat1 -->
-<!ENTITY Egrave "&#200;"> <!-- latin capital letter E with grave,
-                                  U+00C8 ISOlat1 -->
-<!ENTITY Eacute "&#201;"> <!-- latin capital letter E with acute,
-                                  U+00C9 ISOlat1 -->
-<!ENTITY Ecirc  "&#202;"> <!-- latin capital letter E with circumflex,
-                                  U+00CA ISOlat1 -->
-<!ENTITY Euml   "&#203;"> <!-- latin capital letter E with diaeresis,
-                                  U+00CB ISOlat1 -->
-<!ENTITY Igrave "&#204;"> <!-- latin capital letter I with grave,
-                                  U+00CC ISOlat1 -->
-<!ENTITY Iacute "&#205;"> <!-- latin capital letter I with acute,
-                                  U+00CD ISOlat1 -->
-<!ENTITY Icirc  "&#206;"> <!-- latin capital letter I with circumflex,
-                                  U+00CE ISOlat1 -->
-<!ENTITY Iuml   "&#207;"> <!-- latin capital letter I with diaeresis,
-                                  U+00CF ISOlat1 -->
-<!ENTITY ETH    "&#208;"> <!-- latin capital letter ETH, U+00D0 ISOlat1 -->
-<!ENTITY Ntilde "&#209;"> <!-- latin capital letter N with tilde,
-                                  U+00D1 ISOlat1 -->
-<!ENTITY Ograve "&#210;"> <!-- latin capital letter O with grave,
-                                  U+00D2 ISOlat1 -->
-<!ENTITY Oacute "&#211;"> <!-- latin capital letter O with acute,
-                                  U+00D3 ISOlat1 -->
-<!ENTITY Ocirc  "&#212;"> <!-- latin capital letter O with circumflex,
-                                  U+00D4 ISOlat1 -->
-<!ENTITY Otilde "&#213;"> <!-- latin capital letter O with tilde,
-                                  U+00D5 ISOlat1 -->
-<!ENTITY Ouml   "&#214;"> <!-- latin capital letter O with diaeresis,
-                                  U+00D6 ISOlat1 -->
-<!ENTITY times  "&#215;"> <!-- multiplication sign, U+00D7 ISOnum -->
-<!ENTITY Oslash "&#216;"> <!-- latin capital letter O with stroke
-                                  = latin capital letter O slash,
-                                  U+00D8 ISOlat1 -->
-<!ENTITY Ugrave "&#217;"> <!-- latin capital letter U with grave,
-                                  U+00D9 ISOlat1 -->
-<!ENTITY Uacute "&#218;"> <!-- latin capital letter U with acute,
-                                  U+00DA ISOlat1 -->
-<!ENTITY Ucirc  "&#219;"> <!-- latin capital letter U with circumflex,
-                                  U+00DB ISOlat1 -->
-<!ENTITY Uuml   "&#220;"> <!-- latin capital letter U with diaeresis,
-                                  U+00DC ISOlat1 -->
-<!ENTITY Yacute "&#221;"> <!-- latin capital letter Y with acute,
-                                  U+00DD ISOlat1 -->
-<!ENTITY THORN  "&#222;"> <!-- latin capital letter THORN,
-                                  U+00DE ISOlat1 -->
-<!ENTITY szlig  "&#223;"> <!-- latin small letter sharp s = ess-zed,
-                                  U+00DF ISOlat1 -->
-<!ENTITY agrave "&#224;"> <!-- latin small letter a with grave
-                                  = latin small letter a grave,
-                                  U+00E0 ISOlat1 -->
-<!ENTITY aacute "&#225;"> <!-- latin small letter a with acute,
-                                  U+00E1 ISOlat1 -->
-<!ENTITY acirc  "&#226;"> <!-- latin small letter a with circumflex,
-                                  U+00E2 ISOlat1 -->
-<!ENTITY atilde "&#227;"> <!-- latin small letter a with tilde,
-                                  U+00E3 ISOlat1 -->
-<!ENTITY auml   "&#228;"> <!-- latin small letter a with diaeresis,
-                                  U+00E4 ISOlat1 -->
-<!ENTITY aring  "&#229;"> <!-- latin small letter a with ring above
-                                  = latin small letter a ring,
-                                  U+00E5 ISOlat1 -->
-<!ENTITY aelig  "&#230;"> <!-- latin small letter ae
-                                  = latin small ligature ae, U+00E6 ISOlat1 -->
-<!ENTITY ccedil "&#231;"> <!-- latin small letter c with cedilla,
-                                  U+00E7 ISOlat1 -->
-<!ENTITY egrave "&#232;"> <!-- latin small letter e with grave,
-                                  U+00E8 ISOlat1 -->
-<!ENTITY eacute "&#233;"> <!-- latin small letter e with acute,
-                                  U+00E9 ISOlat1 -->
-<!ENTITY ecirc  "&#234;"> <!-- latin small letter e with circumflex,
-                                  U+00EA ISOlat1 -->
-<!ENTITY euml   "&#235;"> <!-- latin small letter e with diaeresis,
-                                  U+00EB ISOlat1 -->
-<!ENTITY igrave "&#236;"> <!-- latin small letter i with grave,
-                                  U+00EC ISOlat1 -->
-<!ENTITY iacute "&#237;"> <!-- latin small letter i with acute,
-                                  U+00ED ISOlat1 -->
-<!ENTITY icirc  "&#238;"> <!-- latin small letter i with circumflex,
-                                  U+00EE ISOlat1 -->
-<!ENTITY iuml   "&#239;"> <!-- latin small letter i with diaeresis,
-                                  U+00EF ISOlat1 -->
-<!ENTITY eth    "&#240;"> <!-- latin small letter eth, U+00F0 ISOlat1 -->
-<!ENTITY ntilde "&#241;"> <!-- latin small letter n with tilde,
-                                  U+00F1 ISOlat1 -->
-<!ENTITY ograve "&#242;"> <!-- latin small letter o with grave,
-                                  U+00F2 ISOlat1 -->
-<!ENTITY oacute "&#243;"> <!-- latin small letter o with acute,
-                                  U+00F3 ISOlat1 -->
-<!ENTITY ocirc  "&#244;"> <!-- latin small letter o with circumflex,
-                                  U+00F4 ISOlat1 -->
-<!ENTITY otilde "&#245;"> <!-- latin small letter o with tilde,
-                                  U+00F5 ISOlat1 -->
-<!ENTITY ouml   "&#246;"> <!-- latin small letter o with diaeresis,
-                                  U+00F6 ISOlat1 -->
-<!ENTITY divide "&#247;"> <!-- division sign, U+00F7 ISOnum -->
-<!ENTITY oslash "&#248;"> <!-- latin small letter o with stroke,
-                                  = latin small letter o slash,
-                                  U+00F8 ISOlat1 -->
-<!ENTITY ugrave "&#249;"> <!-- latin small letter u with grave,
-                                  U+00F9 ISOlat1 -->
-<!ENTITY uacute "&#250;"> <!-- latin small letter u with acute,
-                                  U+00FA ISOlat1 -->
-<!ENTITY ucirc  "&#251;"> <!-- latin small letter u with circumflex,
-                                  U+00FB ISOlat1 -->
-<!ENTITY uuml   "&#252;"> <!-- latin small letter u with diaeresis,
-                                  U+00FC ISOlat1 -->
-<!ENTITY yacute "&#253;"> <!-- latin small letter y with acute,
-                                  U+00FD ISOlat1 -->
-<!ENTITY thorn  "&#254;"> <!-- latin small letter thorn,
-                                  U+00FE ISOlat1 -->
-<!ENTITY yuml   "&#255;"> <!-- latin small letter y with diaeresis,
-                                  U+00FF ISOlat1 -->
Index: src/xmlcatalog/xhtml-symbol.ent
===================================================================
--- src/xmlcatalog/xhtml-symbol.ent	(revision 959954)
+++ src/xmlcatalog/xhtml-symbol.ent	(working copy)
@@ -1,237 +0,0 @@
-<!-- Mathematical, Greek and Symbolic characters for XHTML -->
-
-<!-- Character entity set. Typical invocation:
-     <!ENTITY % HTMLsymbol PUBLIC
-        "-//W3C//ENTITIES Symbols for XHTML//EN"
-        "http://www.w3.org/TR/xhtml1/DTD/xhtml-symbol.ent">
-     %HTMLsymbol;
--->
-
-<!-- Portions (C) International Organization for Standardization 1986:
-     Permission to copy in any form is granted for use with
-     conforming SGML systems and applications as defined in
-     ISO 8879, provided this notice is included in all copies.
--->
-
-<!-- Relevant ISO entity set is given unless names are newly introduced.
-     New names (i.e., not in ISO 8879 list) do not clash with any
-     existing ISO 8879 entity names. ISO 10646 character numbers
-     are given for each character, in hex. values are decimal
-     conversions of the ISO 10646 values and refer to the document
-     character set. Names are Unicode names. 
--->
-
-<!-- Latin Extended-B -->
-<!ENTITY fnof     "&#402;"> <!-- latin small letter f with hook = function
-                                    = florin, U+0192 ISOtech -->
-
-<!-- Greek -->
-<!ENTITY Alpha    "&#913;"> <!-- greek capital letter alpha, U+0391 -->
-<!ENTITY Beta     "&#914;"> <!-- greek capital letter beta, U+0392 -->
-<!ENTITY Gamma    "&#915;"> <!-- greek capital letter gamma,
-                                    U+0393 ISOgrk3 -->
-<!ENTITY Delta    "&#916;"> <!-- greek capital letter delta,
-                                    U+0394 ISOgrk3 -->
-<!ENTITY Epsilon  "&#917;"> <!-- greek capital letter epsilon, U+0395 -->
-<!ENTITY Zeta     "&#918;"> <!-- greek capital letter zeta, U+0396 -->
-<!ENTITY Eta      "&#919;"> <!-- greek capital letter eta, U+0397 -->
-<!ENTITY Theta    "&#920;"> <!-- greek capital letter theta,
-                                    U+0398 ISOgrk3 -->
-<!ENTITY Iota     "&#921;"> <!-- greek capital letter iota, U+0399 -->
-<!ENTITY Kappa    "&#922;"> <!-- greek capital letter kappa, U+039A -->
-<!ENTITY Lambda   "&#923;"> <!-- greek capital letter lamda,
-                                    U+039B ISOgrk3 -->
-<!ENTITY Mu       "&#924;"> <!-- greek capital letter mu, U+039C -->
-<!ENTITY Nu       "&#925;"> <!-- greek capital letter nu, U+039D -->
-<!ENTITY Xi       "&#926;"> <!-- greek capital letter xi, U+039E ISOgrk3 -->
-<!ENTITY Omicron  "&#927;"> <!-- greek capital letter omicron, U+039F -->
-<!ENTITY Pi       "&#928;"> <!-- greek capital letter pi, U+03A0 ISOgrk3 -->
-<!ENTITY Rho      "&#929;"> <!-- greek capital letter rho, U+03A1 -->
-<!-- there is no Sigmaf, and no U+03A2 character either -->
-<!ENTITY Sigma    "&#931;"> <!-- greek capital letter sigma,
-                                    U+03A3 ISOgrk3 -->
-<!ENTITY Tau      "&#932;"> <!-- greek capital letter tau, U+03A4 -->
-<!ENTITY Upsilon  "&#933;"> <!-- greek capital letter upsilon,
-                                    U+03A5 ISOgrk3 -->
-<!ENTITY Phi      "&#934;"> <!-- greek capital letter phi,
-                                    U+03A6 ISOgrk3 -->
-<!ENTITY Chi      "&#935;"> <!-- greek capital letter chi, U+03A7 -->
-<!ENTITY Psi      "&#936;"> <!-- greek capital letter psi,
-                                    U+03A8 ISOgrk3 -->
-<!ENTITY Omega    "&#937;"> <!-- greek capital letter omega,
-                                    U+03A9 ISOgrk3 -->
-
-<!ENTITY alpha    "&#945;"> <!-- greek small letter alpha,
-                                    U+03B1 ISOgrk3 -->
-<!ENTITY beta     "&#946;"> <!-- greek small letter beta, U+03B2 ISOgrk3 -->
-<!ENTITY gamma    "&#947;"> <!-- greek small letter gamma,
-                                    U+03B3 ISOgrk3 -->
-<!ENTITY delta    "&#948;"> <!-- greek small letter delta,
-                                    U+03B4 ISOgrk3 -->
-<!ENTITY epsilon  "&#949;"> <!-- greek small letter epsilon,
-                                    U+03B5 ISOgrk3 -->
-<!ENTITY zeta     "&#950;"> <!-- greek small letter zeta, U+03B6 ISOgrk3 -->
-<!ENTITY eta      "&#951;"> <!-- greek small letter eta, U+03B7 ISOgrk3 -->
-<!ENTITY theta    "&#952;"> <!-- greek small letter theta,
-                                    U+03B8 ISOgrk3 -->
-<!ENTITY iota     "&#953;"> <!-- greek small letter iota, U+03B9 ISOgrk3 -->
-<!ENTITY kappa    "&#954;"> <!-- greek small letter kappa,
-                                    U+03BA ISOgrk3 -->
-<!ENTITY lambda   "&#955;"> <!-- greek small letter lamda,
-                                    U+03BB ISOgrk3 -->
-<!ENTITY mu       "&#956;"> <!-- greek small letter mu, U+03BC ISOgrk3 -->
-<!ENTITY nu       "&#957;"> <!-- greek small letter nu, U+03BD ISOgrk3 -->
-<!ENTITY xi       "&#958;"> <!-- greek small letter xi, U+03BE ISOgrk3 -->
-<!ENTITY omicron  "&#959;"> <!-- greek small letter omicron, U+03BF NEW -->
-<!ENTITY pi       "&#960;"> <!-- greek small letter pi, U+03C0 ISOgrk3 -->
-<!ENTITY rho      "&#961;"> <!-- greek small letter rho, U+03C1 ISOgrk3 -->
-<!ENTITY sigmaf   "&#962;"> <!-- greek small letter final sigma,
-                                    U+03C2 ISOgrk3 -->
-<!ENTITY sigma    "&#963;"> <!-- greek small letter sigma,
-                                    U+03C3 ISOgrk3 -->
-<!ENTITY tau      "&#964;"> <!-- greek small letter tau, U+03C4 ISOgrk3 -->
-<!ENTITY upsilon  "&#965;"> <!-- greek small letter upsilon,
-                                    U+03C5 ISOgrk3 -->
-<!ENTITY phi      "&#966;"> <!-- greek small letter phi, U+03C6 ISOgrk3 -->
-<!ENTITY chi      "&#967;"> <!-- greek small letter chi, U+03C7 ISOgrk3 -->
-<!ENTITY psi      "&#968;"> <!-- greek small letter psi, U+03C8 ISOgrk3 -->
-<!ENTITY omega    "&#969;"> <!-- greek small letter omega,
-                                    U+03C9 ISOgrk3 -->
-<!ENTITY thetasym "&#977;"> <!-- greek theta symbol,
-                                    U+03D1 NEW -->
-<!ENTITY upsih    "&#978;"> <!-- greek upsilon with hook symbol,
-                                    U+03D2 NEW -->
-<!ENTITY piv      "&#982;"> <!-- greek pi symbol, U+03D6 ISOgrk3 -->
-
-<!-- General Punctuation -->
-<!ENTITY bull     "&#8226;"> <!-- bullet = black small circle,
-                                     U+2022 ISOpub  -->
-<!-- bullet is NOT the same as bullet operator, U+2219 -->
-<!ENTITY hellip   "&#8230;"> <!-- horizontal ellipsis = three dot leader,
-                                     U+2026 ISOpub  -->
-<!ENTITY prime    "&#8242;"> <!-- prime = minutes = feet, U+2032 ISOtech -->
-<!ENTITY Prime    "&#8243;"> <!-- double prime = seconds = inches,
-                                     U+2033 ISOtech -->
-<!ENTITY oline    "&#8254;"> <!-- overline = spacing overscore,
-                                     U+203E NEW -->
-<!ENTITY frasl    "&#8260;"> <!-- fraction slash, U+2044 NEW -->
-
-<!-- Letterlike Symbols -->
-<!ENTITY weierp   "&#8472;"> <!-- script capital P = power set
-                                     = Weierstrass p, U+2118 ISOamso -->
-<!ENTITY image    "&#8465;"> <!-- black-letter capital I = imaginary part,
-                                     U+2111 ISOamso -->
-<!ENTITY real     "&#8476;"> <!-- black-letter capital R = real part symbol,
-                                     U+211C ISOamso -->
-<!ENTITY trade    "&#8482;"> <!-- trade mark sign, U+2122 ISOnum -->
-<!ENTITY alefsym  "&#8501;"> <!-- alef symbol = first transfinite cardinal,
-                                     U+2135 NEW -->
-<!-- alef symbol is NOT the same as hebrew letter alef,
-     U+05D0 although the same glyph could be used to depict both characters -->
-
-<!-- Arrows -->
-<!ENTITY larr     "&#8592;"> <!-- leftwards arrow, U+2190 ISOnum -->
-<!ENTITY uarr     "&#8593;"> <!-- upwards arrow, U+2191 ISOnum-->
-<!ENTITY rarr     "&#8594;"> <!-- rightwards arrow, U+2192 ISOnum -->
-<!ENTITY darr     "&#8595;"> <!-- downwards arrow, U+2193 ISOnum -->
-<!ENTITY harr     "&#8596;"> <!-- left right arrow, U+2194 ISOamsa -->
-<!ENTITY crarr    "&#8629;"> <!-- downwards arrow with corner leftwards
-                                     = carriage return, U+21B5 NEW -->
-<!ENTITY lArr     "&#8656;"> <!-- leftwards double arrow, U+21D0 ISOtech -->
-<!-- Unicode does not say that lArr is the same as the 'is implied by' arrow
-    but also does not have any other character for that function. So lArr can
-    be used for 'is implied by' as ISOtech suggests -->
-<!ENTITY uArr     "&#8657;"> <!-- upwards double arrow, U+21D1 ISOamsa -->
-<!ENTITY rArr     "&#8658;"> <!-- rightwards double arrow,
-                                     U+21D2 ISOtech -->
-<!-- Unicode does not say this is the 'implies' character but does not have 
-     another character with this function so rArr can be used for 'implies'
-     as ISOtech suggests -->
-<!ENTITY dArr     "&#8659;"> <!-- downwards double arrow, U+21D3 ISOamsa -->
-<!ENTITY hArr     "&#8660;"> <!-- left right double arrow,
-                                     U+21D4 ISOamsa -->
-
-<!-- Mathematical Operators -->
-<!ENTITY forall   "&#8704;"> <!-- for all, U+2200 ISOtech -->
-<!ENTITY part     "&#8706;"> <!-- partial differential, U+2202 ISOtech  -->
-<!ENTITY exist    "&#8707;"> <!-- there exists, U+2203 ISOtech -->
-<!ENTITY empty    "&#8709;"> <!-- empty set = null set, U+2205 ISOamso -->
-<!ENTITY nabla    "&#8711;"> <!-- nabla = backward difference,
-                                     U+2207 ISOtech -->
-<!ENTITY isin     "&#8712;"> <!-- element of, U+2208 ISOtech -->
-<!ENTITY notin    "&#8713;"> <!-- not an element of, U+2209 ISOtech -->
-<!ENTITY ni       "&#8715;"> <!-- contains as member, U+220B ISOtech -->
-<!ENTITY prod     "&#8719;"> <!-- n-ary product = product sign,
-                                     U+220F ISOamsb -->
-<!-- prod is NOT the same character as U+03A0 'greek capital letter pi' though
-     the same glyph might be used for both -->
-<!ENTITY sum      "&#8721;"> <!-- n-ary summation, U+2211 ISOamsb -->
-<!-- sum is NOT the same character as U+03A3 'greek capital letter sigma'
-     though the same glyph might be used for both -->
-<!ENTITY minus    "&#8722;"> <!-- minus sign, U+2212 ISOtech -->
-<!ENTITY lowast   "&#8727;"> <!-- asterisk operator, U+2217 ISOtech -->
-<!ENTITY radic    "&#8730;"> <!-- square root = radical sign,
-                                     U+221A ISOtech -->
-<!ENTITY prop     "&#8733;"> <!-- proportional to, U+221D ISOtech -->
-<!ENTITY infin    "&#8734;"> <!-- infinity, U+221E ISOtech -->
-<!ENTITY ang      "&#8736;"> <!-- angle, U+2220 ISOamso -->
-<!ENTITY and      "&#8743;"> <!-- logical and = wedge, U+2227 ISOtech -->
-<!ENTITY or       "&#8744;"> <!-- logical or = vee, U+2228 ISOtech -->
-<!ENTITY cap      "&#8745;"> <!-- intersection = cap, U+2229 ISOtech -->
-<!ENTITY cup      "&#8746;"> <!-- union = cup, U+222A ISOtech -->
-<!ENTITY int      "&#8747;"> <!-- integral, U+222B ISOtech -->
-<!ENTITY there4   "&#8756;"> <!-- therefore, U+2234 ISOtech -->
-<!ENTITY sim      "&#8764;"> <!-- tilde operator = varies with = similar to,
-                                     U+223C ISOtech -->
-<!-- tilde operator is NOT the same character as the tilde, U+007E,
-     although the same glyph might be used to represent both  -->
-<!ENTITY cong     "&#8773;"> <!-- approximately equal to, U+2245 ISOtech -->
-<!ENTITY asymp    "&#8776;"> <!-- almost equal to = asymptotic to,
-                                     U+2248 ISOamsr -->
-<!ENTITY ne       "&#8800;"> <!-- not equal to, U+2260 ISOtech -->
-<!ENTITY equiv    "&#8801;"> <!-- identical to, U+2261 ISOtech -->
-<!ENTITY le       "&#8804;"> <!-- less-than or equal to, U+2264 ISOtech -->
-<!ENTITY ge       "&#8805;"> <!-- greater-than or equal to,
-                                     U+2265 ISOtech -->
-<!ENTITY sub      "&#8834;"> <!-- subset of, U+2282 ISOtech -->
-<!ENTITY sup      "&#8835;"> <!-- superset of, U+2283 ISOtech -->
-<!ENTITY nsub     "&#8836;"> <!-- not a subset of, U+2284 ISOamsn -->
-<!ENTITY sube     "&#8838;"> <!-- subset of or equal to, U+2286 ISOtech -->
-<!ENTITY supe     "&#8839;"> <!-- superset of or equal to,
-                                     U+2287 ISOtech -->
-<!ENTITY oplus    "&#8853;"> <!-- circled plus = direct sum,
-                                     U+2295 ISOamsb -->
-<!ENTITY otimes   "&#8855;"> <!-- circled times = vector product,
-                                     U+2297 ISOamsb -->
-<!ENTITY perp     "&#8869;"> <!-- up tack = orthogonal to = perpendicular,
-                                     U+22A5 ISOtech -->
-<!ENTITY sdot     "&#8901;"> <!-- dot operator, U+22C5 ISOamsb -->
-<!-- dot operator is NOT the same character as U+00B7 middle dot -->
-
-<!-- Miscellaneous Technical -->
-<!ENTITY lceil    "&#8968;"> <!-- left ceiling = APL upstile,
-                                     U+2308 ISOamsc  -->
-<!ENTITY rceil    "&#8969;"> <!-- right ceiling, U+2309 ISOamsc  -->
-<!ENTITY lfloor   "&#8970;"> <!-- left floor = APL downstile,
-                                     U+230A ISOamsc  -->
-<!ENTITY rfloor   "&#8971;"> <!-- right floor, U+230B ISOamsc  -->
-<!ENTITY lang     "&#9001;"> <!-- left-pointing angle bracket = bra,
-                                     U+2329 ISOtech -->
-<!-- lang is NOT the same character as U+003C 'less than sign' 
-     or U+2039 'single left-pointing angle quotation mark' -->
-<!ENTITY rang     "&#9002;"> <!-- right-pointing angle bracket = ket,
-                                     U+232A ISOtech -->
-<!-- rang is NOT the same character as U+003E 'greater than sign' 
-     or U+203A 'single right-pointing angle quotation mark' -->
-
-<!-- Geometric Shapes -->
-<!ENTITY loz      "&#9674;"> <!-- lozenge, U+25CA ISOpub -->
-
-<!-- Miscellaneous Symbols -->
-<!ENTITY spades   "&#9824;"> <!-- black spade suit, U+2660 ISOpub -->
-<!-- black here seems to mean filled as opposed to hollow -->
-<!ENTITY clubs    "&#9827;"> <!-- black club suit = shamrock,
-                                     U+2663 ISOpub -->
-<!ENTITY hearts   "&#9829;"> <!-- black heart suit = valentine,
-                                     U+2665 ISOpub -->
-<!ENTITY diams    "&#9830;"> <!-- black diamond suit, U+2666 ISOpub -->
Index: src/xmlcatalog/xhtml1-transitional.dtd
===================================================================
--- src/xmlcatalog/xhtml1-transitional.dtd	(revision 959954)
+++ src/xmlcatalog/xhtml1-transitional.dtd	(working copy)
@@ -1,1201 +0,0 @@
-<!--
-   Extensible HTML version 1.0 Transitional DTD
-
-   This is the same as HTML 4 Transitional except for
-   changes due to the differences between XML and SGML.
-
-   Namespace = http://www.w3.org/1999/xhtml
-
-   For further information, see: http://www.w3.org/TR/xhtml1
-
-   Copyright (c) 1998-2002 W3C (MIT, INRIA, Keio),
-   All Rights Reserved. 
-
-   This DTD module is identified by the PUBLIC and SYSTEM identifiers:
-
-   PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
-   SYSTEM "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"
-
-   $Revision: 1.2 $
-   $Date: 2002/08/01 18:37:55 $
-
--->
-
-<!--================ Character mnemonic entities =========================-->
-
-<!ENTITY % HTMLlat1 PUBLIC
-   "-//W3C//ENTITIES Latin 1 for XHTML//EN"
-   "xhtml-lat1.ent">
-%HTMLlat1;
-
-<!ENTITY % HTMLsymbol PUBLIC
-   "-//W3C//ENTITIES Symbols for XHTML//EN"
-   "xhtml-symbol.ent">
-%HTMLsymbol;
-
-<!ENTITY % HTMLspecial PUBLIC
-   "-//W3C//ENTITIES Special for XHTML//EN"
-   "xhtml-special.ent">
-%HTMLspecial;
-
-<!--================== Imported Names ====================================-->
-
-<!ENTITY % ContentType "CDATA">
-    <!-- media type, as per [RFC2045] -->
-
-<!ENTITY % ContentTypes "CDATA">
-    <!-- comma-separated list of media types, as per [RFC2045] -->
-
-<!ENTITY % Charset "CDATA">
-    <!-- a character encoding, as per [RFC2045] -->
-
-<!ENTITY % Charsets "CDATA">
-    <!-- a space separated list of character encodings, as per [RFC2045] -->
-
-<!ENTITY % LanguageCode "NMTOKEN">
-    <!-- a language code, as per [RFC3066] -->
-
-<!ENTITY % Character "CDATA">
-    <!-- a single character, as per section 2.2 of [XML] -->
-
-<!ENTITY % Number "CDATA">
-    <!-- one or more digits -->
-
-<!ENTITY % LinkTypes "CDATA">
-    <!-- space-separated list of link types -->
-
-<!ENTITY % MediaDesc "CDATA">
-    <!-- single or comma-separated list of media descriptors -->
-
-<!ENTITY % URI "CDATA">
-    <!-- a Uniform Resource Identifier, see [RFC2396] -->
-
-<!ENTITY % UriList "CDATA">
-    <!-- a space separated list of Uniform Resource Identifiers -->
-
-<!ENTITY % Datetime "CDATA">
-    <!-- date and time information. ISO date format -->
-
-<!ENTITY % Script "CDATA">
-    <!-- script expression -->
-
-<!ENTITY % StyleSheet "CDATA">
-    <!-- style sheet data -->
-
-<!ENTITY % Text "CDATA">
-    <!-- used for titles etc. -->
-
-<!ENTITY % FrameTarget "NMTOKEN">
-    <!-- render in this frame -->
-
-<!ENTITY % Length "CDATA">
-    <!-- nn for pixels or nn% for percentage length -->
-
-<!ENTITY % MultiLength "CDATA">
-    <!-- pixel, percentage, or relative -->
-
-<!ENTITY % Pixels "CDATA">
-    <!-- integer representing length in pixels -->
-
-<!-- these are used for image maps -->
-
-<!ENTITY % Shape "(rect|circle|poly|default)">
-
-<!ENTITY % Coords "CDATA">
-    <!-- comma separated list of lengths -->
-
-<!-- used for object, applet, img, input and iframe -->
-<!ENTITY % ImgAlign "(top|middle|bottom|left|right)">
-
-<!-- a color using sRGB: #RRGGBB as Hex values -->
-<!ENTITY % Color "CDATA">
-
-<!-- There are also 16 widely known color names with their sRGB values:
-
-    Black  = #000000    Green  = #008000
-    Silver = #C0C0C0    Lime   = #00FF00
-    Gray   = #808080    Olive  = #808000
-    White  = #FFFFFF    Yellow = #FFFF00
-    Maroon = #800000    Navy   = #000080
-    Red    = #FF0000    Blue   = #0000FF
-    Purple = #800080    Teal   = #008080
-    Fuchsia= #FF00FF    Aqua   = #00FFFF
--->
-
-<!--=================== Generic Attributes ===============================-->
-
-<!-- core attributes common to most elements
-  id       document-wide unique id
-  class    space separated list of classes
-  style    associated style info
-  title    advisory title/amplification
--->
-<!ENTITY % coreattrs
- "id          ID             #IMPLIED
-  class       CDATA          #IMPLIED
-  style       %StyleSheet;   #IMPLIED
-  title       %Text;         #IMPLIED"
-  >
-
-<!-- internationalization attributes
-  lang        language code (backwards compatible)
-  xml:lang    language code (as per XML 1.0 spec)
-  dir         direction for weak/neutral text
--->
-<!ENTITY % i18n
- "lang        %LanguageCode; #IMPLIED
-  xml:lang    %LanguageCode; #IMPLIED
-  dir         (ltr|rtl)      #IMPLIED"
-  >
-
-<!-- attributes for common UI events
-  onclick     a pointer button was clicked
-  ondblclick  a pointer button was double clicked
-  onmousedown a pointer button was pressed down
-  onmouseup   a pointer button was released
-  onmousemove a pointer was moved onto the element
-  onmouseout  a pointer was moved away from the element
-  onkeypress  a key was pressed and released
-  onkeydown   a key was pressed down
-  onkeyup     a key was released
--->
-<!ENTITY % events
- "onclick     %Script;       #IMPLIED
-  ondblclick  %Script;       #IMPLIED
-  onmousedown %Script;       #IMPLIED
-  onmouseup   %Script;       #IMPLIED
-  onmouseover %Script;       #IMPLIED
-  onmousemove %Script;       #IMPLIED
-  onmouseout  %Script;       #IMPLIED
-  onkeypress  %Script;       #IMPLIED
-  onkeydown   %Script;       #IMPLIED
-  onkeyup     %Script;       #IMPLIED"
-  >
-
-<!-- attributes for elements that can get the focus
-  accesskey   accessibility key character
-  tabindex    position in tabbing order
-  onfocus     the element got the focus
-  onblur      the element lost the focus
--->
-<!ENTITY % focus
- "accesskey   %Character;    #IMPLIED
-  tabindex    %Number;       #IMPLIED
-  onfocus     %Script;       #IMPLIED
-  onblur      %Script;       #IMPLIED"
-  >
-
-<!ENTITY % attrs "%coreattrs; %i18n; %events;">
-
-<!-- text alignment for p, div, h1-h6. The default is
-     align="left" for ltr headings, "right" for rtl -->
-
-<!ENTITY % TextAlign "align (left|center|right|justify) #IMPLIED">
-
-<!--=================== Text Elements ====================================-->
-
-<!ENTITY % special.extra
-   "object | applet | img | map | iframe">
-	
-<!ENTITY % special.basic
-	"br | span | bdo">
-
-<!ENTITY % special
-   "%special.basic; | %special.extra;">
-
-<!ENTITY % fontstyle.extra "big | small | font | basefont">
-
-<!ENTITY % fontstyle.basic "tt | i | b | u
-                      | s | strike ">
-
-<!ENTITY % fontstyle "%fontstyle.basic; | %fontstyle.extra;">
-
-<!ENTITY % phrase.extra "sub | sup">
-<!ENTITY % phrase.basic "em | strong | dfn | code | q |
-                   samp | kbd | var | cite | abbr | acronym">
-
-<!ENTITY % phrase "%phrase.basic; | %phrase.extra;">
-
-<!ENTITY % inline.forms "input | select | textarea | label | button">
-
-<!-- these can occur at block or inline level -->
-<!ENTITY % misc.inline "ins | del | script">
-
-<!-- these can only occur at block level -->
-<!ENTITY % misc "noscript | %misc.inline;">
-
-<!ENTITY % inline "a | %special; | %fontstyle; | %phrase; | %inline.forms;">
-
-<!-- %Inline; covers inline or "text-level" elements -->
-<!ENTITY % Inline "(#PCDATA | %inline; | %misc.inline;)*">
-
-<!--================== Block level elements ==============================-->
-
-<!ENTITY % heading "h1|h2|h3|h4|h5|h6">
-<!ENTITY % lists "ul | ol | dl | menu | dir">
-<!ENTITY % blocktext "pre | hr | blockquote | address | center | noframes">
-
-<!ENTITY % block
-    "p | %heading; | div | %lists; | %blocktext; | isindex |fieldset | table">
-
-<!-- %Flow; mixes block and inline and is used for list items etc. -->
-<!ENTITY % Flow "(#PCDATA | %block; | form | %inline; | %misc;)*">
-
-<!--================== Content models for exclusions =====================-->
-
-<!-- a elements use %Inline; excluding a -->
-
-<!ENTITY % a.content
-   "(#PCDATA | %special; | %fontstyle; | %phrase; | %inline.forms; | %misc.inline;)*">
-
-<!-- pre uses %Inline excluding img, object, applet, big, small,
-     font, or basefont -->
-
-<!ENTITY % pre.content
-   "(#PCDATA | a | %special.basic; | %fontstyle.basic; | %phrase.basic; |
-	   %inline.forms; | %misc.inline;)*">
-
-<!-- form uses %Flow; excluding form -->
-
-<!ENTITY % form.content "(#PCDATA | %block; | %inline; | %misc;)*">
-
-<!-- button uses %Flow; but excludes a, form, form controls, iframe -->
-
-<!ENTITY % button.content
-   "(#PCDATA | p | %heading; | div | %lists; | %blocktext; |
-      table | br | span | bdo | object | applet | img | map |
-      %fontstyle; | %phrase; | %misc;)*">
-
-<!--================ Document Structure ==================================-->
-
-<!-- the namespace URI designates the document profile -->
-
-<!ELEMENT html (head, body)>
-<!ATTLIST html
-  %i18n;
-  id          ID             #IMPLIED
-  xmlns       %URI;          #FIXED 'http://www.w3.org/1999/xhtml'
-  >
-
-<!--================ Document Head =======================================-->
-
-<!ENTITY % head.misc "(script|style|meta|link|object|isindex)*">
-
-<!-- content model is %head.misc; combined with a single
-     title and an optional base element in any order -->
-
-<!ELEMENT head (%head.misc;,
-     ((title, %head.misc;, (base, %head.misc;)?) |
-      (base, %head.misc;, (title, %head.misc;))))>
-
-<!ATTLIST head
-  %i18n;
-  id          ID             #IMPLIED
-  profile     %URI;          #IMPLIED
-  >
-
-<!-- The title element is not considered part of the flow of text.
-       It should be displayed, for example as the page header or
-       window title. Exactly one title is required per document.
-    -->
-<!ELEMENT title (#PCDATA)>
-<!ATTLIST title 
-  %i18n;
-  id          ID             #IMPLIED
-  >
-
-<!-- document base URI -->
-
-<!ELEMENT base EMPTY>
-<!ATTLIST base
-  id          ID             #IMPLIED
-  href        %URI;          #IMPLIED
-  target      %FrameTarget;  #IMPLIED
-  >
-
-<!-- generic metainformation -->
-<!ELEMENT meta EMPTY>
-<!ATTLIST meta
-  %i18n;
-  id          ID             #IMPLIED
-  http-equiv  CDATA          #IMPLIED
-  name        CDATA          #IMPLIED
-  content     CDATA          #REQUIRED
-  scheme      CDATA          #IMPLIED
-  >
-
-<!--
-  Relationship values can be used in principle:
-
-   a) for document specific toolbars/menus when used
-      with the link element in document head e.g.
-        start, contents, previous, next, index, end, help
-   b) to link to a separate style sheet (rel="stylesheet")
-   c) to make a link to a script (rel="script")
-   d) by stylesheets to control how collections of
-      html nodes are rendered into printed documents
-   e) to make a link to a printable version of this document
-      e.g. a PostScript or PDF version (rel="alternate" media="print")
--->
-
-<!ELEMENT link EMPTY>
-<!ATTLIST link
-  %attrs;
-  charset     %Charset;      #IMPLIED
-  href        %URI;          #IMPLIED
-  hreflang    %LanguageCode; #IMPLIED
-  type        %ContentType;  #IMPLIED
-  rel         %LinkTypes;    #IMPLIED
-  rev         %LinkTypes;    #IMPLIED
-  media       %MediaDesc;    #IMPLIED
-  target      %FrameTarget;  #IMPLIED
-  >
-
-<!-- style info, which may include CDATA sections -->
-<!ELEMENT style (#PCDATA)>
-<!ATTLIST style
-  %i18n;
-  id          ID             #IMPLIED
-  type        %ContentType;  #REQUIRED
-  media       %MediaDesc;    #IMPLIED
-  title       %Text;         #IMPLIED
-  xml:space   (preserve)     #FIXED 'preserve'
-  >
-
-<!-- script statements, which may include CDATA sections -->
-<!ELEMENT script (#PCDATA)>
-<!ATTLIST script
-  id          ID             #IMPLIED
-  charset     %Charset;      #IMPLIED
-  type        %ContentType;  #REQUIRED
-  language    CDATA          #IMPLIED
-  src         %URI;          #IMPLIED
-  defer       (defer)        #IMPLIED
-  xml:space   (preserve)     #FIXED 'preserve'
-  >
-
-<!-- alternate content container for non script-based rendering -->
-
-<!ELEMENT noscript %Flow;>
-<!ATTLIST noscript
-  %attrs;
-  >
-
-<!--======================= Frames =======================================-->
-
-<!-- inline subwindow -->
-
-<!ELEMENT iframe %Flow;>
-<!ATTLIST iframe
-  %coreattrs;
-  longdesc    %URI;          #IMPLIED
-  name        NMTOKEN        #IMPLIED
-  src         %URI;          #IMPLIED
-  frameborder (1|0)          "1"
-  marginwidth %Pixels;       #IMPLIED
-  marginheight %Pixels;      #IMPLIED
-  scrolling   (yes|no|auto)  "auto"
-  align       %ImgAlign;     #IMPLIED
-  height      %Length;       #IMPLIED
-  width       %Length;       #IMPLIED
-  >
-
-<!-- alternate content container for non frame-based rendering -->
-
-<!ELEMENT noframes %Flow;>
-<!ATTLIST noframes
-  %attrs;
-  >
-
-<!--=================== Document Body ====================================-->
-
-<!ELEMENT body %Flow;>
-<!ATTLIST body
-  %attrs;
-  onload      %Script;       #IMPLIED
-  onunload    %Script;       #IMPLIED
-  background  %URI;          #IMPLIED
-  bgcolor     %Color;        #IMPLIED
-  text        %Color;        #IMPLIED
-  link        %Color;        #IMPLIED
-  vlink       %Color;        #IMPLIED
-  alink       %Color;        #IMPLIED
-  >
-
-<!ELEMENT div %Flow;>  <!-- generic language/style container -->
-<!ATTLIST div
-  %attrs;
-  %TextAlign;
-  >
-
-<!--=================== Paragraphs =======================================-->
-
-<!ELEMENT p %Inline;>
-<!ATTLIST p
-  %attrs;
-  %TextAlign;
-  >
-
-<!--=================== Headings =========================================-->
-
-<!--
-  There are six levels of headings from h1 (the most important)
-  to h6 (the least important).
--->
-
-<!ELEMENT h1  %Inline;>
-<!ATTLIST h1
-  %attrs;
-  %TextAlign;
-  >
-
-<!ELEMENT h2 %Inline;>
-<!ATTLIST h2
-  %attrs;
-  %TextAlign;
-  >
-
-<!ELEMENT h3 %Inline;>
-<!ATTLIST h3
-  %attrs;
-  %TextAlign;
-  >
-
-<!ELEMENT h4 %Inline;>
-<!ATTLIST h4
-  %attrs;
-  %TextAlign;
-  >
-
-<!ELEMENT h5 %Inline;>
-<!ATTLIST h5
-  %attrs;
-  %TextAlign;
-  >
-
-<!ELEMENT h6 %Inline;>
-<!ATTLIST h6
-  %attrs;
-  %TextAlign;
-  >
-
-<!--=================== Lists ============================================-->
-
-<!-- Unordered list bullet styles -->
-
-<!ENTITY % ULStyle "(disc|square|circle)">
-
-<!-- Unordered list -->
-
-<!ELEMENT ul (li)+>
-<!ATTLIST ul
-  %attrs;
-  type        %ULStyle;     #IMPLIED
-  compact     (compact)     #IMPLIED
-  >
-
-<!-- Ordered list numbering style
-
-    1   arabic numbers      1, 2, 3, ...
-    a   lower alpha         a, b, c, ...
-    A   upper alpha         A, B, C, ...
-    i   lower roman         i, ii, iii, ...
-    I   upper roman         I, II, III, ...
-
-    The style is applied to the sequence number which by default
-    is reset to 1 for the first list item in an ordered list.
--->
-<!ENTITY % OLStyle "CDATA">
-
-<!-- Ordered (numbered) list -->
-
-<!ELEMENT ol (li)+>
-<!ATTLIST ol
-  %attrs;
-  type        %OLStyle;      #IMPLIED
-  compact     (compact)      #IMPLIED
-  start       %Number;       #IMPLIED
-  >
-
-<!-- single column list (DEPRECATED) --> 
-<!ELEMENT menu (li)+>
-<!ATTLIST menu
-  %attrs;
-  compact     (compact)     #IMPLIED
-  >
-
-<!-- multiple column list (DEPRECATED) --> 
-<!ELEMENT dir (li)+>
-<!ATTLIST dir
-  %attrs;
-  compact     (compact)     #IMPLIED
-  >
-
-<!-- LIStyle is constrained to: "(%ULStyle;|%OLStyle;)" -->
-<!ENTITY % LIStyle "CDATA">
-
-<!-- list item -->
-
-<!ELEMENT li %Flow;>
-<!ATTLIST li
-  %attrs;
-  type        %LIStyle;      #IMPLIED
-  value       %Number;       #IMPLIED
-  >
-
-<!-- definition lists - dt for term, dd for its definition -->
-
-<!ELEMENT dl (dt|dd)+>
-<!ATTLIST dl
-  %attrs;
-  compact     (compact)      #IMPLIED
-  >
-
-<!ELEMENT dt %Inline;>
-<!ATTLIST dt
-  %attrs;
-  >
-
-<!ELEMENT dd %Flow;>
-<!ATTLIST dd
-  %attrs;
-  >
-
-<!--=================== Address ==========================================-->
-
-<!-- information on author -->
-
-<!ELEMENT address (#PCDATA | %inline; | %misc.inline; | p)*>
-<!ATTLIST address
-  %attrs;
-  >
-
-<!--=================== Horizontal Rule ==================================-->
-
-<!ELEMENT hr EMPTY>
-<!ATTLIST hr
-  %attrs;
-  align       (left|center|right) #IMPLIED
-  noshade     (noshade)      #IMPLIED
-  size        %Pixels;       #IMPLIED
-  width       %Length;       #IMPLIED
-  >
-
-<!--=================== Preformatted Text ================================-->
-
-<!-- content is %Inline; excluding 
-        "img|object|applet|big|small|sub|sup|font|basefont" -->
-
-<!ELEMENT pre %pre.content;>
-<!ATTLIST pre
-  %attrs;
-  width       %Number;      #IMPLIED
-  xml:space   (preserve)    #FIXED 'preserve'
-  >
-
-<!--=================== Block-like Quotes ================================-->
-
-<!ELEMENT blockquote %Flow;>
-<!ATTLIST blockquote
-  %attrs;
-  cite        %URI;          #IMPLIED
-  >
-
-<!--=================== Text alignment ===================================-->
-
-<!-- center content -->
-<!ELEMENT center %Flow;>
-<!ATTLIST center
-  %attrs;
-  >
-
-<!--=================== Inserted/Deleted Text ============================-->
-
-<!--
-  ins/del are allowed in block and inline content, but its
-  inappropriate to include block content within an ins element
-  occurring in inline content.
--->
-<!ELEMENT ins %Flow;>
-<!ATTLIST ins
-  %attrs;
-  cite        %URI;          #IMPLIED
-  datetime    %Datetime;     #IMPLIED
-  >
-
-<!ELEMENT del %Flow;>
-<!ATTLIST del
-  %attrs;
-  cite        %URI;          #IMPLIED
-  datetime    %Datetime;     #IMPLIED
-  >
-
-<!--================== The Anchor Element ================================-->
-
-<!-- content is %Inline; except that anchors shouldn't be nested -->
-
-<!ELEMENT a %a.content;>
-<!ATTLIST a
-  %attrs;
-  %focus;
-  charset     %Charset;      #IMPLIED
-  type        %ContentType;  #IMPLIED
-  name        NMTOKEN        #IMPLIED
-  href        %URI;          #IMPLIED
-  hreflang    %LanguageCode; #IMPLIED
-  rel         %LinkTypes;    #IMPLIED
-  rev         %LinkTypes;    #IMPLIED
-  shape       %Shape;        "rect"
-  coords      %Coords;       #IMPLIED
-  target      %FrameTarget;  #IMPLIED
-  >
-
-<!--===================== Inline Elements ================================-->
-
-<!ELEMENT span %Inline;> <!-- generic language/style container -->
-<!ATTLIST span
-  %attrs;
-  >
-
-<!ELEMENT bdo %Inline;>  <!-- I18N BiDi over-ride -->
-<!ATTLIST bdo
-  %coreattrs;
-  %events;
-  lang        %LanguageCode; #IMPLIED
-  xml:lang    %LanguageCode; #IMPLIED
-  dir         (ltr|rtl)      #REQUIRED
-  >
-
-<!ELEMENT br EMPTY>   <!-- forced line break -->
-<!ATTLIST br
-  %coreattrs;
-  clear       (left|all|right|none) "none"
-  >
-
-<!ELEMENT em %Inline;>   <!-- emphasis -->
-<!ATTLIST em %attrs;>
-
-<!ELEMENT strong %Inline;>   <!-- strong emphasis -->
-<!ATTLIST strong %attrs;>
-
-<!ELEMENT dfn %Inline;>   <!-- definitional -->
-<!ATTLIST dfn %attrs;>
-
-<!ELEMENT code %Inline;>   <!-- program code -->
-<!ATTLIST code %attrs;>
-
-<!ELEMENT samp %Inline;>   <!-- sample -->
-<!ATTLIST samp %attrs;>
-
-<!ELEMENT kbd %Inline;>  <!-- something user would type -->
-<!ATTLIST kbd %attrs;>
-
-<!ELEMENT var %Inline;>   <!-- variable -->
-<!ATTLIST var %attrs;>
-
-<!ELEMENT cite %Inline;>   <!-- citation -->
-<!ATTLIST cite %attrs;>
-
-<!ELEMENT abbr %Inline;>   <!-- abbreviation -->
-<!ATTLIST abbr %attrs;>
-
-<!ELEMENT acronym %Inline;>   <!-- acronym -->
-<!ATTLIST acronym %attrs;>
-
-<!ELEMENT q %Inline;>   <!-- inlined quote -->
-<!ATTLIST q
-  %attrs;
-  cite        %URI;          #IMPLIED
-  >
-
-<!ELEMENT sub %Inline;> <!-- subscript -->
-<!ATTLIST sub %attrs;>
-
-<!ELEMENT sup %Inline;> <!-- superscript -->
-<!ATTLIST sup %attrs;>
-
-<!ELEMENT tt %Inline;>   <!-- fixed pitch font -->
-<!ATTLIST tt %attrs;>
-
-<!ELEMENT i %Inline;>   <!-- italic font -->
-<!ATTLIST i %attrs;>
-
-<!ELEMENT b %Inline;>   <!-- bold font -->
-<!ATTLIST b %attrs;>
-
-<!ELEMENT big %Inline;>   <!-- bigger font -->
-<!ATTLIST big %attrs;>
-
-<!ELEMENT small %Inline;>   <!-- smaller font -->
-<!ATTLIST small %attrs;>
-
-<!ELEMENT u %Inline;>   <!-- underline -->
-<!ATTLIST u %attrs;>
-
-<!ELEMENT s %Inline;>   <!-- strike-through -->
-<!ATTLIST s %attrs;>
-
-<!ELEMENT strike %Inline;>   <!-- strike-through -->
-<!ATTLIST strike %attrs;>
-
-<!ELEMENT basefont EMPTY>  <!-- base font size -->
-<!ATTLIST basefont
-  id          ID             #IMPLIED
-  size        CDATA          #REQUIRED
-  color       %Color;        #IMPLIED
-  face        CDATA          #IMPLIED
-  >
-
-<!ELEMENT font %Inline;> <!-- local change to font -->
-<!ATTLIST font
-  %coreattrs;
-  %i18n;
-  size        CDATA          #IMPLIED
-  color       %Color;        #IMPLIED
-  face        CDATA          #IMPLIED
-  >
-
-<!--==================== Object ======================================-->
-<!--
-  object is used to embed objects as part of HTML pages.
-  param elements should precede other content. Parameters
-  can also be expressed as attribute/value pairs on the
-  object element itself when brevity is desired.
--->
-
-<!ELEMENT object (#PCDATA | param | %block; | form | %inline; | %misc;)*>
-<!ATTLIST object
-  %attrs;
-  declare     (declare)      #IMPLIED
-  classid     %URI;          #IMPLIED
-  codebase    %URI;          #IMPLIED
-  data        %URI;          #IMPLIED
-  type        %ContentType;  #IMPLIED
-  codetype    %ContentType;  #IMPLIED
-  archive     %UriList;      #IMPLIED
-  standby     %Text;         #IMPLIED
-  height      %Length;       #IMPLIED
-  width       %Length;       #IMPLIED
-  usemap      %URI;          #IMPLIED
-  name        NMTOKEN        #IMPLIED
-  tabindex    %Number;       #IMPLIED
-  align       %ImgAlign;     #IMPLIED
-  border      %Pixels;       #IMPLIED
-  hspace      %Pixels;       #IMPLIED
-  vspace      %Pixels;       #IMPLIED
-  >
-
-<!--
-  param is used to supply a named property value.
-  In XML it would seem natural to follow RDF and support an
-  abbreviated syntax where the param elements are replaced
-  by attribute value pairs on the object start tag.
--->
-<!ELEMENT param EMPTY>
-<!ATTLIST param
-  id          ID             #IMPLIED
-  name        CDATA          #REQUIRED
-  value       CDATA          #IMPLIED
-  valuetype   (data|ref|object) "data"
-  type        %ContentType;  #IMPLIED
-  >
-
-<!--=================== Java applet ==================================-->
-<!--
-  One of code or object attributes must be present.
-  Place param elements before other content.
--->
-<!ELEMENT applet (#PCDATA | param | %block; | form | %inline; | %misc;)*>
-<!ATTLIST applet
-  %coreattrs;
-  codebase    %URI;          #IMPLIED
-  archive     CDATA          #IMPLIED
-  code        CDATA          #IMPLIED
-  object      CDATA          #IMPLIED
-  alt         %Text;         #IMPLIED
-  name        NMTOKEN        #IMPLIED
-  width       %Length;       #REQUIRED
-  height      %Length;       #REQUIRED
-  align       %ImgAlign;     #IMPLIED
-  hspace      %Pixels;       #IMPLIED
-  vspace      %Pixels;       #IMPLIED
-  >
-
-<!--=================== Images ===========================================-->
-
-<!--
-   To avoid accessibility problems for people who aren't
-   able to see the image, you should provide a text
-   description using the alt and longdesc attributes.
-   In addition, avoid the use of server-side image maps.
--->
-
-<!ELEMENT img EMPTY>
-<!ATTLIST img
-  %attrs;
-  src         %URI;          #REQUIRED
-  alt         %Text;         #REQUIRED
-  name        NMTOKEN        #IMPLIED
-  longdesc    %URI;          #IMPLIED
-  height      %Length;       #IMPLIED
-  width       %Length;       #IMPLIED
-  usemap      %URI;          #IMPLIED
-  ismap       (ismap)        #IMPLIED
-  align       %ImgAlign;     #IMPLIED
-  border      %Length;       #IMPLIED
-  hspace      %Pixels;       #IMPLIED
-  vspace      %Pixels;       #IMPLIED
-  >
-
-<!-- usemap points to a map element which may be in this document
-  or an external document, although the latter is not widely supported -->
-
-<!--================== Client-side image maps ============================-->
-
-<!-- These can be placed in the same document or grouped in a
-     separate document although this isn't yet widely supported -->
-
-<!ELEMENT map ((%block; | form | %misc;)+ | area+)>
-<!ATTLIST map
-  %i18n;
-  %events;
-  id          ID             #REQUIRED
-  class       CDATA          #IMPLIED
-  style       %StyleSheet;   #IMPLIED
-  title       %Text;         #IMPLIED
-  name        CDATA          #IMPLIED
-  >
-
-<!ELEMENT area EMPTY>
-<!ATTLIST area
-  %attrs;
-  %focus;
-  shape       %Shape;        "rect"
-  coords      %Coords;       #IMPLIED
-  href        %URI;          #IMPLIED
-  nohref      (nohref)       #IMPLIED
-  alt         %Text;         #REQUIRED
-  target      %FrameTarget;  #IMPLIED
-  >
-
-<!--================ Forms ===============================================-->
-
-<!ELEMENT form %form.content;>   <!-- forms shouldn't be nested -->
-
-<!ATTLIST form
-  %attrs;
-  action      %URI;          #REQUIRED
-  method      (get|post)     "get"
-  name        NMTOKEN        #IMPLIED
-  enctype     %ContentType;  "application/x-www-form-urlencoded"
-  onsubmit    %Script;       #IMPLIED
-  onreset     %Script;       #IMPLIED
-  accept      %ContentTypes; #IMPLIED
-  accept-charset %Charsets;  #IMPLIED
-  target      %FrameTarget;  #IMPLIED
-  >
-
-<!--
-  Each label must not contain more than ONE field
-  Label elements shouldn't be nested.
--->
-<!ELEMENT label %Inline;>
-<!ATTLIST label
-  %attrs;
-  for         IDREF          #IMPLIED
-  accesskey   %Character;    #IMPLIED
-  onfocus     %Script;       #IMPLIED
-  onblur      %Script;       #IMPLIED
-  >
-
-<!ENTITY % InputType
-  "(text | password | checkbox |
-    radio | submit | reset |
-    file | hidden | image | button)"
-   >
-
-<!-- the name attribute is required for all but submit & reset -->
-
-<!ELEMENT input EMPTY>     <!-- form control -->
-<!ATTLIST input
-  %attrs;
-  %focus;
-  type        %InputType;    "text"
-  name        CDATA          #IMPLIED
-  value       CDATA          #IMPLIED
-  checked     (checked)      #IMPLIED
-  disabled    (disabled)     #IMPLIED
-  readonly    (readonly)     #IMPLIED
-  size        CDATA          #IMPLIED
-  maxlength   %Number;       #IMPLIED
-  src         %URI;          #IMPLIED
-  alt         CDATA          #IMPLIED
-  usemap      %URI;          #IMPLIED
-  onselect    %Script;       #IMPLIED
-  onchange    %Script;       #IMPLIED
-  accept      %ContentTypes; #IMPLIED
-  align       %ImgAlign;     #IMPLIED
-  >
-
-<!ELEMENT select (optgroup|option)+>  <!-- option selector -->
-<!ATTLIST select
-  %attrs;
-  name        CDATA          #IMPLIED
-  size        %Number;       #IMPLIED
-  multiple    (multiple)     #IMPLIED
-  disabled    (disabled)     #IMPLIED
-  tabindex    %Number;       #IMPLIED
-  onfocus     %Script;       #IMPLIED
-  onblur      %Script;       #IMPLIED
-  onchange    %Script;       #IMPLIED
-  >
-
-<!ELEMENT optgroup (option)+>   <!-- option group -->
-<!ATTLIST optgroup
-  %attrs;
-  disabled    (disabled)     #IMPLIED
-  label       %Text;         #REQUIRED
-  >
-
-<!ELEMENT option (#PCDATA)>     <!-- selectable choice -->
-<!ATTLIST option
-  %attrs;
-  selected    (selected)     #IMPLIED
-  disabled    (disabled)     #IMPLIED
-  label       %Text;         #IMPLIED
-  value       CDATA          #IMPLIED
-  >
-
-<!ELEMENT textarea (#PCDATA)>     <!-- multi-line text field -->
-<!ATTLIST textarea
-  %attrs;
-  %focus;
-  name        CDATA          #IMPLIED
-  rows        %Number;       #REQUIRED
-  cols        %Number;       #REQUIRED
-  disabled    (disabled)     #IMPLIED
-  readonly    (readonly)     #IMPLIED
-  onselect    %Script;       #IMPLIED
-  onchange    %Script;       #IMPLIED
-  >
-
-<!--
-  The fieldset element is used to group form fields.
-  Only one legend element should occur in the content
-  and if present should only be preceded by whitespace.
--->
-<!ELEMENT fieldset (#PCDATA | legend | %block; | form | %inline; | %misc;)*>
-<!ATTLIST fieldset
-  %attrs;
-  >
-
-<!ENTITY % LAlign "(top|bottom|left|right)">
-
-<!ELEMENT legend %Inline;>     <!-- fieldset label -->
-<!ATTLIST legend
-  %attrs;
-  accesskey   %Character;    #IMPLIED
-  align       %LAlign;       #IMPLIED
-  >
-
-<!--
- Content is %Flow; excluding a, form, form controls, iframe
---> 
-<!ELEMENT button %button.content;>  <!-- push button -->
-<!ATTLIST button
-  %attrs;
-  %focus;
-  name        CDATA          #IMPLIED
-  value       CDATA          #IMPLIED
-  type        (button|submit|reset) "submit"
-  disabled    (disabled)     #IMPLIED
-  >
-
-<!-- single-line text input control (DEPRECATED) -->
-<!ELEMENT isindex EMPTY>
-<!ATTLIST isindex
-  %coreattrs;
-  %i18n;
-  prompt      %Text;         #IMPLIED
-  >
-
-<!--======================= Tables =======================================-->
-
-<!-- Derived from IETF HTML table standard, see [RFC1942] -->
-
-<!--
- The border attribute sets the thickness of the frame around the
- table. The default units are screen pixels.
-
- The frame attribute specifies which parts of the frame around
- the table should be rendered. The values are not the same as
- CALS to avoid a name clash with the valign attribute.
--->
-<!ENTITY % TFrame "(void|above|below|hsides|lhs|rhs|vsides|box|border)">
-
-<!--
- The rules attribute defines which rules to draw between cells:
-
- If rules is absent then assume:
-     "none" if border is absent or border="0" otherwise "all"
--->
-
-<!ENTITY % TRules "(none | groups | rows | cols | all)">
-  
-<!-- horizontal placement of table relative to document -->
-<!ENTITY % TAlign "(left|center|right)">
-
-<!-- horizontal alignment attributes for cell contents
-
-  char        alignment char, e.g. char=':'
-  charoff     offset for alignment char
--->
-<!ENTITY % cellhalign
-  "align      (left|center|right|justify|char) #IMPLIED
-   char       %Character;    #IMPLIED
-   charoff    %Length;       #IMPLIED"
-  >
-
-<!-- vertical alignment attributes for cell contents -->
-<!ENTITY % cellvalign
-  "valign     (top|middle|bottom|baseline) #IMPLIED"
-  >
-
-<!ELEMENT table
-     (caption?, (col*|colgroup*), thead?, tfoot?, (tbody+|tr+))>
-<!ELEMENT caption  %Inline;>
-<!ELEMENT thead    (tr)+>
-<!ELEMENT tfoot    (tr)+>
-<!ELEMENT tbody    (tr)+>
-<!ELEMENT colgroup (col)*>
-<!ELEMENT col      EMPTY>
-<!ELEMENT tr       (th|td)+>
-<!ELEMENT th       %Flow;>
-<!ELEMENT td       %Flow;>
-
-<!ATTLIST table
-  %attrs;
-  summary     %Text;         #IMPLIED
-  width       %Length;       #IMPLIED
-  border      %Pixels;       #IMPLIED
-  frame       %TFrame;       #IMPLIED
-  rules       %TRules;       #IMPLIED
-  cellspacing %Length;       #IMPLIED
-  cellpadding %Length;       #IMPLIED
-  align       %TAlign;       #IMPLIED
-  bgcolor     %Color;        #IMPLIED
-  >
-
-<!ENTITY % CAlign "(top|bottom|left|right)">
-
-<!ATTLIST caption
-  %attrs;
-  align       %CAlign;       #IMPLIED
-  >
-
-<!--
-colgroup groups a set of col elements. It allows you to group
-several semantically related columns together.
--->
-<!ATTLIST colgroup
-  %attrs;
-  span        %Number;       "1"
-  width       %MultiLength;  #IMPLIED
-  %cellhalign;
-  %cellvalign;
-  >
-
-<!--
- col elements define the alignment properties for cells in
- one or more columns.
-
- The width attribute specifies the width of the columns, e.g.
-
-     width=64        width in screen pixels
-     width=0.5*      relative width of 0.5
-
- The span attribute causes the attributes of one
- col element to apply to more than one column.
--->
-<!ATTLIST col
-  %attrs;
-  span        %Number;       "1"
-  width       %MultiLength;  #IMPLIED
-  %cellhalign;
-  %cellvalign;
-  >
-
-<!--
-    Use thead to duplicate headers when breaking table
-    across page boundaries, or for static headers when
-    tbody sections are rendered in scrolling panel.
-
-    Use tfoot to duplicate footers when breaking table
-    across page boundaries, or for static footers when
-    tbody sections are rendered in scrolling panel.
-
-    Use multiple tbody sections when rules are needed
-    between groups of table rows.
--->
-<!ATTLIST thead
-  %attrs;
-  %cellhalign;
-  %cellvalign;
-  >
-
-<!ATTLIST tfoot
-  %attrs;
-  %cellhalign;
-  %cellvalign;
-  >
-
-<!ATTLIST tbody
-  %attrs;
-  %cellhalign;
-  %cellvalign;
-  >
-
-<!ATTLIST tr
-  %attrs;
-  %cellhalign;
-  %cellvalign;
-  bgcolor     %Color;        #IMPLIED
-  >
-
-<!-- Scope is simpler than headers attribute for common tables -->
-<!ENTITY % Scope "(row|col|rowgroup|colgroup)">
-
-<!-- th is for headers, td for data and for cells acting as both -->
-
-<!ATTLIST th
-  %attrs;
-  abbr        %Text;         #IMPLIED
-  axis        CDATA          #IMPLIED
-  headers     IDREFS         #IMPLIED
-  scope       %Scope;        #IMPLIED
-  rowspan     %Number;       "1"
-  colspan     %Number;       "1"
-  %cellhalign;
-  %cellvalign;
-  nowrap      (nowrap)       #IMPLIED
-  bgcolor     %Color;        #IMPLIED
-  width       %Length;       #IMPLIED
-  height      %Length;       #IMPLIED
-  >
-
-<!ATTLIST td
-  %attrs;
-  abbr        %Text;         #IMPLIED
-  axis        CDATA          #IMPLIED
-  headers     IDREFS         #IMPLIED
-  scope       %Scope;        #IMPLIED
-  rowspan     %Number;       "1"
-  colspan     %Number;       "1"
-  %cellhalign;
-  %cellvalign;
-  nowrap      (nowrap)       #IMPLIED
-  bgcolor     %Color;        #IMPLIED
-  width       %Length;       #IMPLIED
-  height      %Length;       #IMPLIED
-  >
-
Index: src/web/locale/org/nutch/jsp/text.properties
===================================================================
--- src/web/locale/org/nutch/jsp/text.properties	(revision 959954)
+++ src/web/locale/org/nutch/jsp/text.properties	(working copy)
@@ -1,3 +0,0 @@
-title = Nutch Plain Text Cache
-note = This is the plain text version of the file: <a href="{0}">{0}</a>.
-noText = <i>Sorry, no plain text version is available.</i>
Index: src/web/locale/org/nutch/jsp/search_en.properties
===================================================================
--- src/web/locale/org/nutch/jsp/search_en.properties	(revision 959954)
+++ src/web/locale/org/nutch/jsp/search_en.properties	(working copy)
@@ -1,11 +0,0 @@
-title = search results
-search = Search
-hits = Hits <b>{0}-{1}</b> (out of about {2} total matching pages):
-cached = cached
-explain = explain
-anchors = anchors
-next = next page
-moreFrom = more from
-showAllHits = show all hits
-clustering = clustering
-viewAsText = View as Plain Text
Index: src/web/locale/org/nutch/jsp/anchors_pl.properties
===================================================================
--- src/web/locale/org/nutch/jsp/anchors_pl.properties	(revision 959954)
+++ src/web/locale/org/nutch/jsp/anchors_pl.properties	(working copy)
@@ -1,3 +0,0 @@
-title = odno\u015bniki
-anchors = tekst przychodz\u0105cego odno\u015bnika:
-page = strona: <a href="{0}">{0}</a>
Index: src/web/locale/org/nutch/jsp/cached_en.properties
===================================================================
--- src/web/locale/org/nutch/jsp/cached_en.properties	(revision 959954)
+++ src/web/locale/org/nutch/jsp/cached_en.properties	(working copy)
@@ -1,3 +0,0 @@
-title = nutch cache
-page = page: <a href="{0}">{0}</a>
-noContent = <i>Sorry, no content is cached for this page.</i>
Index: src/web/locale/org/nutch/jsp/explain_pl.properties
===================================================================
--- src/web/locale/org/nutch/jsp/explain_pl.properties	(revision 959954)
+++ src/web/locale/org/nutch/jsp/explain_pl.properties	(working copy)
@@ -1,3 +0,0 @@
-title = wyja\u015bnienie wyniku
-page = strona
-scoreForQuery = wynik dla zapytania: <tt>{0}</tt>
Index: src/web/locale/org/nutch/jsp/search_es.properties
===================================================================
--- src/web/locale/org/nutch/jsp/search_es.properties	(revision 959954)
+++ src/web/locale/org/nutch/jsp/search_es.properties	(working copy)
@@ -1,9 +0,0 @@
-﻿title = resultados de la bÃºsqueda
-search = Buscar
-hits = Resultados <b>{0}-{1}</b> (de un total de {2} documentos):
-cached = en cachÃ©
-explain = explicar
-anchors = anchors
-next = Siguiente
-clustering = clustering
-viewAsText = View as Plain Text
Index: src/web/locale/org/nutch/jsp/search_fr.properties
===================================================================
--- src/web/locale/org/nutch/jsp/search_fr.properties	(revision 959954)
+++ src/web/locale/org/nutch/jsp/search_fr.properties	(working copy)
@@ -1,9 +0,0 @@
-﻿title = Résultats de recherche
-search = Recherche
-hits = Résultats <b>{0}-{1}</b> (sur un total de {2}):
-cached = En cache
-explain = Explications
-anchors = Ancres
-next = Suivant
-clustering = clustering
-viewAsText = View as Plain Text
Index: src/web/locale/org/nutch/jsp/text_de.properties
===================================================================
--- src/web/locale/org/nutch/jsp/text_de.properties	(revision 959954)
+++ src/web/locale/org/nutch/jsp/text_de.properties	(working copy)
@@ -1,3 +0,0 @@
-title = Nutch Text Cache
-note = Dies ist die Text-Version von: <a href="{0}">{0}</a>.
-noText = <i>Leider ist keine Text-Version verf&uuml;gbar.</i>
Index: src/web/locale/org/nutch/jsp/anchors.properties
===================================================================
--- src/web/locale/org/nutch/jsp/anchors.properties	(revision 959954)
+++ src/web/locale/org/nutch/jsp/anchors.properties	(working copy)
@@ -1,3 +0,0 @@
-title = anchors
-anchors = incoming anchor text:
-page = page: <a href="{0}">{0}</a>
Index: src/web/locale/org/nutch/jsp/search_nl.properties
===================================================================
--- src/web/locale/org/nutch/jsp/search_nl.properties	(revision 959954)
+++ src/web/locale/org/nutch/jsp/search_nl.properties	(working copy)
@@ -1,9 +0,0 @@
-title = zoekresultaten
-search = Zoek
-hits = Treffers <b>{0}-{1}</b> (van in totaal {2} gevonden documenten):
-cached = opgeslagen
-explain = uitleg
-anchors = verwijzers
-next = Volgende
-clustering = clustering
-viewAsText = View as Plain Text
Index: src/web/locale/org/nutch/jsp/explain.properties
===================================================================
--- src/web/locale/org/nutch/jsp/explain.properties	(revision 959954)
+++ src/web/locale/org/nutch/jsp/explain.properties	(working copy)
@@ -1,3 +0,0 @@
-title = score explanation
-page = page
-scoreForQuery = score for query: <tt>{0}</tt>
Index: src/web/locale/org/nutch/jsp/search_pl.properties
===================================================================
--- src/web/locale/org/nutch/jsp/search_pl.properties	(revision 959954)
+++ src/web/locale/org/nutch/jsp/search_pl.properties	(working copy)
@@ -1,11 +0,0 @@
-title = wyniki wyszukiwania
-search = Szukaj
-hits = Wyniki <b>{0}-{1}</b> (z oko\u0142o {2} pasuj\u0105cych dokument\u00f3w):
-cached = kopia
-explain = wyja\u015bnij
-anchors = odno\u015bniki
-next = Nast\u0119pna strona
-moreFrom = Wi\u0119cej z
-showAllHits = Poka\u017c wszystkie wyniki
-clustering = grupowanie
-viewAsText = View as Plain Text
Index: src/web/locale/org/nutch/jsp/search_th.properties
===================================================================
--- src/web/locale/org/nutch/jsp/search_th.properties	(revision 959954)
+++ src/web/locale/org/nutch/jsp/search_th.properties	(working copy)
@@ -1,9 +0,0 @@
-title = \u0e1c\u0e25\u0e01\u0e32\u0e23\u0e04\u0e49\u0e19\u0e2b\u0e32
-search = \u0e04\u0e49\u0e19\u0e2b\u0e32
-hits = \u0e1e\u0e1a <b>{0}-{1}</b> (\u0e08\u0e32\u0e01\u0e17\u0e31\u0e49\u0e07\u0e2b\u0e21\u0e14 {2} \u0e2b\u0e19\u0e49\u0e32):
-cached = \u0e2b\u0e19\u0e49\u0e32\u0e17\u0e35\u0e48\u0e40\u0e01\u0e47\u0e1a\u0e44\u0e27\u0e49
-explain = \u0e04\u0e33\u0e2d\u0e18\u0e34\u0e1a\u0e32\u0e22
-anchors = \u0e41\u0e2d\u0e07\u0e40\u0e04\u0e2d\u0e23\u0e4c
-next = \u0e2b\u0e19\u0e49\u0e32\u0e15\u0e48\u0e2d\u0e44\u0e1b
-clustering = clustering
-viewAsText = View as Plain Text
Index: src/web/locale/org/nutch/jsp/search_hu.properties
===================================================================
--- src/web/locale/org/nutch/jsp/search_hu.properties	(revision 959954)
+++ src/web/locale/org/nutch/jsp/search_hu.properties	(working copy)
@@ -1,9 +0,0 @@
-title = keress eredmnye
-search = Keress
-hits = <b>{0}-{1}</b>. tallat a {2} kzl:
-cached = Trolt
-explain = Magyarzat
-anchors = Kapcsolds
-next = Kvetkez
-clustering = clustering
-viewAsText = View as Plain Text
Index: src/web/locale/org/nutch/jsp/cached_pl.properties
===================================================================
--- src/web/locale/org/nutch/jsp/cached_pl.properties	(revision 959954)
+++ src/web/locale/org/nutch/jsp/cached_pl.properties	(working copy)
@@ -1,3 +0,0 @@
-title = kopia nutch-a
-page = strona: <a href="{0}">{0}</a>
-noContent = <i>Przepraszamy, brak kopii tej strony.</i>
Index: src/web/locale/org/nutch/jsp/search_ms.properties
===================================================================
--- src/web/locale/org/nutch/jsp/search_ms.properties	(revision 959954)
+++ src/web/locale/org/nutch/jsp/search_ms.properties	(working copy)
@@ -1,9 +0,0 @@
-title = hasil carian
-carian = Carian
-bilangan hits = Bilangan Hits &lt;b&gt;{0}-{1}&lt;/b&gt; (daripada {2} jumlah dokumen bersesuaian):
-tampungan = tampungan
-penerangan = penerangan
-penunjuk = penunjuk
-selepas = Selepas
-clustering = clustering
-viewAsText = View as Plain Text
Index: src/web/locale/org/nutch/jsp/search.properties
===================================================================
--- src/web/locale/org/nutch/jsp/search.properties	(revision 959954)
+++ src/web/locale/org/nutch/jsp/search.properties	(working copy)
@@ -1,11 +0,0 @@
-title = search results
-search = Search
-hits = Hits <b>{0}-{1}</b> (out of about {2} total matching pages):
-cached = cached
-explain = explain
-anchors = anchors
-next = next page
-moreFrom = more from
-showAllHits = show all hits
-clustering = clustering
-viewAsText = View as Plain Text
Index: src/web/locale/org/nutch/jsp/anchors_de.properties
===================================================================
--- src/web/locale/org/nutch/jsp/anchors_de.properties	(revision 959954)
+++ src/web/locale/org/nutch/jsp/anchors_de.properties	(working copy)
@@ -1,3 +0,0 @@
-title = Verweise 
-anchors = Herzeigende Link-Texte:
-page = Seite: <a href="{0}">{0}</a>
Index: src/web/locale/org/nutch/jsp/search_zh.properties
===================================================================
--- src/web/locale/org/nutch/jsp/search_zh.properties	(revision 959954)
+++ src/web/locale/org/nutch/jsp/search_zh.properties	(working copy)
@@ -1,9 +0,0 @@
-title = \u67e5\u8be2\u7ed3\u679c
-search = \u641c\u7d22
-hits =  \u7b2c<b>{0}-{1}</b>\u9879 (\u5171\u6709 {2} \u9879\u67e5\u8be2\u7ed3\u679c):
-cached = \u7f51\u9875\u5feb\u7167
-explain = \u8bc4\u5206\u8be6\u89e3
-anchors = anchors
-next = \u4e0b\u4e00\u9875
-clustering = clustering
-viewAsText = View as Plain Text
Index: src/web/locale/org/nutch/jsp/text_en.properties
===================================================================
--- src/web/locale/org/nutch/jsp/text_en.properties	(revision 959954)
+++ src/web/locale/org/nutch/jsp/text_en.properties	(working copy)
@@ -1,3 +0,0 @@
-title = Nutch Plain Text Cache
-note = This is the plain text version of the file: <a href="{0}">{0}</a>.
-noText = <i>Sorry, no plain text version is available.</i>
Index: src/web/locale/org/nutch/jsp/cached.properties
===================================================================
--- src/web/locale/org/nutch/jsp/cached.properties	(revision 959954)
+++ src/web/locale/org/nutch/jsp/cached.properties	(working copy)
@@ -1,3 +0,0 @@
-title = nutch cache
-page = page: <a href="{0}">{0}</a>
-noContent = <i>Sorry, no content is cached for this page.</i>
Index: src/web/locale/org/nutch/jsp/search_ca.properties
===================================================================
--- src/web/locale/org/nutch/jsp/search_ca.properties	(revision 959954)
+++ src/web/locale/org/nutch/jsp/search_ca.properties	(working copy)
@@ -1,9 +0,0 @@
-title = resultats de cerca
-search = Cerca
-hits = Coincidncies <b>{0}-{1}</b> (d'un total de {2} documents coincidents):
-cached = en memria
-explain = explicaci
-anchors = ncores
-next = Segent
-clustering = clustering
-viewAsText = View as Plain Text
Index: src/web/locale/org/nutch/jsp/explain_de.properties
===================================================================
--- src/web/locale/org/nutch/jsp/explain_de.properties	(revision 959954)
+++ src/web/locale/org/nutch/jsp/explain_de.properties	(working copy)
@@ -1,3 +0,0 @@
-title = Seiteninformationen &amp; Rankingberechnung
-page = Gespeicherte Seiteninformationen
-scoreForQuery = Rankingberechnung f&uuml;r die Anfrage: <tt>{0}</tt>
Index: src/web/locale/org/nutch/jsp/search_pt.properties
===================================================================
--- src/web/locale/org/nutch/jsp/search_pt.properties	(revision 959954)
+++ src/web/locale/org/nutch/jsp/search_pt.properties	(working copy)
@@ -1,9 +0,0 @@
-title = resultados da pesquisa
-search = Localizar
-hits = Resultados <b>{0}-{1}</b> (de um total de {2} documentos):
-cached = arquivado
-explain = explicao
-anchors = semelhantes
-next = Prximo
-clustering = clustering
-viewAsText = View as Plain Text
Index: src/web/locale/org/nutch/jsp/search_de.properties
===================================================================
--- src/web/locale/org/nutch/jsp/search_de.properties	(revision 959954)
+++ src/web/locale/org/nutch/jsp/search_de.properties	(working copy)
@@ -1,11 +0,0 @@
-title = Suchergebnisse
-search = Suche
-hits = Treffer <b>{0}-{1}</b> (von insgesammt {2} gefundenen Seiten):
-cached = Im Cache
-explain = Erkl&auml;rung
-anchors = Referenzen
-next = Weiter
-clustering = Clustern
-viewAsText = HTML-Version
-moreFrom = Mehr von
-showAllHits = Alle Treffer anzeigen
Index: src/web/locale/org/nutch/jsp/search_sv.properties
===================================================================
--- src/web/locale/org/nutch/jsp/search_sv.properties	(revision 959954)
+++ src/web/locale/org/nutch/jsp/search_sv.properties	(working copy)
@@ -1,11 +0,0 @@
-title = skresultat
-search = Sk
-hits = Trffar <b>{0}-{1}</b> (av ungefr totalt {2} matchande sidor):
-cached = cashad
-explain = frklara
-anchors = ankare
-next = nsta sida
-moreFrom = mer frn
-showAllHits = visa alla trffar
-clustering = klustring
-viewAsText = View as Plain Text
Index: src/web/locale/org/nutch/jsp/cached_de.properties
===================================================================
--- src/web/locale/org/nutch/jsp/cached_de.properties	(revision 959954)
+++ src/web/locale/org/nutch/jsp/cached_de.properties	(working copy)
@@ -1,3 +0,0 @@
-title = Nutch Cache
-page = Seite: <a href="{0}">{0}</a>
-noContent = <i>Leider ist diese Seite nicht im Cache gespeichert.</i>
Index: src/web/locale/org/nutch/jsp/anchors_en.properties
===================================================================
--- src/web/locale/org/nutch/jsp/anchors_en.properties	(revision 959954)
+++ src/web/locale/org/nutch/jsp/anchors_en.properties	(working copy)
@@ -1,3 +0,0 @@
-title = anchors
-anchors = incoming anchor text:
-page = page: <a href="{0}">{0}</a>
Index: src/web/locale/org/nutch/jsp/explain_en.properties
===================================================================
--- src/web/locale/org/nutch/jsp/explain_en.properties	(revision 959954)
+++ src/web/locale/org/nutch/jsp/explain_en.properties	(working copy)
@@ -1,3 +0,0 @@
-title = score explanation
-page = page
-scoreForQuery = score for query: <tt>{0}</tt>
Index: src/web/locale/org/nutch/jsp/search_fi.properties
===================================================================
--- src/web/locale/org/nutch/jsp/search_fi.properties	(revision 959954)
+++ src/web/locale/org/nutch/jsp/search_fi.properties	(working copy)
@@ -1,11 +0,0 @@
-title = Hakutulokset
-search = Hae
-hits = Tulokset <b>{0}-{1}</b> (kaikkiaan {2} sopivasta dokumentista):
-cached = v\u00E4limuistissa
-explain = selit\u00E4
-anchors = linkit
-next = Seuraava
-clustering = ryv\u00e4stys
-moreFrom = lis\u00e4\u00e4 osumia kohteesta
-showAllHits = n\u00e4yt\u00e4 kaikki osumat
-viewAsText = View as Plain Text
Index: src/web/pages/ms/search.xml
===================================================================
--- src/web/pages/ms/search.xml	(revision 959954)
+++ src/web/pages/ms/search.xml	(working copy)
@@ -1,11 +0,0 @@
-<page>
-<body>
-<center>
-<form name="search" action="../search.jsp" method="get">
-  <input type="hidden" name="lang" value="ms"/>
-  <input name="query" size="44"/>&#160;<input type="submit" value="Search"/>
-  <a href="help.html">Penunjuk</a>
-</form>
-</center>
-</body>
-</page>
Index: src/web/pages/ms/help.xml
===================================================================
--- src/web/pages/ms/help.xml	(revision 959954)
+++ src/web/pages/ms/help.xml	(working copy)
@@ -1,39 +0,0 @@
-<page>
-
-<title>Petunjuk Carian</title>
-
-<body>
-
-<h3>Kueri</h3>
-Untuk mencari dengan Nutch, taip beberapa katakunci.
-<ul>
-   <li>Halaman-halaman hasil pencarian hanya mengandungi <span
- style="font-style: italic;">semua</span> perkataan yang dicari.</li>
-   <li>Gunakan tanda " " bagi perkataan yang wujud seiring, sebagai satu frasa
- seperti. <span style="font-weight: bold;">"New Zealand"</span>.</li>
-   <li>Ruang kosong diantara perkataan juga akan mencetuskan pencarian frasa. Jadi, 
- mencari <span style="font-weight: bold;">http://www.apache.org/</span>
- adalah sama seperti mencari<span style="font-weight: bold;">"http www
- apache org"</span>.</li>
-   <li>Pencarian Nutch adalah tidak kes sensitif, jadi mencari <span
- style="font-weight: bold;">NuTcH</span> adalah sama seperti mencari <span
- style="font-weight: bold;">nUtCh</span>.</li>
-   <li>Anda boleh mengelakkan pencarian untuk sesuatu perkataan dengan 
- menggunakan tanda sempang sebelum perkataan tersebut. Jadi, mencari <span style="font-weight: bold;">football
--nfl</span>, anda akan menerima halaman-halaman hasil pencarian mengenai bolasepak, tanpa
-perkataan "nfl".</li>
-  <li>Itu sahaja!</li>
-</ul>
-<h3>Hasil Pencarian</h3>
-Setiap halaman hasil pencarian akan mengandungi link berikut:
-<ul>
-  <li>(<span style="color: rgb(51, 51, 255);">tampungan</span>) menampilkan 
-versi halaman yang Nutch perolehi.</li>
-  <li>(<span style="color: rgb(51, 51, 255);">penerangan</span>) menampilkan 
-deskripsi mengenai tahap kesesuaian halaman tersebut.</li>
-  <li>(<span style="color: rgb(51, 51, 255);">penunjuk</span>) menunjukkan 
-  senarai penunjuk tuju-masuk yang diindekkan untuk halaman tersebut.</li>
-</ul>
-
-</body>
-</page>
Index: src/web/pages/ms/about.xml
===================================================================
--- src/web/pages/ms/about.xml	(revision 959954)
+++ src/web/pages/ms/about.xml	(working copy)
@@ -1,38 +0,0 @@
-<page>
-
-<title>perihal</title>
-
-<menu>
- <item><a href="http://lucene.apache.org/nutch/credits.html">Penghargaan</a></item>
- <item><a href="http://lucene.apache.org/nutch/index.html">Status</a></item>
-</menu>
-
-<body>
-
-<p>Nutch merupakan inisiatif baru untuk membangunkan enjin carian web berasaskan prinsip sumber terbuka.</p>
-
-<p>Carian web merupakan keperluan asas untuk melayari internet, tetapi bilangan enjin pencari semakin berkurangan. Oligopoli pada hari ini akan menjadi monopoli, 
-dengan sebuah organisasi mengawal hampir keseluruhan carian web untuk keuntungan komersil. Ini tidak membawa sebarang manfaat kepada pengguna internet.</p>
-
-<p>Nutch adalah satu alternatif transparen kepada enjin-enjin carian komersil. Hanya hasil pencarian yang berasaskan konsep sumber terbuka boleh dipercayai 
-sepenuhnya tanpa berat sebelah (sekurang-kurangnya, kewujudan berat sebelah adalah umum). Kesemua enjin-enjin carian semasa memiliki formula ranking khusus, 
-dan pemilik-pemilik enjin carian tersebut tidak akan mendedahkan sebab-sebab kenapa sesuatu hasil pencarian berada pada ranking yang dipaparkan. Sesetengah 
-enjin-enjin carian mengindekkan laman-laman web berasaskan bayaran dan menghiraukan merit laman-laman web tersebut. Disebaliknya, Nutch, dengan tanpa selindung 
-dan tidak memiliki motif untuk berat sebelah dalam pencariannya, akan senantiasa berusaha memberi hasil-hasil pencarian yang terbaik kepada pengguna.</p>
-
-<p>Tujuan Nutch adalah untuk memberi kesemua masyarakat internet satu enjin carian web yang bertaraf antarabangsa, senang dan kos efektif untuk diimplementasikan. 
-Ini merupakan cabaran yang besar. Untuk berjaya, perisian Nutch mesti berupaya:</p>
-
-<ul>
-  <li>mengaut beberapa bilion laman-laman web dalam satu bulan</li>
-  <li>menyengara indek bagi laman-laman web tersebut</li>
-  <li>berupaya mencari indek tersebut pada kelajuan 1000 kali sesaat</li>
-  <li>memberi hasil-hasil pencarian yang berkualiti</li>
-  <li>beroperasi dengan kos yang minima</li>
-</ul>
-
-<p>Ini adalah proposisi yang mencabar. Jikalau anda percaya dengan merit projek ini, anda dipelawa untuk menghulurkan bantuan, samada sebagai 
-<a href="http://www.apache.org/dev/">pembangun perisian</a>atau <a href="http://www.apache.org/foundation/contributing.html">menderma</a>.</p> 
-
-</body>
-</page>
Index: src/web/pages/zh/search.xml
===================================================================
--- src/web/pages/zh/search.xml	(revision 959954)
+++ src/web/pages/zh/search.xml	(working copy)
@@ -1,11 +0,0 @@
-<page>
-<body>
-<!--This page is translated by Shanfeng Zhu from Engiish to simplified Chinese!-->
-<center>
-<form name="search" action="../search.jsp" method="get"> <input
- name="query" size="44"/>&#160;<input type="submit" value="搜索"/>
-<a href="help.html">帮助</a>
-</form>
-</center>
-</body>
-</page>
Index: src/web/pages/zh/help.xml
===================================================================
--- src/web/pages/zh/help.xml	(revision 959954)
+++ src/web/pages/zh/help.xml	(working copy)
@@ -1,31 +0,0 @@
-<page>
-
-<title>搜索帮助</title>
-
-<body>
-<!--This page is translated by Shanfeng Zhu from Engiish to simplified Chinese!-->
-<h3>查询</h3>
-使用Nutch搜索, 只需键入一些关键词语.
-<ul>
-  <li>搜索结果只包括含有 <span style="font-style: italic;">所有</span>查询词的网页.</li>
-  <li>使用双引号将相邻的词作为短语包含起来, 例如, <span style="font-weight: bold;">"New Zealand"</span>.</li>
-  <li>英文单词之间的符号会触发短语匹配. 因此搜索<span style="font-weight: bold;">http://www.apache.org/</span>
-等同于搜索<span
-style="font-weight: bold;">"http www apache org"</span>.</li>
-  <li>搜索英文单词不区分大小写, 因此搜索<span
- style="font-weight: bold;">NuTcH</span> 等同于搜索 <span
- style="font-weight: bold;">nUtCh</span>.</li>
-  <li>你可以在一个词前面加减号丛而禁止它出现在搜索结果中, 例如, 搜索<span style="font-weight: bold;">football
--nfl</span> 会找到讨论football, 但不出现"nfl"的网页.</li>
-</ul>
-<h3>结果</h3>
-在搜索结果中每个匹配的页面都含有如下链结:
-
-<ul>
-  <li>(<span style="color: rgb(51, 51, 255);">网页快照</span>) 显示Nutch下载该网页时的内容.</li>
-  <li>(<span style="color: rgb(51, 51, 255);">评分详解</span>)显示Nutch如何给该网页打分.</li>
-  <li>(<span style="color: rgb(51, 51, 255);">anchors</span>)显示指向该网页而被Nutch索引的anchor文本.</li>
-</ul>
-
-</body>
-</page>
Index: src/web/pages/zh/about.xml
===================================================================
--- src/web/pages/zh/about.xml	(revision 959954)
+++ src/web/pages/zh/about.xml	(working copy)
@@ -1,47 +0,0 @@
-<page>
-
-
-<title>简介</title>
-
-<menu>
- <item><a href="http://lucene.apache.org/nutch/credits.html">参与人员</a></item>
- <item><a href="http://lucene.apache.org/nutch/index.html">当前状态</a></item>
-</menu>
-
-<!--This page is translated by Shanfeng Zhu from Engiish to simplified Chinese!-->
-
-<body>
-
-
-<p>Nutch 是一个刚刚诞生开放源代码(open-source)的web搜索引擎.</p>
-
-
-<p> 尽管Web搜索是漫游Internet的基本要求, 但是现有web搜索引擎的数目却在下降.
-并且这很有可能进一步演变成为一个公司垄断了几乎所有的web搜索为其谋取商业利益.这显然
-不利于广大Internet用户.</p>
-
-<p> Nutch为我们提供了这样一个不同的选择. 相对于那些商用的搜索引擎, Nutch作为开放源代码
-搜索引擎将会更加透明, 从而更值得大家信赖. 现在所有主要的搜索引擎都采用私有的排序算法,
-而不会解释为什么一个网页会排在一个特定的位置. 除此之外, 有的搜索引擎依照网站所付的
-费用, 而不是根据它们本身的价值进行排序. 与它们不同, Nucth没有什么需要隐瞒, 也没有
-动机去扭曲搜索的结果. Nutch将尽自己最大的努力为用户提供最好的搜索结果.</p>
-
-
-<p>Nutch 致力于让每个人能很容易, 同时花费很少就可以配置世界一流的Web搜索引擎. 
-为了完成这一宏伟的目标, Nutch必须能够做到:</p>
-
-<ul>
-  <li>每个月取几十亿网页</li>
-  <li>为这些网页维护一个索引</li>
-  <li>对索引文件进行每秒上千次的搜索</li>
-  <li>提供高质量的搜索结果</li>
-  <li>以最小的成本运作</li>
-</ul>
-
-<p> 这是一个巨大的挑战. 如果你相信它的价值, 请伸出你的援手. 你可以
-参与<a
-href="http://www.apache.org/dev/">开发</a>, 或是提供<a
-href="http://www.apache.org/foundation/contributing.html">捐赠</a>. </p>
-  
-</body>
-</page>
Index: src/web/pages/pt/search.xml
===================================================================
--- src/web/pages/pt/search.xml	(revision 959954)
+++ src/web/pages/pt/search.xml	(working copy)
@@ -1,13 +0,0 @@
-<?xml version="1.0" encoding="ISO-8859-1"?>
-
-<page>
-<body>
-<center>
-<form name="search" action="../search.jsp" method="get">
-  <input type="hidden" name="lang" value="pt"/>
-  <input name="query" size="44"/><input type="submit" value="Search"/>
-  <a href="help.html">ajuda</a>
-</form>
-</center>
-</body>
-</page>
Index: src/web/pages/pt/help.xml
===================================================================
--- src/web/pages/pt/help.xml	(revision 959954)
+++ src/web/pages/pt/help.xml	(working copy)
@@ -1,38 +0,0 @@
-<?xml version="1.0" encoding="ISO-8859-1"?>
-
-<page>
-
-<title>ajuda</title>
-
-<body>
-
-<h3>Pesquisas</h3>
-Para pesquisar com Nutch, apenas digite algumas poucas palavras.
-<ul>
-  <li>Resultados incluem apenas pginas que contm <span style="font-style: italic;">todas</span> as palavras da pesquisa.</li>
-  <li>Use aspas em volta das palavras que devem aparecer umas ao lado da outra, como uma frase,
-por exemplo, <span style="font-weight: bold;">"Nova Zelndia"</span>.</li>
-  <li>Pontos entre palavras tambm geram resultados baseados em frases.  Logo,
-pesquisar por <span style="font-weight: bold;">http://www.apache.org/</span>
- o mesmo que pesquisar por <span style="font-weight: bold;">"http www
-apache org"</span>.</li>
-  <li>Pesquisar no so sensveis  letras maisculas ou minsculas, ento, pesquisar por <span style="font-weight: bold;">NuTcH</span>  o mesmo que pesquisar por <span style="font-weight: bold;">nUtCh</span>.</li>
-  <li>Voc pode excluir termos da sua pesquisar colocando o sinal de menos
-antes dele, por exemplo, pesquisar por <span style="font-weight: bold;">futebol
--cbf</span> vai localizar pginas que falam sobre futebol mas no usam o termo
- "cbf".</li>
-  <li> isso a!</li>
-</ul>
-<h3>Resultados</h3>
-Cada pgina localizada contm os seguintes links:
-<ul>
-  <li>(<span style="color: rgb(51, 51, 255);">arquivado</span>) apresenta
-a verso da pgina que est no arquivo do Nutch.</li>
-  <li>(<span style="color: rgb(51, 51, 255);">explicao</span>) apresenta
-uma explicao sobre como esta pgina foi pontuada.</li>
-  <li>(<span style="color: rgb(51, 51, 255);">semelhantes</span>) apresenta
-uma lista dos resultados que tm links para esta pgina.</li>
-</ul>
-
-</body>
-</page>
Index: src/web/pages/pt/about.xml
===================================================================
--- src/web/pages/pt/about.xml	(revision 959954)
+++ src/web/pages/pt/about.xml	(working copy)
@@ -1,56 +0,0 @@
-<?xml version="1.0" encoding="ISO-8859-1"?>
-
-<page>
-
-<title>sobre</title>
-
-<menu>
- <item><a href="http://lucene.apache.org/nutch/credits.html">Crditos</a></item>
- <item><a href="http://lucene.apache.org/nutch/index.html">Status</a></item>
-</menu>
-
-<body>
-
-<p>Nutch  o incio de um esforo para implementar um sistema de
-localizao na web com cdigo-fonte aberto.</p>
-
-<p>Apesar do nmero de sistemas de localizao na web estarem diminundo,
-a localiazo na web  um requisito bsico para a navegao na internet.
-Os atuais oligoplios podero, em breve, vir a tornar monoplios, tendo
-uma nica empresa controlando praticamente todo o sistema de localiazo
-na web, visando seus interesses comerciais. Isso no seria bom para os
-usurios da internet.</p>
-
-<p>Nutch oferece uma alternativa transparente aos sistemas comerciais de
-localizao na web. Somente os resultados gerados por sistemas de
-localizao feitos com cdigo-fonte aberto podem ser inteiramente
-confiveis quanto a no serem direcionados (ou, ao menos, sua
-orientao  pblica). Todos os principais sistemas de localizao
-existentes tm frmulas de ranking prprias e no vo explicar porque
-foi dado um ranking a um determinado resultado. Alm disso, alguns
-sistemas de localizao determinam em que locais posicionar os resultados
-baseados mais em pagamentos do que nos mritos deles mesmos. Ao contrrio
-disso, Nutch no tem nada a esconder e nenhum motivo para direcionar seus
-resultados de nenhuma maneira exceto o de tentar dar a cada usurio os
-melhores resultados possveis.</p>
-
-<p>Nutch espera permitir que qualquer possa desenvolver um sistema de
-localiazo na web com tecnologia de nvel mundial, de maneira fcil e
-com custos reduzidos. Isso  um grande desafio. Para ter sucesso, o
-software Nutch tem que ser capaz de:</p>
-
-<ul>
-  <li>localizar as muitas bilhes de pginas existentes pro mes</li>
-  <li>manter um ndice dessas pginas</li>
-  <li>pesquisar esse ndice mais de 1000 vezes por segundo</li>
-  <li>prover resultados de alta qualidade</li>
-  <li>operar com o menor custo possvel</li>
-</ul>
-
-<p>Esta  uma prosta desafiadora.  Se voc acredita nos mritos desse
-projeto, por favor ajude, ou como um
-<a href="http://www.apache.org/dev/">desenvolvedor</a> ou com uma
-<a href="http://www.apache.org/foundation/contributing.html">doao</a>.</p>
-
-</body>
-</page>
Index: src/web/pages/ca/search.xml
===================================================================
--- src/web/pages/ca/search.xml	(revision 959954)
+++ src/web/pages/ca/search.xml	(working copy)
@@ -1,12 +0,0 @@
-<page>
-<body>
-<center>
-<form name="search" action="../search.jsp" method="get">
-  <input type="hidden" name="lang" value="ca"/>
-  <input name="query" size="44"/>&#160;<input type="submit" value="Cerca"/>
-  <a href="help.html">Ajuda</a>
-</form>
-</center>
-</body>
-</page>
-
Index: src/web/pages/ca/help.xml
===================================================================
--- src/web/pages/ca/help.xml	(revision 959954)
+++ src/web/pages/ca/help.xml	(working copy)
@@ -1,34 +0,0 @@
-<?xml version="1.0" encoding="ISO-8859-1"?>
-<page>
-
-	<title>Ajuda per la cerca</title>
-
-<body>
-
-	<h3>Consultes</h3>
-	
-	Per cercar amb Nutch, simplement escriu algunes paraules.
-<ul>
-	<li>Els resultats noms inclouen pgines que contenen <span
- style="font-style: italic;">totes</span> les paraules de la consulta.</li>
-  <li>Fes servir cometes per indicar les paraules que han d'aparixer, forosament, una al costat de l'altra, com una frase,
-p.e., <span style="font-weight: bold;">"pilar de sis"</span>.</li>
-  <li>La puntuaci entre paraules tamb provoca que es cerqui com una frase. Aix, cercar <span style="font-weight: bold;">http://www.apache.org/</span> s el mateix que cercar <span style="font-weight: bold;">"http www
-apache org"</span>.</li>
-  <li>Les cerques no diferencien entre minscules i majscules, d'aquesta manera, cercar <span
- style="font-weight: bold;">NuTcH</span> s el mateix que cercar <span
- style="font-weight: bold;">nUtCh</span>.</li>
-  <li>Es pot prohibir una paraula de les pgines que resulten de la cerca posant un signe menys davant seu, p.e., cercar <span style="font-weight: bold;">futbol -lliga</span> buscar pgines que parlin de futbol per que no emprin la paraula "lliga".</li>
-  <li>Aix s tot!</li>
-</ul>
-<h3>Resultats</h3>
-Cada plana coincident resultat de la cerca t els segents enllaos:
-<ul>
-	<li>(<span style="color: rgb(51, 51, 255);">en memria</span>) mostra la versi de la plana que s'ha baixat Nutch.</li>
-	<li>(<span style="color: rgb(51, 51, 255);">explicaci</span>) mostra una explicaci de com s'ha puntuat aquesta plana</li>
-  <li>(<span style="color: rgb(51, 51, 255);">ncores</span>) mostra la llista d'enllaos d'entrada indexats per aquesta pgina.</li>
-</ul>
-
-</body>
-</page>
-
Index: src/web/pages/ca/about.xml
===================================================================
--- src/web/pages/ca/about.xml	(revision 959954)
+++ src/web/pages/ca/about.xml	(working copy)
@@ -1,33 +0,0 @@
-<?xml version="1.0" encoding="ISO-8859-1"?>
-<page>
-
-	<title>Quan a</title>
-
-<menu>
- <item><a href="http://lucene.apache.org/nutch/credits.html">Credits</a></item>
- <item><a href="http://lucene.apache.org/nutch/index.html">Status</a></item>
-</menu>
-
-<body>
-
-<p>Nutch s un nou projecte per implementar un cercador Web amb codi obert.</p>
-
-<p>La cerca al Web s un requeriment bsic per la navegaci a Internet, malgrat aquest fet, el nombre de cercadors disminueix dia a dia. L'oligopoli actual es pot convertir, aviat, en un monopoli amb una nica companyia controlant gariab totes les cerques que es facin al Web pels seus interessos comercials.</p>
-
-<p>Nutch proporciona una alternativa transparent als motors de cerca comercials. Noms els resultats de cerca obtinguts a partir d'un sistema fet amb codi obert es poden considerar realment fiables i no esbiaxats. Tots els grans cercadors comercials tenen frmules de rnquing propietries, i no explicaran mai perqu una pgina en particular surt en una determinada posici del rnquing. A ms, alguns cercadors determinien quins llocs Web indexar en funci de pagaments i no pas en funci dels mrits intrnsecs del lloc Web. Nutch, per la seva banda, no t res a amagar ni cap motiu per prioritzar esbiaxadament els seus resultats o el seu crawler, i noms busca proporcionar a cada usuari els millors resultats de cerca possibles.</p>
-
-<p>L'objectiu de Nutch s permetre a qualsevol persona muntar un cercador Web de primera classe de forma barata i efectiva. Per aconseguir-ho, el programari Nutch ha de capa de:</p>
-<ul>
-	<li>Recollir diversos milers de millions de pgines Web mensualment.</li>
-	<li>Mantenir un ndex d'aquestes pgines.</li>
-	<li>Cercar en aquest ndex fins a 1000 vegades per segon.</li>
-	<li>Proporcionar resultats de cerca d'alta qualitat.</li>
-	<li>Operar amb un cost mnim.</li>
-</ul>
-
-<p>Aquesta proposta s sens dubte, un repte. Si creus en els mrits d'aquest projecte, si us plau, ajuda-n's, tant com a <a href="http://www.apache.org/dev/">desenvolupador</a> o amb una <a href="http://www.apache.org/foundation/contributing.html">donaci</a>
-.</p>
-
-</body>
-</page>
-
Index: src/web/pages/sr/search.xml
===================================================================
--- src/web/pages/sr/search.xml	(revision 959954)
+++ src/web/pages/sr/search.xml	(working copy)
@@ -1,16 +0,0 @@
-<page>
-<body>
-<center>
-<form name="search" action="../search.jsp" method="get">
-  <input type="hidden" name="lang" value="sr"/>
-  <input name="query" size="44"/>&#160;<input type="submit" value="Претражи"/>
-  <a href="help.html">помоћ</a>
-</form>
-</center>
-<div style="text-align: right;">
-<a href="http://lucene.apache.org/nutch/index.html">
-<img border="0" src="../img/poweredbynutch_01.gif"/>
-</a>
-</div>
-</body>
-</page>
Index: src/web/pages/sr/help.xml
===================================================================
--- src/web/pages/sr/help.xml	(revision 959954)
+++ src/web/pages/sr/help.xml	(working copy)
@@ -1,37 +0,0 @@
-<page>
-
-<title>помоћ</title>
-
-<body>
-
-<h3>Упити</h3>
-Да бисте претраживали помоћу Nutch-а, само укуцајте неколико речи.
-<ul>
-  <li>Резултати представљају само оне странице које садрже <span
- style="font-style: italic;">све</span> речи из упита.</li>
-  <li>Користите наводнике око речи које се морају појављивати заједно, као фраза,
-нпр., <span style="font-weight: bold;">"Нови сад"</span>.</li>
-  <li>Интерпункција између речи такође изазива претраживање по фразама. Тако да је
-претраживање по <span style="font-weight: bold;">http://www.apache.org/</span>
-идентично као претраживање по <span style="font-weight: bold;">"http www
-apache org"</span>.</li>
-  <li>Претраживање не разликује велика и мала слова у упиту, тако да је претраживање по <span
- style="font-weight: bold;">NuTcH</span> идентично као претраживање по <span
- style="font-weight: bold;">nUtCh</span>.</li>
-  <li>Да бисте искључили из резултата странице које садрже неки термин ставите знак минус испред тог термина,
- нпр., претраживање по <span style="font-weight: bold;">фудбал
--нфл</span> ће пронаћи странице које за тему имају фудбал, али не користе реч "нфл".</li>
-  <li>То је то!</li>
-</ul>
-<h3>Резултати</h3>
-Свака страница у резултатима има следеће линкове:
-<ul>
-  <li>(<span style="color: rgb(51, 51, 255);">кеширано</span>) приказује верзију странице
-коју је Nutch дохватио.</li>
-  <li>(<span style="color: rgb(51, 51, 255);">објасни</span>) приказује објашњење
- како је ова страница бодована.</li>
-  <li>(<span style="color: rgb(51, 51, 255);">везе</span>) приказује списак одредница коришћених да се направи линк ка овој страни.</li>
-</ul>
-
-</body>
-</page>
Index: src/web/pages/sr/about.xml
===================================================================
--- src/web/pages/sr/about.xml	(revision 959954)
+++ src/web/pages/sr/about.xml	(working copy)
@@ -1,46 +0,0 @@
-﻿<page>
-
-<title>О Nutch-у</title>
-
-<menu>
- <item><a href="http://lucene.apache.org/nutch/credits.html">Заслуге</a></item>
- <item><a href="http://lucene.apache.org/nutch/index.html">Статус</a></item>
-</menu>
-
-<body>
-
-<p>Nutch је подухват имплементирања web претраживача отвореног кода.</p>
-
-<p>Web претраживање је основни предуслов за интернет навигацију, а ипак се
-број web претраживача смањује. Олигопол данашњице могао би ускоро лако постати 
-монопол, са једном компанијом која контролише готово сва претраживања
-ради сопствене финансијске добити. По кориснике интернета то не би било добро.</p>
-
-<p>Nutch нуди транспарентну алтернативу комерцијалним web претраживачима.
-Само претраживачима отвореног кода можемо веровати да су њихови резултати непристрасни.
-(Или је барем њихова пристрасност јавна.) Сви главни постојећи претраживачи 
-имају сопствене формуле рангирања резултата, и не желе објашњавати зашто се одређена страница
-рангира баш тако. Такође, неким web претраживачима могуће је платити како би се утицало на то
-који сајт и на који начин ће бити рангиран у резултатима, уместо да услов за рангирање буде сам сајт.
-Nutch, са друге стране, нема шта да крије и нема мотива да његови резултати или његов crawler буду пристрасни у било ком
-погледу осим да покуша да сваком кориснику понуди најбоље могуће резултате.</p>
-
-<p>Циљ Nutch-а је да омогући свакоме да једноставно и економично развије 
-квалитетан web претраживач. То је велики изазов. Да би успео, Nutch 
-софтвер мора бити спосабан да:</p>
-<ul>
-  <li>дохвата неколико милијарди страница месечно</li>
-  <li>одржава индекс тих страница</li>
-  <li>претражује индекс тих страница и до 1000 пута у секунди</li>
-  <li>обезбеди висококвалитетне резултате претраге</li>
-  <li>има минималне оперативне трошкове</li>
-</ul>
-
-<p>То је изазован подухват. Ако верујеш у суштину овог пројекта, можеш помоћи 
-било својим радом на <a
-href="http://www.apache.org/dev/">развоју</a> било <a
-href="http://www.apache.org/foundation/contributing.html">донацијом.</a>
-</p>
-
-</body>
-</page>
Index: src/web/pages/sv/search.xml
===================================================================
--- src/web/pages/sv/search.xml	(revision 959954)
+++ src/web/pages/sv/search.xml	(working copy)
@@ -1,16 +0,0 @@
-<page>
-<body>
-<center>
-<form name="search" action="../search.jsp" method="get">
-  <input type="hidden" name="lang" value="sv"/>
-  <input name="query" size="44"/>&#160;<input type="submit" value="Search"/>
-  <a href="help.html">hjälp</a>
-</form>
-</center>
-<div style="text-align: right;">
-<a href="http://lucene.apache.org/nutch/index.html">
-<img border="0" src="../img/poweredbynutch_01.gif"/>
-</a>
-</div>
-</body>
-</page>
Index: src/web/pages/sv/help.xml
===================================================================
--- src/web/pages/sv/help.xml	(revision 959954)
+++ src/web/pages/sv/help.xml	(working copy)
@@ -1,35 +0,0 @@
-<page>
-
-<title>sökhjälp</title>
-
-<body>
-
-<h3>Sökningar</h3>
-För att söka med Nutch, mata in några få ord.
-<ul>
-  <li>Resultat inkluderar endast sidor som innehåller <span
- style="font-style: italic;">alla</span> sökord.</li>
-  <li>Använd citattecken runt flera ord som måste ligga bredvid varandra, som en fras, t.ex. <span style="font-weight: bold;">"Nya Zeeland"</span>.</li>
-  <li>Frasmatchning aktiveras också av skiljetecken mellan ord. Så en sökning efter <span style="font-weight: bold;">http://www.apache.org/</span>
-är samma sak som en sökning efter <span style="font-weight: bold;">"http www
-apache org"</span>.</li>
-  <li>I sökningar görs ingen skillnad på stora och små bokstöver, så att söka efter <span
- style="font-weight: bold;">NuTcH</span> är samma sak som att söka efter <span
- style="font-weight: bold;">nUtCh</span>.</li>
-  <li>Du kan förhindra att ett ord eller uttryck kommer med bland sökresultaten genom att sätta ett minus- eller bindestreckstecken framför det. Om du t.ex. söker efter <span style="font-weight: bold;">fotboll
--uefa</span> så kommer sidor som handlar om fotboll att hittas, men kommer inte att innehålla order "uefa".</li>
-  <li>Det var allt!</li>
-</ul>
-<h3>Resultat</h3>
-Varje matchningssida bland resultaten har följande länkar:
-<ul>
-  <li>(<span style="color: rgb(51, 51, 255);">cashad</span>) visar
-versionen av sidan som Nutch laddat ner.</li>
-  <li>(<span style="color: rgb(51, 51, 255);">förklara</span>) visar
-en förklaring till hur den här sidan rankats.</li>
-  <li>(<span style="color: rgb(51, 51, 255);">ankare</span>) visar
-en lista av inkommande ankare indexerade för denna sidan.</li>
-</ul>
-
-</body>
-</page>
Index: src/web/pages/sv/about.xml
===================================================================
--- src/web/pages/sv/about.xml	(revision 959954)
+++ src/web/pages/sv/about.xml	(working copy)
@@ -1,46 +0,0 @@
-<page>
-
-<title>om</title>
-
-<menu>
- <item><a href="http://lucene.apache.org/nutch/credits.html">Erkännanden</a></item>
- <item><a href="http://lucene.apache.org/nutch/index.html">Status</a></item>
-</menu>
-
-<body>
-
-<p>Nutch är ett nytt initiativ att implementera en Open-Source-sökmotor för webben.</p>
-
-<p>Webbsökning är ett grundläggande krav för att kunna navigera på Internet,
-trots det håller antalet sökmotorer på att minska. Dagens oligopol skulle
-snart kunna bli ett monopol, med ett enda företag som kontrollerar nästan all
-webbsökning för dess egna kommersiella vinst. Det skulle inte vara bra för
-Internets användare.</p>
-
-<p>Nutch tillhandahåller ett transparent alternativ till kommersiella sökmotorer.
-Endast Open Source-sökresultat kan litas på fullt ut att vara fördomsfria. (Eller
-att deras fördomar är officiella.) Alla stora existerande sökmotorer har skyddade
-rankingformler och vill inte förklara varför en viss sida rankas som den gör.
-Dessutom bestämmer vissa sökmotorer vilka sidor som ska indexeras beroende på
-betalning hellre än på sidors egna meriter. Nutch, å andra sidan, har inget att
-dölja och har inget motiv till att kasta fördom över dess resultat eller
-indexering på något sätt annat än att försöka ge varje användare de bästa
-sökresultaten som möjligt.</p>
-
-<p>Nutch avser att ge vem som helst möjlighet att enkelt och kostnadseffektivt
-lansera en sökmotor av världsklass. Detta är rejäl utmaning. För att lyckas måste
-Nutch-mjukvara kunna:</p>
-<ul>
-	<li>hämta flera miljarder sidor per månad</li>
-	<li>underhålla ett index på dessa sidor</li>
-	<li>söka detta index upp till 1000 gånger per sekund</li>
-	<li>tillhandahålla mycket högkvalitativa sökresultat</li>
-	<li>drivas till minimal kostnad</li>
-</ul>
-
-<p>Detta är ett utmanande förslag. Om du tror på detta projekts förtjänster,
-hjälp gärna till, antingen som en <a href="http://www.apache.org/dev/">utvecklare</a>
-eller med en <a href="http://www.apache.org/foundation/contributing.html">donation</a></p>
-
-</body>
-</page>
Index: src/web/pages/de/search.xml
===================================================================
--- src/web/pages/de/search.xml	(revision 959954)
+++ src/web/pages/de/search.xml	(working copy)
@@ -1,15 +0,0 @@
-<?xml version="1.0" encoding="ISO-8859-1"?>
-<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
-  "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
-
-<page>
-<body>
-<center>
-<form name="search" action="../search.jsp" method="get">
-  <input type="hidden" name="lang" value="de"/>
-  <input name="query" size="44"/>&#160;<input type="submit" value="Suchen"/>
-  <a href="help.html">Hilfe</a>
-</form>
-</center>
-</body>
-</page>
Index: src/web/pages/de/help.xml
===================================================================
--- src/web/pages/de/help.xml	(revision 959954)
+++ src/web/pages/de/help.xml	(working copy)
@@ -1,41 +0,0 @@
-<?xml version="1.0" encoding="ISO-8859-1"?>
-<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
-  "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
-
-<page>
-
-<title>Hilfe zur Suche</title>
-
-<body>
-
-<h3>Suchanfragen</h3>
-Nutch ben&ouml;tigt nur wenige W&ouml;rter, um Ihnen zufriedenstellende Resultate zu liefern..
-<ul>
-  <li>Die Resultate enthalten nur Seiten, welche <em>alle </em>Suchw&ouml;rter 
-    enthalten.</li>
-  <li>Benutzen Sie Anf&uuml;hrungs und Schlusszeichen um W&ouml;rter, welche als 
-    Satzteil in exact dieser Folge vorkommen m&uuml;ssen. Z.B. <strong>&quot;Thomas 
-    Jefferson&quot;</strong></li>
-  <li>Interpunktionszeichen werden wie Leerzeichen behandelt und trennen Wortgruppen 
-    in einzelne Suchw&ouml;rter.Wenn Sie also nach <strong>http://www.apache.org/</strong> 
-    suchen, entspricht dies einer Suche nach <strong>&quot;http www apache org&quot;</strong>.</li>
-  <li>Die Gross-/Kleinschreibung wird bei der Suchen nicht beachtet.</li>
-  <li>Sie k&ouml;nnen ein Wort explizit aus den Suchresultat ausschliessen indem 
-    Sie ein Minus vor dieses Sezten. Wenn Sie zum Beispiel <strong>&quot;Fussball 
-    -uefa&quot; </strong>eingeben, so wird nach allen Seiten mit dem Thema Fussball 
-    gessucht, welche aber das Wort &quot;uefa&quot; nicht enthalten.</li>
-  <li>Das ist alles!</li>
-</ul>
-<h3>Resultate</h3>
-<p>Jede resultierende Seite hat folgende Links:</p>
-<ul>
-  <li>(<span style="color: rgb(51, 51, 255);">cached</span>) zeigt eine von Nutch 
-    zwischengespeicherte Version der Seite.</li>
-  <li>(<span style="color: rgb(51, 51, 255);">explain</span>) zeigt eine detaillierte 
-    Auflistung der Bewertung dieser Seite.</li>
-  <li>(<span style="color: rgb(51, 51, 255);">anchors</span>) zeigt eine Liste 
-    von Seiten, welche diese Seite referenzieren.</li>
-</ul>
-
-</body>
-</page>
Index: src/web/pages/de/about.xml
===================================================================
--- src/web/pages/de/about.xml	(revision 959954)
+++ src/web/pages/de/about.xml	(working copy)
@@ -1,61 +0,0 @@
-<?xml version="1.0" encoding="ISO-8859-1"?>
-<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
-  "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
-
-<page>
-
-<title>&Uuml;ber</title>
-
-<menu>
- <item><a href="http://lucene.apache.org/nutch/credits.html">Danksagungen</a></item>
- <item><a href="http://lucene.apache.org/nutch/index.html">Status</a></item>
-</menu>
-
-<body>
-
-<p>
-&quot;Nutch&quot; ist der Versuch, eine Suchmaschine auf Open-Source-Basis zu implementieren.
-</p>
-
-<p>
-Suchmaschinen dienen heute als Grundlage zur Navigation im Internet.
-Trotz der hohen Nachfrage gibt es immer weniger davon. Bald knnte sich
-das bestehende Oligopol zu einem Monopol entwickeln. Eine einzelne Firma 
-wrde fast alle Internet-Suchen und damit den Zugriff zu Informationen 
-kontrollieren - kein wnschenswertes Szenario fr Internet-Nutzer.
-</p>
-
-<p>
-&quot;Nutch&quot; stellt Ihnen eine transparente Alternative zu kommerziellen
-Suchmaschinen zur Verfgung. Nur von offenen Suchergebnissen knnen sie eine
-vollkommene Unvoreingenommenheit erwarten (Oder zumindest knnen Sie Ihre
-Voreingenommenheit ffentlich einsehen). Alle bestehenden Suchmaschinen haben propriet&auml;re
-Bewertungsalgorithmen. Sie knnen nicht nachvollziehen, warum Ihre Seite ein gewisses
-Ranking bekommt. Zudem erlauben es einige Suchmaschinen, gute Positionen zu erkaufen.
-&quot;Nutch&quot; hingegen hat nichts zu verbergen und keine Motivation, seine Resultate
-zu verflschen.
-&quot;Nutch&quot;s einziges Ziel ist es, den Benutzer mit den bestmglichen Suchresultaten 
-zu versorgen.
-</p>
-
-
-<p>
-&quot;Nutch&quot; ziehlt darauf ab, jedem die Mglichkeit zu geben, kosteneffizient 
-und auf einfache Art und Weise eine Suchmaschine einzurichten. Um erfolgreich zu sein,
-muss &quot;Nutch&quot; Folgendes leisten knnen:
-</p>
-<ul>
-  <li>Mehrere Millionen Webseiten pro Monat indexieren</li>
-  <li>Durchsuchen der indexierten Seiten bis zu 1000 mal pro Sekunde</li>
-  <li>Qualitativ hochwertige Suchresultate zur Verfgung stellen</li>
-  <li>Minimalste Kosten verursachen</li>
-</ul>
-
-<p>
-Dieses Vorhaben ist eine grosse Herausforderung, und wenn Sie wie wir an die 
-Vorzge eines solchen Projektes glauben, so bitten wir Sie uns zu untersttzen - 
-entweder als <a href="http://www.apache.org/dev/">Entwickler</a> oder durch eine <a href="http://www.apache.org/foundation/contributing.html">Spende</a>.
-</p>
-
-</body>
-</page>
Index: src/web/pages/fi/search.xml
===================================================================
--- src/web/pages/fi/search.xml	(revision 959954)
+++ src/web/pages/fi/search.xml	(working copy)
@@ -1,11 +0,0 @@
-<page>
-<body>
-<center>
-<form name="search" action="../search.jsp" method="get">
-  <input type="hidden" name="lang" value="fi"/>
-  <input name="query" size="44"/>&#160;<input type="submit" value="Hae"/>
-  <a href="help.html">Apua</a>
-</form>
-</center>
-</body>
-</page>
Index: src/web/pages/fi/help.xml
===================================================================
--- src/web/pages/fi/help.xml	(revision 959954)
+++ src/web/pages/fi/help.xml	(working copy)
@@ -1,41 +0,0 @@
-<page>
-
-<title>apua</title>
-
-<body>
-
-<h3>Kyselyt</h3>
-Etsiäksesi Nutchilla kirjoita vain muutama hakusana.
-<ul>
-  <li>Tulokset sisältävät vain sivuja, joilla
-      <span style="font-style: italic;">kaikki</span> hakusanat
-      esiintyvät.</li>
-  <li>Käytä lainausmerkkejä sellaisten sanojen ympärillä, joiden pitää esiintyä
-      yhtenäisenä ryppäänä. Esimerkiksi:
-      <span style="font-weight: bold;">"Yhdistyneet kansakunnat"</span>.</li>
-  <li>Välimerkit sanojen välissä katkaisevat hakutermin. On samantekevää etsitkö
-      hakusanalla <span style="font-weight: bold;">http://www.apache.org/</span>
-      vai sanoilla <span style="font-weight: bold;">"http www apache org"</span>.</li>
-  <li>Haut eivät katso kirjaimen kokoon, eli hakusana
-      <span style="font-weight: bold;">NuTcH</span> on sama kuin
-      <span style="font-weight: bold;">nUtCh</span>.</li>
-  <li>Voit estää hakusanaa ilmaantumasta tuloksissa liittämällä sen eteen
-      miinus-merkin; <span style="font-weight: bold;">jalkapallo -nfl</span>
-      etsii sivut joilla puhutaan jalkapallosta mutta jotka eivät käytä sanaa
-      "nfl".</li>
-  <li>Se siitä!</li>
-</ul>
-
-<h3>Tulokset</h3>
-Kukin hakutulos tuottaa seuraavat linkit:
-<ul>
-  <li>(<span style="color: rgb(51, 51, 255);">välimuistissa</span>) näyttää
-      Nutchin lataaman version sivusta.</li>
-  <li>(<span style="color: rgb(51, 51, 255);">selitä</span>) selittää miksi sivu
-      sijoitettiin tällä tavalla.</li>
-  <li>(<span style="color: rgb(51, 51, 255);">linkit</span>) näyttää sivulle
-      viittaavat luetteloidut linkit.</li>
-</ul>
-
-</body>
-</page>
Index: src/web/pages/fi/about.xml
===================================================================
--- src/web/pages/fi/about.xml	(revision 959954)
+++ src/web/pages/fi/about.xml	(working copy)
@@ -1,45 +0,0 @@
-<page>
-
-<title>projektista</title>
-
-<menu>
- <item><a href="http://lucene.apache.org/nutch/credits.html">Tekijät</a></item>
- <item><a href="http://lucene.apache.org/nutch/index.html">Projektin tila</a></item>
-</menu>
-
-<body>
-
-<p>Nutch on syntyvaiheessa oleva avoimen lähdekoodin verkkohakukone.</p>
-
-<p>Verkkohaku on internet-navigaation perusehto, mutta siitä huolimatta
-hakukoneiden lukumäärä on vähenemässä. Nykyinen oligopoli voi pian muuttua
-monopoliksi, jossa yksi yhtiö hallitsee lähes kaikkia verkkohakuja, omaksi
-taloudelliseksi hyödykseen. Tämä ei olisi hyväksi Internetin käyttäjille.</p>
-
-<p>Nutch tarjoaa läpinäkyvän vaihtoehdon kaupallisille hakukoneille. Vain
-avoimeen lähdekoodiin perustuvien hakutulosten puolueettomuuteen voidaan
-luottaa. (Tai ainakin niiden puolueellisuus on julkista.) Kaikkien
-merkittävien olemassa olevien hakukoneiden järjestelyperusteet ovat salaisia,
-eivätkä koneiden tekijät selitä miksi tietty sivu saa tietyn sijoituksen.
-Lisäksi tietyt hakukoneet päättävät luetteloimistaan sivuista maksun
-perusteella, luetteloitavien sivujen hyvyyden asemesta. Nutchilla taas ei ole
-mitään salattavaa tai syytä vääristellä hakukoneensa tuloksia, tai robottinsa
-toimintaa, ellei seurauksena ole parempia hakutuloksia käyttäjille.</p>
-
-<p>Nutch yrittää mahdollistaa sen, että kaikki voivat helposti ja halvalla tuoda
-näkyville maailmanluokan hakukoneen. Tämä on mittava haaste. Onnistuakseen
-Nutchin täytyy pystyä:</p>
-<ul>
-  <li>noutamaan useita miljardeja sivuja kuukaudessa</li>
-  <li>ylläpitämään hakemistoa näistä sivuista</li>
-  <li>hakemaan hakemistosta tuhatkunta kertaa sekunnissa</li>
-  <li>tarjoamaan erittäin korkeatasoisia hakutuloksia</li>
-  <li>toimimaan mahdollisimman alhaisin kustannuksin</li>
-</ul>
-
-<p>Ongelma on haastava. Jos uskot projektin mahdollisuuksiin voit auttaa joko sen
-<a href="http://www.apache.org/dev/">kehittämisessä</a> tai <a
-href="http://www.apache.org/foundation/contributing.html">lahjoittamalla sille varoja</a>.</p>
-
-</body>
-</page>
Index: src/web/pages/en/search.xml
===================================================================
--- src/web/pages/en/search.xml	(revision 959954)
+++ src/web/pages/en/search.xml	(working copy)
@@ -1,16 +0,0 @@
-<page>
-<body>
-<center>
-<form name="search" action="../search.jsp" method="get">
-  <input type="hidden" name="lang" value="en"/>
-  <input name="query" size="44"/>&#160;<input type="submit" value="Search"/>
-  <a href="help.html">help</a>
-</form>
-</center>
-<div style="text-align: right;">
-<a href="http://lucene.apache.org/nutch/index.html">
-<img border="0" src="../img/poweredbynutch_01.gif"/>
-</a>
-</div>
-</body>
-</page>
Index: src/web/pages/en/help.xml
===================================================================
--- src/web/pages/en/help.xml	(revision 959954)
+++ src/web/pages/en/help.xml	(working copy)
@@ -1,39 +0,0 @@
-<page>
-
-<title>search help</title>
-
-<body>
-
-<h3>Queries</h3>
-To search with Nutch, just type in a few words.
-<ul>
-  <li>Results only include pages that contain <span
- style="font-style: italic;">all</span> of the query words.</li>
-  <li>Use quotes around words that must occur adjacently, as a phrase,
-e.g., <span style="font-weight: bold;">"New Zealand"</span>.</li>
-  <li>Punctuation between words also triggers phrase matching.  So
-searching for <span style="font-weight: bold;">http://www.apache.org/</span>
-is the same as searching for <span style="font-weight: bold;">"http www
-apache org"</span>.</li>
-  <li>Searches are not case-sensitive, so searching for <span
- style="font-weight: bold;">NuTcH</span> is the same as searching for <span
- style="font-weight: bold;">nUtCh</span>.</li>
-  <li>You can prohibit a term from resulting pages by putting a minus
-before it, e.g., searching for <span style="font-weight: bold;">football
--nfl</span> will find pages that discuss football, but don't use the
-word "nfl".</li>
-  <li>That's it!</li>
-</ul>
-<h3>Results</h3>
-Each matching page in the results has the following links:
-<ul>
-  <li>(<span style="color: rgb(51, 51, 255);">cached</span>) displays
-the version of the page that Nutch downloaded.</li>
-  <li>(<span style="color: rgb(51, 51, 255);">explain</span>) displays
-an explanation of how this page scored.</li>
-  <li>(<span style="color: rgb(51, 51, 255);">anchors</span>) shows the
-list of incoming anchors indexed for this page.</li>
-</ul>
-
-</body>
-</page>
Index: src/web/pages/en/about.xml
===================================================================
--- src/web/pages/en/about.xml	(revision 959954)
+++ src/web/pages/en/about.xml	(working copy)
@@ -1,49 +0,0 @@
-<page>
-
-<title>about</title>
-
-<menu>
- <item><a href="http://lucene.apache.org/nutch/credits.html">Credits</a></item>
- <item><a href="http://lucene.apache.org/nutch/index.html">Status</a></item>
-</menu>
-
-<body>
-
-<p>Nutch is a nascent effort to implement an open-source web search
-engine.</p>
-
-<p>Web search is a basic requirement for internet navigation, yet the
-number of web search engines is decreasing. Today's oligopoly could
-soon be a monopoly, with a single company controlling nearly all web
-search for its commercial gain.  That would not be good for users of
-the internet.</p>
-
-<p>Nutch provides a transparent alternative to commercial web search
-engines.  Only open source search results can be fully trusted to be
-without bias.  (Or at least their bias is public.)  All existing major
-search engines have proprietary ranking formulas, and will not explain
-why a given page ranks as it does.  Additionally, some search engines
-determine which sites to index based on payments, rather than on the
-merits of the sites themselves.  Nutch, on the other hand, has nothing
-to hide and no motive to bias its results or its crawler in any way
-other than to try to give each user the best results possible.</p>
-
-<p>Nutch aims to enable anyone to easily and cost-effectively deploy a
-world-class web search engine.  This is a substantial challenge.  To
-succeed, Nutch software must be able to:</p>
-<ul>
-  <li>fetch several billion pages per month</li>
-  <li>maintain an index of these pages</li>
-  <li>search that index up to 1000 times per second</li>
-  <li>provide very high quality search results</li>
-  <li>operate at minimal cost</li>
-</ul>
-
-<p>This is a challenging proposition.  If you believe in the merits of
-this project, please help out, either as a <a
-href="http://www.apache.org/dev/">developer</a> or with a <a
-href="http://www.apache.org/foundation/contributing.html">donation</a>
-</p>
-
-</body>
-</page>
Index: src/web/pages/fr/search.xml
===================================================================
--- src/web/pages/fr/search.xml	(revision 959954)
+++ src/web/pages/fr/search.xml	(working copy)
@@ -1,11 +0,0 @@
-<page>
-<body>
-<center>
-<form name="search" action="../search.jsp" method="get">
-  <input type="hidden" name="lang" value="fr"/>
-  <input name="query" size="44"/>&#160;<input type="submit" value="Recherche"/>
-  <a href="help.html">Aide</a>
-</form>
-</center>
-</body>
-</page>
Index: src/web/pages/fr/help.xml
===================================================================
--- src/web/pages/fr/help.xml	(revision 959954)
+++ src/web/pages/fr/help.xml	(working copy)
@@ -1,38 +0,0 @@
-<page>
-
-<title>aide</title>
-
-<body>
-
-<h3>Requêtes</h3>
-Pour effectuer une recherche avec Nutch, il suffit de saisir quelques mots.
-<ul>
-  <li>Les résultats contiendront uniquement les pages qui contiennent
-<span style="font-style: italic;">tous</span> les mots de la question.</li>
-  <li>Utilisez les doubles chevrons autour des termes qui doivent être adjacents,
-comme dans le cas d'une phrase. Par exemple <span style="font-weight: bold;">
-"Nouvelle Zélande"</span>.</li>
-  <li>La ponctuation entre les mots est ignorée. Ainsi, la recherche 
-<span style="font-weight: bold;">http://www.apache.org/</span> est équivalente à
-<span style="font-weight: bold;">"http www apache org"</span>.</li>
-  <li>La recherche n'est pas sensible à la casse. Ainsi, la recherche
-<span style="font-weight: bold;">NuTcH</span> est équivalente à <span
- style="font-weight: bold;">nUtCh</span>.</li>
-  <li>Vous pouvez exclure un mot des résultats en plaçant un signe moins
-devant. Ainsi, la recherche <span style="font-weight: bold;">football
--nfl</span> retournera les pages qui parlent de football, mais qui ne contiennent
-pas le mot "nfl".</li>
-</ul>
-<h3>Résultats</h3>
-Chaque page affichée dans la liste des résultats propose les liens suivants:
-<ul>
-  <li>(<span style="color: rgb(51, 51, 255);">En cache</span>) affiche la version
-de la page téléchargée par Nutch.</li>
-  <li>(<span style="color: rgb(51, 51, 255);">Explications</span>) affiche des explications
-sur la manière dont le score de la page a été calculé.</li>
-  <li>(<span style="color: rgb(51, 51, 255);">Ancres</span>) affiche la liste des documents
-indexés pointant vers cette page.</li>
-</ul>
-
-</body>
-</page>
Index: src/web/pages/fr/about.xml
===================================================================
--- src/web/pages/fr/about.xml	(revision 959954)
+++ src/web/pages/fr/about.xml	(working copy)
@@ -1,42 +0,0 @@
-<page>
-
-<title>A propos de Nutch</title>
-
-<body>
-
-<p>Le projet Nutch a pour but le développement d'un moteur de
-recherche open source pour le web.</p>
-
-<p>Bien que les moteurs de recherche soient indispensables pour naviguer sur Internet, 
- leur nombre est en diminution. Ce qui aujourd'hui est un oligopole pourrait
- se changer rapidement en monopole, où une seule entreprise contrôlerait presque
- toute la recherche sur le web pour son profit commercial. Cela ne serait pas bon pour 
- les utilisateurs d'Internet.</p>
-  
-<p>Nutch constitue une alternative transparente aux moteurs de recherche commerciaux.
-  Seuls les résultats d'un système de recherche open source peuvent être garantis comme n'étant pas
-  faussés (ou du moins le biais serait publique). Tous les principaux moteurs de recherche 
-  ont des formules de classement propriétaires et n'expliqueront jamais pourquoi telle 
-  ou telle page a été classée d'une certaine façon. De plus, certains moteurs de recherche
-  choisissent les sites à indexer en échange d'une rémunération plus que sur la valeur 
-  intrinsèque du site. Nutch, lui, n'a rien à cacher et n'a aucune raison de fausser  
-  ses résultats ou son crawling si ce n'est pour donner à chaque utilisateur les meilleurs 
-  résultats possibles.</p>
-
-<p>Nutch a pour but de permettre à quiconque de mettre en place un moteur de recherche web 
-facilement et à moindre frais. Ce n'est pas une mince affaire. Pour y parvenir, le logiciel Nutch doit
- être capable de :</p>
-
-<ul>
-  <li>récupérer plusieurs milliards de pages par mois</li>
-  <li>maintenir un index de ces pages</li>
-  <li>faire des recherches sur cet index jusqu'à un millier de fois par seconde</li>
-  <li>offrir des résultats de très haute qualité</li>
-  <li>fonctionner à moindre coût</li>
-</ul>
-
-<p>C'est un défi de taille.  Si vous croyez dans les mérites de ce projet, merci d'y contribuer en 
-tant que <a href="http://www.apache.org/dev/">développeur</a> ou bien en <a href="http://www.apache.org/foundation/contributing.html">faisant un don</a>.
-</p>
-</body>
-</page>
Index: src/web/pages/es/search.xml
===================================================================
--- src/web/pages/es/search.xml	(revision 959954)
+++ src/web/pages/es/search.xml	(working copy)
@@ -1,11 +0,0 @@
-<page>
-<body>
-<center>
-<form name="search" action="../search.jsp" method="get">
-  <input type="hidden" name="lang" value="es"/>
-  <input name="query" size="44"/>&#160;<input type="submit" value="Buscar"/>
-  <a href="help.html">ayuda</a>
-</form>
-</center>
-</body>
-</page>
Index: src/web/pages/es/help.xml
===================================================================
--- src/web/pages/es/help.xml	(revision 959954)
+++ src/web/pages/es/help.xml	(working copy)
@@ -1,33 +0,0 @@
-<page>
-
-<title>Ayuda para la búsqueda</title>
-
-<body>
-
-<h3>Búsquedas</h3>
-Para buscar con Nutch, simplemente tipea unas palabras.
-<ul>
-  <li>Los resultados solamente incluyen páginas que contienen <span
- style="font-style: italic;">todas</span> las palabras buscadas.</li>
-  <li>Usa comillas a ambos lados de grupos de palabras que deben 
-aparecer en forma adyacente, como una frase, por ejemplo <span style="font-weight: bold;">"Nueva Zelanda"</span>.</li>
-  <li>El uso de puntuación entre palabras también resulta en la búsqueda 
- de una frase. Por lo tanto, buscar <span style="font-weight: bold;">http://www.apache.org/</span>
-es lo mismo que buscar <span style="font-weight: bold;">"http www
-apache org"</span>.</li>
-  <li>La búsqueda no diferencia entre mayúsculas y minúsculas, por lo que buscar <span
- style="font-weight: bold;">NuTcH</span> es equivalente a buscar <span
- style="font-weight: bold;">nUtCh</span>.</li>
-  <li>Se pueden descartar resultados con una palabra dada colocando un signo menos (-) delante de ésta, por ejemplo la búsqueda <span style="font-weight: bold;">deportes -golf</span> hallará páginas acerca de deportes que no contengan la palabra "golf".</li>
-  <li>Eso es todo.</li>
-</ul>
-<h3>Resultados</h3>
-Cada página que aparece en los resultados tiene los siguientes enlaces:
-<ul>
-  <li>(<span style="color: rgb(51, 51, 255);">en caché</span>) muestra la versión de la página visitada por Nutch.</li>
-  <li>(<span style="color: rgb(51, 51, 255);">explicar</span>) muestra una explicación del puntaje de esta página.</li>
-  <li>(<span style="color: rgb(51, 51, 255);">anchors</span>) muestra la lista del texto que aparece en enlaces que apuntan a esta página.</li>
-</ul>
-
-</body>
-</page>
Index: src/web/pages/es/about.xml
===================================================================
--- src/web/pages/es/about.xml	(revision 959954)
+++ src/web/pages/es/about.xml	(working copy)
@@ -1,54 +0,0 @@
-<page>
-
-<title>Acerca de...</title>
-
-<menu>
- <item><a href="http://lucene.apache.org/nutch/credits.html">Créditos</a></item>
- <item><a href="http://lucene.apache.org/nutch/index.html">Status</a></item>
-</menu>
-
-<body>
-
-<p>Nutch es un esfuerzo naciente para implementar un buscador de web de 
-fuentes abiertos (open-source).</p>
-
-<p>La búsqueda en la web es un requisito básico para la navegación 
-de internet. Sin embargo, el número de motores de búsqueda de web 
-se encuentra en disminución. El oligopolio actual podría convertirse en 
-un monopolio en un futuro próximo, con una única empresa que controlaría 
-cerca de la totalidad de las búsquedas de web para su beneficio comercial. 
-Esto no sería beneficioso para los usuarios de internet.</p>
-
-<p>Nutch provee una alternativa transparente en relación a los buscadores 
-comerciales. Solamente los resultados de un buscador open-source pueden 
-ser considerados imparciales (o al menos, su parcialidad es pública). Todos 
-los buscadores líderes tienen fórmulas propietarias para generar su 
-ranking de resultados y no revelan por qué una página determinada ocupa 
-una cierta posición en la lista. Además, algunos buscadores deciden 
-qué sitios incluir en su índice en base a pagos, en lugar de basarse en 
-el mérito propio de cada sitio. Nutch, por otro lado, no tiene nada que 
-ocultar ni motivos para sesgar sus resultados o las páginas visitadas 
-por su araña (crawler) en forma alguna, más allá de la intención de 
-proporcionar a cada usuario los mejores resultados posibles.</p>
-
-<p>Nutch apunta a permitir que cualquiera pueda implementar un motor 
-de búsqueda con tecnología de nivel mundial en forma sencilla y eficiente 
-en cuanto a los costos. Este es un desafío significativo. Para tener éxito, 
-el software de Nutch debe ser capaz de:</p>
-
-<ul>
-  <li>visitar miles de millones de páginas cada mes</li>
-  <li>mantener un índice de dichas páginas</li>
-  <li>buscar en ese índice hasta mil veces por segundo</li>
-  <li>proveer resultados de búsqueda de muy alta calidad</li>
-  <li>operar a un costo mínimo</li>
-</ul>
-
-<p>Esta es una propuesta ambiciosa. Si te convencen los méritos de
-este proyecto, te pedimos que colabores, ya sea como <a
-href="http://www.apache.org/dev/">desarrollador</a> o bien a través de una <a
-href="http://www.apache.org/foundation/contributing.html">donación</a>
-</p>
-
-</body>
-</page>
Index: src/web/pages/nl/search.xml
===================================================================
--- src/web/pages/nl/search.xml	(revision 959954)
+++ src/web/pages/nl/search.xml	(working copy)
@@ -1,11 +0,0 @@
-<page>
-<body>
-<center>
-<form name="search" action="../search.jsp" method="get">
-  <input type="hidden" name="lang" value="nl"/>
-  <input name="query" size="44"/>&#160;<input type="submit" value="Zoeken"/>
-  <a href="help.html">Help</a>
-</form>
-</center>
-</body>
-</page>
Index: src/web/pages/nl/help.xml
===================================================================
--- src/web/pages/nl/help.xml	(revision 959954)
+++ src/web/pages/nl/help.xml	(working copy)
@@ -1,43 +0,0 @@
-<page>
-
-<title>Help voor het zoeken</title>
-
-<body>
-
-<h3>Queries</h3>
-Type een paar woorden in om met Nuch te zoeken.
-<ul>
-  <li>Resultaten bevatten alleen pagina's die
-  <span style="font-style: italic;">alle</span> query woorden bevatten.</li>
-  <li>Gebruik quotes om woorden die achter elkaar moeten voorkomen, als een phrase,
-  b.v.  <span style="font-weight: bold;">"Nieuw Zeeland"</span>.</li>
-  <li>'Punktuering' tussen woorden (punten, komma's, dubbele punten ed.)
-  leidt ook tot het zoeken naar phrasen. Dus zoeken naar
-   <span style="font-weight: bold;">http://www.apache.org/</span>
-    is hetzelfde als zoeken naar
-    <span style="font-weight: bold;">"http www apache org"</span>.</li>
-  <li>Zoeken is niet kast-gevoelig, dus zoeken naar
-  <span style="font-weight: bold;">NuTcH</span>
-  is hetzelfde als zoeken naar
-  <span style="font-weight: bold;">nUtCh</span>.</li>
-  <li>U kunt een term in de resulterende pagina's voorkomen
-  door een min-streepje ervoor te zetten, b.v. zoeken naar
-<span style="font-weight: bold;">voetbal -knvb</span>
-zal pagina's vinden die over voetbal gaan, maar het woord "knvb" niet bevatten.
-</li>
-  <li>Dat is alles!</li>
-</ul>
-<h3>Resultaten</h3>
-Iedere gevonden pagina in de resultaten heeft de volgende links:
-<ul>
-  <li>(<span style="color: rgb(51, 51, 255);">opgeslagen</span>)
-  geeft de versie van de pagina weer die Nutch gedownload heeft.</li>
-  <li>(<span style="color: rgb(51, 51, 255);">uitleg</span>)
-  geeft een uitleg weer over hoe deze pagina scoorde.</li>
-  <li>(<span style="color: rgb(51, 51, 255);">verwijzers</span>)
-  laat een lijst zien van pagina's die naar deze pagina verwijzen.
-  </li>
-</ul>
-
-</body>
-</page>
Index: src/web/pages/nl/about.xml
===================================================================
--- src/web/pages/nl/about.xml	(revision 959954)
+++ src/web/pages/nl/about.xml	(working copy)
@@ -1,53 +0,0 @@
-<?xml version="1.0" encoding="ISO-8859-1"?>
-<page>
-
-
-<title>Over Nutch</title>
-
-<menu>
- <item><a href="http://lucene.apache.org/nutch/credits.html">Credits</a></item>
- <item><a href="http://lucene.apache.org/nutch/index.html">Status</a></item>
-</menu>
-
-<body>
-
-<p>Nutch heeft een begin gemaakt met het realiseren van een open-source webzoekmachine.</p>
-
-<p>Het doorzoeken van het web vormt een basiseis voor het surfen over het internet. Toch vermindert
-het aantal zoekmachines. De huidige markt met enkele aanbieders kent binnenkort wellicht nog maar
-n enkele aanbieder. Die ene onderneming krijgt dan de macht over bijna alle zoekopdrachten en
-kan deze gebruiken voor haar commercile doeleinden. Dat zou niet goed zijn voor de internetgebruikers.
-</p>
-
-<p>Nutch biedt een transparant alternatief voor de commercile webzoekmachines.
-Alleen bij open-source-zoekresultaten valt  er volledig op te vertrouwen dat zij zonder dubieuze
-filters zijn geselecteerd (of is hun filtering tenminste voor iedereen zichtbaar).
-Alle huidige belangrijke zoekmachines hanteren eigen formules
-voor de rangorde van treffers en leggen niet uit hoe een bepaalde pagina aan zijn plaats in de lijst van
-treffers komt.
-Bovendien bepalen sommige zoekmachines hun indexering op basis van betalingen in plaats van
-op de verdiensten van de sites zelf.
-Nutch, daarentegen, heeft niets te verbergen en heeft geen reden om
-zijn zoekresultaten of zijn crawler op welke manier dan ook te benvloeden,
-behalve dan door elke gebruiker het best mogelijke resultaat te willen bieden.
-</p>
-
-<p>Nutch wil iedereen in staat stellen op eenvoudige en lonende wijze gebruik te maken van een zoekmachine
-van wereldklasse.
-Dit vormt een aanzienlijke uitdaging. Om erin te slagen dient de Nutch software in staat zijn om:</p>
-<ul>
-  <li>verscheidene miljarden pagina's per maand te bereiken; </li>
-  <li>een index van deze pagina's te onderhouden;</li>
-  <li>die index tot 1000 keer per seconde af te zoeken;</li>
-  <li>een zeer hoge kwaliteit van zoekresultaten te verschaffen;</li>
-  <li>tegen minimale kosten te functioneren.</li>
-</ul>
-
-<p>Dit voornemen vormt een uitdaging.
-Als je in het belang van dit project gelooft, werk er dan aan mee, als
-<a href="http://www.apache.org/dev/">ontwikkelaar</a> of als
-<a href="http://www.apache.org/foundation/contributing.html">donateur</a>.
-</p>
-
-</body>
-</page>
Index: src/web/pages/jp/search.xml
===================================================================
--- src/web/pages/jp/search.xml	(revision 959954)
+++ src/web/pages/jp/search.xml	(working copy)
@@ -1,11 +0,0 @@
-<page>
-<body>
-<center>
-<form name="search" action="../search.jsp" method="get">
-  <input type="hidden" name="lang" value="jp"/>
-  <input name="query" size="44"/>&#160;<input type="submit" value="検索"/>
-  <a href="help.html">ヘルプ</a>
-</form>
-</center>
-</body>
-</page>
Index: src/web/pages/jp/help.xml
===================================================================
--- src/web/pages/jp/help.xml	(revision 959954)
+++ src/web/pages/jp/help.xml	(working copy)
@@ -1,30 +0,0 @@
-<page>
-
-<title>検索のヘルプ</title>
-
-<body>
-
-<h3>検索</h3>
-Nutch で検索するために 2〜3語を入力してください。
-<ul>
-  <li>検索結果は検索語の <span
- style="font-style: italic;">全て</span> を含むページだけです。</li>
-  <li>例えば繋がって表記されなければいけない単語の場合は引用符で囲ってください。例：<span style="font-weight: bold;">"New Zealand"</span>.</li>
-  <li>単語の間のピリオドは、同じフレーズを示します。<span style="font-weight: bold;">http://www.apache.org/</span> を検索することと、<span style="font-weight: bold;">"http www apache org"</span> を検索するのは同じことです。</li>
-  <li>検索語は、大文字と小文字の区別ができません。 <span
- style="font-weight: bold;">NuTcH</span> を検索することと <span
- style="font-weight: bold;">nUtCh</span> を検索するのは同じことです。</li>
-  <li>単語の前にマイナス記号をつけることによって、その単語が含まれるページを検索対象から外すことができます。例：<span style="font-weight: bold;">football
--nfl</span> サッカー関連のページを検索しますが、NFL という単語を使っていないページを探します。</li>
-  <li>以上です!</li>
-</ul>
-<h3>検索結果</h3>
-各検索結果のページには、以下のリンクが加えられます:
-<ul>
-  <li>(<span style="color: rgb(51, 51, 255);">cached</span>) Nutch が保存しているキャッシュの表示</li>
-  <li>(<span style="color: rgb(51, 51, 255);">explain</span>) このページがこの順位を得た理由の説明</li>
-  <li>(<span style="color: rgb(51, 51, 255);">anchors</span>) このページへリンクしているページ</li>
-</ul>
-
-</body>
-</page>
Index: src/web/pages/jp/about.xml
===================================================================
--- src/web/pages/jp/about.xml	(revision 959954)
+++ src/web/pages/jp/about.xml	(working copy)
@@ -1,47 +0,0 @@
-<page>
-
-<title>Nutchについて</title>
-
-<menu>
- <item><a href="http://lucene.apache.org/nutch/credits.html">協力者</a></item>
- <item><a href="http://lucene.apache.org/nutch/index.html">状態</a></item>
-</menu>
-
-<body>
-
-<p>Nutchは、オープンソース・ウェブ検索エンジンを実装する初めての試みです。
-</p>
-
-<p>ウェブ検索はインターネット・ナビゲーションのために必用な基本的機能です。ところが現在では、ウェブ検索エンジンの数は減少しています。
-今日の寡占状態は、すぐに、ほとんど全てのウェブ検索を提供している一つの会社の商業利用に独占されるかもしれません。
-一社の独占は、多くのインターネットのユーザーにとって良くないことです。
-</p>
-
-<p>Nutch は、商用ウェブ検索エンジンには無い透過性（公開性）をもたらすものです。
-Nutch による検索結果は、充分に偏見が無いことがわかっており、安心して検索をまかせることができます（少なくとも、Nutsh のバイアスは、公共のためのものです）。
-全ての既存の主な検索エンジンは、検索エンジン所有者の実装した公式によって順位が決められます。検索結果として与えられたページの順位付けの理由が説明されるわけではありません。
-その上、いくつかの検索エンジンは、各サイトがインデックス順位に対して支払った金額によって順位を決定する場合さえあります。
-Nutch は各々のユーザーにできる限り最高の検索結果を与えようとするものであり、
-その結果や関係する情報に偏見を加える理由は何もありません。
-</p>
-
-<p>Nutch は、誰でも簡単に、そして低コストで、効率よく国際的なウェブ検索エンジンを設置するのを可能にしようとするものです。
-これは、とても困難な挑戦です。
-Nutch ソフトウェアが成功するためには以下の要件を満たした有能なものでなければなりません：
-</p>
-<ul>
-  <li>一ヶ月ごとに10億ページを取ってくること</li>
-  <li>これらのページのインデックスを維持すること</li>
-  <li>1秒につき最高1000回の検索インデックスを付けること</li>
-  <li>非常に高品質ですばらしい検索結果を提供すること</li>
-  <li>最小のコストで動作すること</li>
-</ul>
-
-<p>これはとても挑戦的な提案です。
-あなたが Nutch プロジェクトの価値を信じるならば、<a
-href="http://www.apache.org/dev/">開発者</a>として、又は<a
-href="http://www.apache.org/foundation/contributing.html">寄付</a>をしてこのプロジェクトを是非とも応援してください。
-</p>
-
-</body>
-</page>
Index: src/web/pages/sh/search.xml
===================================================================
--- src/web/pages/sh/search.xml	(revision 959954)
+++ src/web/pages/sh/search.xml	(working copy)
@@ -1,16 +0,0 @@
-<page>
-<body>
-<center>
-<form name="search" action="../search.jsp" method="get">
-  <input type="hidden" name="lang" value="sh"/>
-  <input name="query" size="44"/>&#160;<input type="submit" value="Pretraži"/>
-  <a href="help.html">pomoć</a>
-</form>
-</center>
-<div style="text-align: right;">
-<a href="http://lucene.apache.org/nutch/index.html">
-<img border="0" src="../img/poweredbynutch_01.gif"/>
-</a>
-</div>
-</body>
-</page>
Index: src/web/pages/sh/help.xml
===================================================================
--- src/web/pages/sh/help.xml	(revision 959954)
+++ src/web/pages/sh/help.xml	(working copy)
@@ -1,37 +0,0 @@
-<page>
-
-<title>pomoć</title>
-
-<body>
-
-<h3>Upiti</h3>
-Da biste pretraživali pomoću Nutch-a, samo ukucajte nekoliko reči.
-<ul>
-  <li>Rezultati predstavljaju samo one stranice koje sadrže <span
- style="font-style: italic;">sve</span> reči iz upita.</li>
-  <li>Koristite navodnike oko reči koje se moraju pojavljivati zajedno, kao fraza,
-npr., <span style="font-weight: bold;">"Novi Sad"</span>.</li>
-  <li>Interpunkcija između reči takođe izaziva pretraživanje po frazama.  Tako da je 
-pretraga po <span style="font-weight: bold;">http://www.apache.org/</span>
-identična kao pretraga po <span style="font-weight: bold;">"http www
-apache org"</span>.</li>
-  <li>Pretraživanje ne razlikuje velika i mala slova u upitu, tako da je pretraga po <span
- style="font-weight: bold;">NuTcH</span> identična kao pretraga po <span
- style="font-weight: bold;">nUtCh</span>.</li>
-  <li>Da biste isključili iz rezultata stranice kojе sadrže neki termin stavite znak minus ispred tog termina,
- npr., pretraživanje po <span style="font-weight: bold;">fudbal
--nfl</span> će pronaći stranice  koje za temu imaju fudbal, ali ne koriste reč "nfl".</li>
-  <li>To je to!</li>
-</ul>
-<h3>Rezultati</h3>
-Svaka stranica u rezultatima ima sledeće linkove:
-<ul>
-  <li>(<span style="color: rgb(51, 51, 255);">keširano</span>) prikazuje verziju stranice
-koju je Nutch dohvatio.</li>
-  <li>(<span style="color: rgb(51, 51, 255);">objasni</span>) prikazuje objašnjenje 
-  kako je ova stranica bodovana.</li>
-  <li>(<span style="color: rgb(51, 51, 255);">veze</span>) prikazuje spisak odrednica korišćenih da se napravi link ka ovoj strani.</li>
-</ul>
-
-</body>
-</page>
Index: src/web/pages/sh/about.xml
===================================================================
--- src/web/pages/sh/about.xml	(revision 959954)
+++ src/web/pages/sh/about.xml	(working copy)
@@ -1,46 +0,0 @@
-<page>
-
-<title>O Nutch-u</title>
-
-<menu>
- <item><a href="http://lucene.apache.org/nutch/credits.html">Zasluge</a></item>
- <item><a href="http://lucene.apache.org/nutch/index.html">Status</a></item>
-</menu>
-
-<body>
-
-<p>Nutch je poduhvat implementiranja web pretraživača otvorenog koda.</p>
-
-<p>Web pretraživanje je osnovni preduslov za internet navigaciju, a ipak se
-broj web pretraživača smanjuje. Oligopol današnjice mogao bi uskoro lako  
-postati monopol, sa jednom kompanijom koja kotroliše gotovo sva pretraživanja 
-radi sopstvene finansijske dobiti. Po korisnike interneta to ne bi bilo dobro.</p>
-
-<p>Nutch nudi transparentnu alternativu komercijalnim web pretraživačima.
-Samo pretraživačima otvorenog koda možemo verovati da su njihovi rezultati nepristrasni.
-(Ili je barem njihova pristrasnost javna.) Svi glavni postojeći pretraživači 
-imaju sopstvene formule rangiranja rezultata, i ne žele objašnjavati zašto se određena stranica 
-rangira baš tako. Takođe, nekim web pretraživačima moguće je platiti kako bi se uticalo na to 
-koji sajt i na koji način će biti rangiran u rezultatima, umesto da uslov za rangiranje bude sam sajt.
-Nutch, sa druge strane, nema šta da krije i nema motiva da njegovi rezultati ili njegov crawler bude pristrasan u bilo kom
-pogledu osim da pokuša da svakom korisniku ponudi najbolje moguće rezultate.</p>
-
-<p>Cilj Nutch-a je omogući svakome da jednostavno i ekonomično razvije 
-kvalitetan web pretraživač.  To je veliki izazov.  Da bi uspeo, Nutch 
-softver mora biti sposoban da:</p>
-<ul>
-  <li>dohvata nekoliko milijardi stranica mesečno</li>
-  <li>održava indeks tih stranica</li>
-  <li>pretražuje indeks tih stranica i do 1000 puta u sekundi</li>
-  <li>obezbedi visokokvalitetne rezulate pretrage</li>
-  <li>ima minimalne operativne troškove</li>
-</ul>
-
-<p>To je izazovan poduhvat. Ako veruješ u suštinu ovog projekta, možeš pomoći
-bilo svojim radom na <a
-href="http://www.apache.org/dev/">razvoju</a> bilo <a
-href="http://www.apache.org/foundation/contributing.html">donacijom.</a>
-</p>
-
-</body>
-</page>
Index: src/web/pages/th/search.xml
===================================================================
--- src/web/pages/th/search.xml	(revision 959954)
+++ src/web/pages/th/search.xml	(working copy)
@@ -1,10 +0,0 @@
-<page>
-<body>
-<center>
-<form name="search" action="../search.jsp" method="get"> <input
- name="query" size="44"/>&#160;<input type="submit" value="ค้นหา"/>
-<a href="help.html">คำแนะนำการใช้</a>
-</form>
-</center>
-</body>
-</page>
Index: src/web/pages/th/help.xml
===================================================================
--- src/web/pages/th/help.xml	(revision 959954)
+++ src/web/pages/th/help.xml	(working copy)
@@ -1,34 +0,0 @@
-<page>
-
-<title>คำแนะนำในการค้นหา</title>
-
-<body>
-
-<h3>คำที่จะค้นหา</h3>
-เพียงพิมพ์คำไม่กี่คำ คุณก็สามารถใช้นัทช์ค้นหาได้
-<ul>
-  <li>ผลลัพธ์ที่ได้คือหน้าที่มีคำที่ใช้ค้นหา
-	<span style="font-style: italic;">ทั้งหมด</span></li>
-  <li>ใช้อัญประกาศรอบคำที่จะต้องอยู่ใกล้กันเป็นพยางค์
-เช่น <span style="font-weight: bold;">"New Zealand"</span></li>
-  <li>เครื่องหมายต่างๆระหว่างคำก็จะนำไปสู่การค้นหาเป็นพยางค์ เช่นถ้าจะหา
-	<span style="font-weight: bold;">http://www.apache.org/</span>
-ก็เหมือนกับหา<span style="font-weight: bold;">"http www apache org"</span>.</li>
-  <li>อักษรตัวใหญ่ตัวเล็กไม่มีผลในการค้นหา เช่น<span
- style="font-weight: bold;">NuTcH</span> เหมือนกับหา<span
- style="font-weight: bold;">nUtCh</span>.</li>
-  <li>คุณสามารถห้ามไม่ให้ผลลัพธ์มีหน้าที่มีคำที่คุณไม่ต้องการ โดยการใส่เครื่องหมายลบหน้าคำนั้น เช่นหา
-<span style="font-weight: bold;">football
--nfl</span>จะพบหน้าที่พูดถึงเรื่องฟุตบอล แต่ไม่มีคำว่า nfl</li>
-  <li>ก็เท่านั้นเอง!</li>
-</ul>
-<h3>ผลลัพธ์</h3>
-หน้าที่พบแต่ละหน้าจะมีลิงค์ดังต่อไปนี้:
-<ul>
-  <li>(<span style="color: rgb(51, 51, 255);">หน้าที่เก็บไว้</span>)แสดงหน้าที่นัทช์ได้นำมาเก็บไว้</li>
-  <li>(<span style="color: rgb(51, 51, 255);">คำอธิบาย</span>)แสดงคำอธิบายถึงการให้คะแนนของหน้านี้</li>
-  <li>(<span style="color: rgb(51, 51, 255);">แองเคอร์</span>)แสดงรายการของข้อความที่โยงมายังหน้านี้</li>
-</ul>
-
-</body>
-</page>
Index: src/web/pages/th/about.xml
===================================================================
--- src/web/pages/th/about.xml	(revision 959954)
+++ src/web/pages/th/about.xml	(working copy)
@@ -1,51 +0,0 @@
-<page>
-
-<title>about</title>
-
-<menu>
- <item><a href="http://lucene.apache.org/nutch/credits.html">ผู้ร่วมงาน</a></item>
- <item><a href="http://lucene.apache.org/nutch/index.html">สถานภาพ</a></item>
-</menu>
-
-<body>
-
-<p>นัทช์ เป็นความพยายามที่ก่อตัวขึ้นเพื่อพัฒนาระบบค้นหาโอเพนซอร์ส</p>
-
-<p>การค้นหาบนเว็บเป็นสิ่งจำเป็นขั้นพื้นฐานในการท่องอินเตอร์เนต
-แต่จำนวนของระบบค้นหากำลังลดลงเรื่อยๆ
-การครองตลาดของผู้ให้บริการไม่กี่รายในวันนี้อาจทำให้เกิดการผูกขาดในไม่ช้า
-ถึงเวลานั้นจะมีเพียงบริษัทเดียวที่ควบคุมระบบค้นหาบนเว็บเพื่อผลประโยชน์ทางธุรกิจ
-นั่นย่อมเป็นสิ่งที่ไม่ดีสำหรับผู้ใช้อินเตอร์เนต
-</p>
-
-<p>นัทช์เป็นทางเลือกที่โปร่งใสแทนระบบค้นหาเพื่อการค้า
-ผลการค้นหาที่ได้มาจากโอเพนซอร์สเท่านั้นที่จะเชื่อถือได้ว่าปราศจากอคติ
-(หรือถ้ามีอคติก็เป็นที่ทราบต่อสาธารณชน)
-ระบบค้นหาบนเว็บที่มีอยู่ในขณะนี้มีสูตรการจัดอันดับที่เป็นความลับ
-และไม่มีคำอธิบายใดๆว่าทำไมเว็บแต่ละหน้าถึงได้อันดับที่ได้อยู่
-นอกจากนี้แล้ว บางระบบค้นหายังตัดสินว่าจะทำดรรชนีไซต์ไหนตามเงินที่ได้รับ
-ไม่ใช่ตามคุณภาพของไซท์นั้น
-ในทางตรงข้าม นัทช์ไม่มีสิ่งใดแอบแฝงและไม่มีแรงจูงใจใดๆที่จะมีอคติในผลการค้นหา
-หรือในการรวบรวมเว็บไซต์
-นอกจากจะพยายามที่จะนำเสนอผลการค้นหาที่ดีที่สุดเท่าที่เป็นไปได้ต่อผู้ใช้</p>
-
-<p>นัทช์มีเป้าหมายที่จะทำให้คนทั่วไปสามารถนำระบบค้นหาระดับโลกไปใช้งานอย่างง่ายดาย
-และได้ผลตอบแทนที่คุ้มค่า
-นี่เป็นเป้าหมายที่ท้าทายเป็นอย่างยิ่ง
-เพื่อที่จะบรรลุเป้าหมาย นัทช์จะต้องสามารถ:</p>
-<ul>
-  <li>รวบรวมเว็บเพจได้หลายพันล้านหน้าต่อเดือน</li>
-  <li>ดูแลดรรชนีของเว็บเพจเหล่านี้</li>
-  <li>ค้นหาดรรชนีได้ถึง 1000 ครั้งต่อวินาที</li>
-  <li>ให้ผลการค้นหาที่มีคุณภาพสูง</li>
-  <li>ใช้ค่าใช้จ่ายต่ำในการดำเนินงาน</li>
-</ul>
-
-<p>นี่เป็นเป้าหมายที่ท้าทาย ถ้าคุณเชื่อว่าโครงการนี้มีประโยชน์
-กรุณาให้ความร่วมมือโดยเป็น<a
-href="http://www.apache.org/dev/">ผู้พัฒนา</a>หรือร่วม<a
-href="http://www.apache.org/foundation/contributing.html">บริจาค</a>
-</p>
-
-</body>
-</page>
Index: src/web/pages/pl/search.xml
===================================================================
--- src/web/pages/pl/search.xml	(revision 959954)
+++ src/web/pages/pl/search.xml	(working copy)
@@ -1,16 +0,0 @@
-<page>
-<body>
-<center>
-<form name="search" action="../search.jsp" method="get">
-  <input type="hidden" name="lang" value="pl"/>
-  <input name="query" size="44"/>&#160;<input type="submit" value="Szukaj"/>
-  <a href="help.html">pomoc</a>
-</form>
-</center>
-<div style="text-align: right;">
-<a href="http://lucene.apache.org/nutch/index.html">
-<img border="0" src="../img/poweredbynutch_01.gif"/>
-</a>
-</div>
-</body>
-</page>
Index: src/web/pages/pl/help.xml
===================================================================
--- src/web/pages/pl/help.xml	(revision 959954)
+++ src/web/pages/pl/help.xml	(working copy)
@@ -1,39 +0,0 @@
-<page>
-
-<title>pomoc wyszukiwania</title>
-
-<body>
-
-<h3>Zapytania</h3>
-W celu wyszukiwania przy użyciu Nutch, po prostu wpisz kilka słów.
-<ul>
-  <li>Wyniki zawierają wyłącznie strony, które posiadają <span
- style="font-style: italic;">wszystkie</span> słowa zapytania.</li>
-  <li>Zamykaj kilka słów w cudzysłowach, jeśli muszą one występować razem jako
-  fraza, n.p. <span style="font-weight: bold;">"Nowa Zelandia"</span></li>
-  <li>Znaki interpunkcyjne pomiędzy słowami również powodują traktowanie
-  słów jako frazy. Zatem zapytanie
-  <span style="font-weight: bold;">http://www.apache.org/</span> jest
-  równoznaczne z zapytaniem <span style="font-weight: bold;">"http www
-apache org"</span>.</li>
-  <li>Wyszukiwanie nie jest wrażliwe na wielkość liter, więc zapytanie
-  <span style="font-weight: bold;">NuTcH</span> jest równoważne zapytaniu <span
- style="font-weight: bold;">nUtCh</span>.</li>
-  <li>Możesz zabronić występowania pewnego terminu przez umieszczenie minusa
-  przed danym słowem, tzn. zapytanie <span style="font-weight: bold;">football
--nfl</span> odnajdzie strony, które mówią o futbolu, ale nie używają słowa "NFL".</li>
-  <li>To wszystko!</li>
-</ul>
-<h3>Wyniki</h3>
-Każda pasująca strona na liście wyników posiada następujące odnośniki:
-<ul>
-  <li>(<span style="color: rgb(51, 51, 255);">kopia</span>) pokazuje tę
-  wersję strony, którą Nutch ściągnął do indeksowania.</li>
-  <li>(<span style="color: rgb(51, 51, 255);">wyjaśnij</span>) pokazuje
-  wyjaśnienie dlaczego ta strona otrzymała ten ranking.</li>
-  <li>(<span style="color: rgb(51, 51, 255);">odnośniki</span>) pokazuje listę
-  odnośników w indeksie wskazujących na tę stronę.</li>
-</ul>
-
-</body>
-</page>
Index: src/web/pages/pl/about.xml
===================================================================
--- src/web/pages/pl/about.xml	(revision 959954)
+++ src/web/pages/pl/about.xml	(working copy)
@@ -1,50 +0,0 @@
-<page>
-
-<title>O Projekcie</title>
-
-<menu>
- <item><a href="http://lucene.apache.org/nutch/credits.html">Podziękowania</a></item>
- <item><a href="http://lucene.apache.org/nutch/index.html">Status</a></item>
-</menu>
-
-<body>
-
-<p>Projekt Nutch jest niedawno powstałą inicjatywą zmierzającą do zaimplementowania
-serwera wyszukiwania stron WWW, na licencji Open Source.</p>
-
-<p>Wyszukiwarka sieciowa to podstawowe narzędzie do celów nawigacji
-internetowej - a jednak liczba dostępnych wyszukiwarek maleje. Obecna oligarchia
-kilku wiodących wyszukiwarek może wkrótce przerodzić się w monopol, w którym
-jedna firma będzie niemal całkiem kontrolowała wyszukiwanie dla swoich
-komercyjnych celów. Nie byłoby to dobre dla użytkowników Internetu.</p>
-
-<p>Nutch oferuje przejrzystą alternatywę dla komercyjnych wyszukiwarek.
-Tylko rezultatom szukania opartym o technologię Open Source można w pełni zaufać,
-że nie są stronnicze (a przynajmniej ich stronniczość jest jawna). Wszystkie
-główne wyszukiwarki stosują niejawne metody sortowania rezultatów, i nie
-wyjaśniają dlaczego dana strona uzyskała taki wynik a nie inny. Co więcej,
-niektóre wyszukiwarki decydują o tym, które strony umieścić w swoim indeksie,
-na podstawie opłat a nie rzeczywistej wartości użytkowej stron. W
-przeciwieństwie do nich Nutch nie ma nic do ukrycia, i nie ma motywacji żeby
-uzależniać rezultaty wyszukiwania lub indeksowania od czegokolwiek innego niż
- chęci dostarczenia użytkownikom jak najbardziej wiarygodnych rezultatów.</p>
-
-<p>Projekt Nutch pragnie umożliwić każdemu łatwą i efektywną instalację
-wyszukiwarki sieciowej światowej klasy. Jest to niemałe wyzwanie. W celu
-osiągnięcia sukcesu, oprogramowanie Nutch musi być w stanie:</p>
-<ul>
-  <li>pobrać kilka miliardów stron na miesiąc</li>
-  <li>zarządzać indeksem tych stron</li>
-  <li>przeszukiwać indeks nawet 1000 razy na sekundę</li>
-  <li>zapewniać wysoką jakość wyników wyszukiwania</li>
-  <li>działać przy zachowaniu minimum kosztów</li>
-</ul>
-
-<p>Takie cele są prawdziwym wyzwaniem. Jeśli jesteś przekonany o
-zaletach tego projektu, to możesz nam pomóc jako <a
-href="http://www.apache.org/dev/">deweloper</a> lub poprzez <a
-href="http://www.apache.org/foundation/contributing.html">darowizny</a>.
-</p>
-
-</body>
-</page>
Index: src/web/pages/it/search.xml
===================================================================
--- src/web/pages/it/search.xml	(revision 959954)
+++ src/web/pages/it/search.xml	(working copy)
@@ -1,16 +0,0 @@
-<page>
-<body>
-<center>
-<form name="cerca" action="../search.jsp" method="get">
-  <input type="hidden" name="lang" value="it"/>
-  <input name="query" size="44"/>&#160;<input type="submit" value="Cerca"/>
-  <a href="help.html">aiuto</a>
-</form>
-</center>
-<div style="text-align: right;">
-<a href="http://lucene.apache.org/nutch/">
-<img border="0" src="../img/poweredbynutch_01.gif"/>
-</a>
-</div>
-</body>
-</page>
Index: src/web/pages/it/help.xml
===================================================================
--- src/web/pages/it/help.xml	(revision 959954)
+++ src/web/pages/it/help.xml	(working copy)
@@ -1,37 +0,0 @@
-<page>
-
-<title>Nutch: aiuto sulle ricerche</title>
-
-<body>
-
-<h3>Ricerche</h3>
-Per cercare con Nutch, digita solo alcune parole.
-<ul>
-  <li>I risultai includono solo pagine che contengono <span
- style="font-style: italic;">tutte</span> le parole della ricerca.</li>
-  <li>Usa le virgolette attorno alle parole che devono essere adiacenti,
- come una frase, e.g., <span style="font-weight: bold;">"New Zealand"</span>.</li>
-  <li>Anche l'uso della punteggiatura tra le parole attiva la ricerca per frase. In
- questo modo cercare<span style="font-weight: bold;">http://www.apache.org/</span>
- equivale a cercare <span style="font-weight: bold;">"http www apache org"</span>.</li>
-  <li>Le ricerche non tengono conto di lettere maiuscole e minuscole, quindi cercare <span
- style="font-weight: bold;">NuTcH</span> è come cercare <span
- style="font-weight: bold;">nUtCh</span>.</li>
-  <li>Puoi escludere un termine dalla ricerca inserendo un meno prima di esso, e.g., 
- la ricerca <span style="font-weight: bold;">football-nfl</span> 
- troverà tutte le pagine che parlano di football, ma non usano la parola "nfl".</li>
-  <li>Tutto qui!</li>
-</ul>
-<h3>Risultati</h3>
-Ogni pagina trovata ha i seguenti links:
-<ul>
-  <li>(<span style="color: rgb(51, 51, 255);">cached</span>) mostra la versione della
- pagina scaricata da Nutch.</li>
-  <li>(<span style="color: rgb(51, 51, 255);">explain</span>) mostra una spiegazione
- sul punteggio ottenuto dalla pagina.</li>
-  <li>(<span style="color: rgb(51, 51, 255);">anchors</span>) mostra una lista di links
- che riferiscono la pagina che è stata indicizzata.</li>
-</ul>
-
-</body>
-</page>
Index: src/web/pages/it/about.xml
===================================================================
--- src/web/pages/it/about.xml	(revision 959954)
+++ src/web/pages/it/about.xml	(working copy)
@@ -1,51 +0,0 @@
-<page>
-
-<title>A proposito</title>
-
-<menu>
- <item><a href="org.html">Organization</a></item>
- <item><a href="credits.html">Credits</a></item>
- <item><a href="press.html">Press</a></item>
- <item><a href="status.html">Status</a></item>
-</menu>
-
-<body>
-
-<p>Nutch è un progetto che ha come obbiettivo implementare un motore
-di ricerca open-source.</p>
-
-<p>La ricerca sul Web ha un requisito fondamentale per la navigazione
-internet, nonostante ciò il numero dei motori di ricerca sta diminuendo.
-La oligopolia di oggi potrebbe diventare presto un monopolio, con un'uni-
-ca compagnia che controlla quasi tutte le ricerche web per il suo
-profitto commerciale.</p>
-
-<p>Nutch fornisce un'alternativa trasparente ai motori di ricerca commerciali.
-Solo con i risultati di ricerca open-source si può essere sicuri che non ci 
-sia stata nessuna forzatura.(O almeno la loro forzatura è pubblica).
-Tutti i maggiori motori di ricerca commerciali esistenti hanno le loro formule
-proprietarie per i punteggi, e non spiegheranno mai perchè una pagina ottiene
-un certo punteggio. Inoltre, alcuni motori di ricerca determinano quale sito
-indicizzare basandosi sui pagamenti, piuttosto che sui meriti dei siti stessi.
-Nutch, d'altra parte, non ha niente da nascondere e nessun motivo di forzare
-i suoi risultati o il suo crawler se non per dare ad ogni utente i migliori
-risultati possibili.</p>
-
-<p>L'obiettivo di nutch è quello di permettere a chiunque di utilizzare un 
-motore di ricerca mondiale facile ed economico. Ciò è una sfida notevole.
-Per avere successo,il software nutch deve poter:</p>
-<ul>
-  <li>recupera miliardi di pagine al mese</li>
-  <li>mantenere un indice di queste pagine</li>
-  <li>cercare in questo indice fino a 1000 volte al secondo</li>
-  <li>fornire risultati di altissima qualità</li>
-  <li>operare al minimo costo</li>
-</ul>
-
-<p>Questa è una proposta stimolante. Se credi nei meriti di questo progetto,
-per favore dai un contributo, sia come<a href="developers.html">sviluppatore</a>
-o con una <a href="donate.html">donazione</a>
-</p>
-
-</body>
-</page>
Index: src/web/pages/hu/search.xml
===================================================================
--- src/web/pages/hu/search.xml	(revision 959954)
+++ src/web/pages/hu/search.xml	(working copy)
@@ -1,15 +0,0 @@
-<?xml version="1.0" encoding="ISO-8859-1"?>
-<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
-  "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
-
-<page>
-<body>
-<center>
-<form name="search" action="../search.jsp" method="get">
-  <input type="hidden" name="lang" value="hu"/>
-  <input name="query" size="44"/>&#160;<input type="submit" value="Keress"/>
-  <a href="help.html">segtsg</a>
-</form>
-</center>
-</body>
-</page>
Index: src/web/pages/hu/help.xml
===================================================================
--- src/web/pages/hu/help.xml	(revision 959954)
+++ src/web/pages/hu/help.xml	(working copy)
@@ -1,42 +0,0 @@
-<?xml version="1.0" encoding="ISO-8859-1"?>
-<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
-  "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
-
-<page>
-
-<title>Segtsg</title>
-
-<body>
-
-<h3>Keress</h3>
-A keress megkezdshez rj be nhny szt.
-<ul>
-  <li>A tallatok kztt csak azok az oldalak fognak szerepelni, amelyeken <span
- style="font-style: italic;">az sszes</span> keresend sz megtallhat.</li>
-  <li>Idzjelek hasznlatval pontos kifejezsre kereshetsz,
-	pl. <span style="font-weight: bold;">"Soltvadkerti bor"</span>.</li>
-  <li>A szavak kztti pontok hatsra az egyes szvakra keres r a Nutch. 
-  	Teht a <span style="font-weight: bold;">http://www.apache.org/</span>-ra 
-	val keress ugyan az, mintha <span style="font-weight: bold;">"http www
-	apache org"</span>-ra keresnl.</li>
-  <li>A keress nem rzkeny a kis- s nagybetkre, teht a <span
- style="font-weight: bold;">NuTcH</span> itt ugyanaz, mint a<span
- style="font-weight: bold;">nUtCh</span>.</li>
-  <li>Megtilthatod egyes szavak szereplst a tallatok kztt a minusz jel segtsgvel ,pl.
-  <span style="font-weight: bold;">foci -nb1</span> minden oldalt megtall, ami focival foglalkozik,
-  	de nem szerepel rajta az "nb1".</li>
-  <li>Ennyi!</li>
-</ul>
-<h3>Tallatok</h3>
-A tallatok listjban minden :
-<ul>
-  <li>(<span style="color: rgb(51, 51, 255);">trolt</span>) megmutatja
-  	az oldalnak azt a verzijt, amit a Nutch letrolt..</li>
-  <li>(<span style="color: rgb(51, 51, 255);">magyarzat</span>) megmutatja,
-  	hogy mi alapjn rangsorolta az oldalt a Nutch.</li>
-  <li>(<span style="color: rgb(51, 51, 255);">kapcsolatok</span>) megmutatja
-  	az oldal kapcsold lapjait.</li>
-</ul>
-
-</body>
-</page>
Index: src/web/pages/hu/about.xml
===================================================================
--- src/web/pages/hu/about.xml	(revision 959954)
+++ src/web/pages/hu/about.xml	(working copy)
@@ -1,47 +0,0 @@
-<?xml version="1.0" encoding="ISO-8859-1"?>
-<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
-  "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
-
-<page>
-
-<title>nvjegy</title>
-
-<menu>
- <item><a href="http://lucene.apache.org/nutch/credits.html">Rlunk</a></item>
- <item><a href="http://lucene.apache.org/nutch/index.html">Sttusz</a></item>
-</menu>
-
-<body>
-
-<p>A Nutch egy most indul, nylt forrs, webes keresmotor.</p>
-
-<p>Br a webes keress alapveten fontos az interneten val naviglsban, mgis, a
-keresk szma egyre cskken. A ma ltez tbbszerepls keres-piac helyt hamarosan tveheti
-az anyagi alapon mkd egyszemlyes, monopl piac. 
-Ez sem a felhasznlknak, sem az internetnek nem tenne jt.</p>
-
-<p>A Nutch az zleti keresgpekkel szemben knl alternatvt.
-Csak a nylt forrs projektek mentesek minden buktattl. (Vagy legalbbis a
-bennk lv hibk ismertek, publikusak.)  Minden mr meglv keresmotor rangsorolja a tallatokat,
-de nem rjk le, hogy mi alapjn teszik ezt. Radsul, nhny keresmotor nem az oldal alapjn,
-hanem anyagi megfontolsok alapjn indexelik az adott lapot. A Nutch, ezzel ellenttben, semmi
-hasonlt nem tesz, csak a legjobb tallati arnyra sszpontost.</p>
-
-<p>A Nutch clja, hogy brki kpes legyen olcsn nagy teljestmny keresgpet
-zembe lltani. Ez az alapvet cl.
-A sikerhez a Nutch-nak a kvetkezknek kell megfelelnie:</p>
-<ul>
-  <li>t kell tudnia vizsglni tbb millird oldalt havonta</li>
-  <li>indexelni ezeket az oldalakat</li>
-  <li>keresni ezekben az indexekben (1000 tallat msodpercenknt)</li>
-  <li>nagyon jl illeszked tallatokat adni</li>
-  <li>kltsghatkonynak lenni</li>
-</ul>
-
-<p>Ez egy felhvs. Ha a projekt felkeltette az rdekldseset, krlek segts, 
- vagy mint <a href="http://www.apache.org/dev/">fejleszt</a>, vagy mint
- <a href="http://www.apache.org/foundation/contributing.html">tmogat</a>
-</p>
-
-</body>
-</page>
Index: src/web/style/nutch-page.xsl
===================================================================
--- src/web/style/nutch-page.xsl	(revision 959954)
+++ src/web/style/nutch-page.xsl	(working copy)
@@ -1,114 +0,0 @@
-<?xml version="1.0"?>
-<!-- XSLT stylesheet that adds Nutch style, header, and footer
-  elements.  This is used by Ant to generate static html pages. -->
-<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0">
-  <xsl:output method="html" doctype-public="-//W3C//DTD HTML 4.01 Transitional//EN"/>
-  <xsl:template match="page">
-    <html>
-      <xsl:comment>This page is automatically generated.  Do not edit!</xsl:comment>
-      <head>
-<!-- page title -->
-        <title>
-          <xsl:text>Nutch: </xsl:text>
-          <xsl:value-of select="title" disable-output-escaping="yes"/>
-        </title>
-<!-- insert style -->
-        <xsl:copy-of select="document('../include/style.html')"/>
-<!-- specify icon file -->
-      <link rel="icon" href="../img/favicon.ico" type="image/x-icon"/>
-      <link rel="shortcut icon" href="../img/favicon.ico" type="image/x-icon"/>
-
-      <script type="text/javascript">
-      <xsl:comment>
-function queryfocus() {
-  search = document.search;
-  if (search != null) { search.query.focus(); }
-}
-<xsl:text>// </xsl:text>
-</xsl:comment>
-      </script>
-      </head>
-      <body onLoad="queryfocus();">
-<!-- insert localized header -->
-        <xsl:copy-of select="document('include/header.html')"/>
-        <table width="635" border="0" cellpadding="0" cellspacing="0">
-          <tr valign="top">
-            <td width="140">
-              <xsl:call-template name="subnavi"/>
-            </td>
-            <td width="20" background="../img/reiter/_spacer_cccccc.gif">
-              <xsl:text disable-output-escaping="yes">&amp;#160;</xsl:text>
-            </td>
-            <td width="475" class="body">
-              <xsl:call-template name="body"/>
-            </td>
-          </tr>
-        </table>
-<!-- insert nutch footer -->
-        <xsl:copy-of select="document('../include/footer.html')"/>
-      </body>
-    </html>
-  </xsl:template>
-<!-- included menu -->
-  <xsl:template name="subnavi">
-    <table width="100%" cellpadding="0" cellspacing="0">
-      <xsl:for-each select="menu/item">
-        <xsl:if test="not(.='')">
-          <tr class="menuTd" height="25">
-            <td class="menuTd" onmouseover="this.className='menuTdhover';" onmouseout="this.className='menuTd'" width="100%">
-              <xsl:text disable-output-escaping="yes">&amp;#160;:: </xsl:text>
-              <xsl:variable name="url" select="a/@href"/>
-              <a href="{$url}" class="menuEntry">
-                <xsl:value-of select="."/>
-              </a>
-            </td>
-          </tr>
-          <tr height="1px">
-            <td>
-              <img src="../img/reiter/spacer_666666.gif" height="1" width="100%"/>
-            </td>
-          </tr>
-        </xsl:if>
-      </xsl:for-each>
-      <tr>
-        <td>
-          <xsl:text disable-output-escaping="yes">&amp;#160;</xsl:text>
-        </td>
-      </tr>
-    </table>
-  </xsl:template>
-<!-- /included menu -->
-<!-- included body -->
-  <xsl:template name="body">
-    <table width="475" border="0" cellpadding="0" cellspacing="0">
-      <tr>
-        <td class="title" height="125" width="275" valign="bottom">
-          <xsl:value-of select="title" disable-output-escaping="yes"/>
-        </td>
-        <td height="125" width="200" valign="bottom">
-          <img src="../img/reiter/robots.gif"/>
-        </td>
-      </tr>
-    </table>
-    <br class="br"/>
-    <xsl:for-each select="body/node()">
-      <xsl:choose>
-<!-- orange intro -->
-        <xsl:when test="name()='p' and position() &lt; 3">
-          <span class="intro">
-            <xsl:copy-of select="."/>
-          </span>
-        </xsl:when>
-<!-- all other text -->
-        <xsl:otherwise>
-          <span class="bodytext">
-            <xsl:copy-of select="."/>
-          </span>
-        </xsl:otherwise>
-      </xsl:choose>
-    </xsl:for-each>
-    <br class="br"/>
-    <br class="br"/>
-  </xsl:template>
-<!-- /included body -->
-</xsl:stylesheet>
Index: src/web/style/nutch-header.xsl
===================================================================
--- src/web/style/nutch-header.xsl	(revision 959954)
+++ src/web/style/nutch-header.xsl	(working copy)
@@ -1,49 +0,0 @@
-<?xml version="1.0"?>
-<!-- XSLT stylesheet that generates localized versions of the page header. -->
-<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0">
-  <xsl:template match="header-menu">
-    <xsl:comment>This file is automatically generated.  Do not edit!</xsl:comment>
-    <table width="635" border="0" cellpadding="0" cellspacing="0">
-      <tr>
-        <td valign="bottom" width="140" rowspan="2">
-	  <a href="./"><img src="../img/reiter/logo_nutch.gif" border="0"/></a>
-          <img src="../img/reiter/spacer_666666.gif" width="140" height="1"/>
-        </td>
-      </tr>
-      <tr>
-        <td width="495" valign="bottom" align="right">
-          <table border="0" cellpadding="0" cellspacing="0" width="495">
-            <tr>
-              <td background="../img/reiter/_bg_reiter.gif" width="400">
-                <xsl:text disable-output-escaping="yes">&amp;#160;</xsl:text>
-              </td>
-<!-- menu -->
-              <xsl:for-each select="item">
-                <td height="28" valign="bottom" width="10">
-                  <xsl:choose>
-                    <xsl:when test="position()=1">
-                      <img src="../img/reiter/reiter_inactive_le1.gif" border="0"/>
-                    </xsl:when>
-                    <xsl:otherwise>
-                      <img src="../img/reiter/reiter_inactive_le.gif" border="0"/>
-                    </xsl:otherwise>
-                  </xsl:choose>
-                </td>
-                <td background="../img/reiter/_bg_reiter_inactive.gif" valign="bottom" nowrap="nowrap">
-                  <xsl:variable name="url" select="a/@href"/>
-                  <a class="bodytext" href="{$url}">
-                    <xsl:value-of select="." disable-output-escaping="yes"/>
-                  </a>
-                </td>
-                <td height="28" valign="bottom" width="10">
-                  <img src="../img/reiter/reiter_inactive_ri.gif" border="0"/>
-                </td>
-              </xsl:for-each>
-<!-- menue -->
-            </tr>
-          </table>
-        </td>
-      </tr>
-    </table>
-  </xsl:template>
-</xsl:stylesheet>
Index: src/web/include/ms/header.xml
===================================================================
--- src/web/include/ms/header.xml	(revision 959954)
+++ src/web/include/ms/header.xml	(working copy)
@@ -1,4 +0,0 @@
-<header-menu>
- <item><a href="about.html">Perihal</a></item>
- <item><a href="http://wiki.apache.org/nutch/FAQ">Soalan-Soalan Lazim</a></item>
-</header-menu>
Index: src/web/include/zh/header.xml
===================================================================
--- src/web/include/zh/header.xml	(revision 959954)
+++ src/web/include/zh/header.xml	(working copy)
@@ -1,4 +0,0 @@
-<header-menu>
- <item><a href="about.html">简介</a></item>
- <item><a href="http://wiki.apache.org/nutch/FAQ">常见问题</a></item>
-</header-menu>
Index: src/web/include/pt/header.xml
===================================================================
--- src/web/include/pt/header.xml	(revision 959954)
+++ src/web/include/pt/header.xml	(working copy)
@@ -1,6 +0,0 @@
-<?xml version="1.0" encoding="ISO-8859-1"?>
-
-<header-menu>
- <item><a href="about.html">Sobre</a></item>
- <item><a href="http://wiki.apache.org/nutch/FAQ">Perguntas Mais Freqntes</a></item>
-</header-menu>
Index: src/web/include/ca/header.xml
===================================================================
--- src/web/include/ca/header.xml	(revision 959954)
+++ src/web/include/ca/header.xml	(working copy)
@@ -1,7 +0,0 @@
-<?xml version="1.0" encoding="ISO-8859-1"?>
-
-<header-menu>
- <item><a href="about.html">Quan a</a></item>
- <item><a href="http://wiki.apache.org/nutch/FAQ">FAQ</a></item>
-</header-menu>
-
Index: src/web/include/sr/header.xml
===================================================================
--- src/web/include/sr/header.xml	(revision 959954)
+++ src/web/include/sr/header.xml	(working copy)
@@ -1,4 +0,0 @@
-<header-menu>
- <item><a href="about.html">О Nutch-у</a></item>
- <item><a href="http://wiki.apache.org/nutch/FAQ">Најчешћа питања</a></item>
-</header-menu>
Index: src/web/include/de/header.xml
===================================================================
--- src/web/include/de/header.xml	(revision 959954)
+++ src/web/include/de/header.xml	(working copy)
@@ -1,6 +0,0 @@
-<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
-  "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
-<header-menu>
- <item><a href="about.html">&Uuml;ber</a></item>
- <item><a href="http://wiki.apache.org/nutch/FAQ">FAQ</a></item>
-</header-menu>
Index: src/web/include/sv/header.xml
===================================================================
--- src/web/include/sv/header.xml	(revision 959954)
+++ src/web/include/sv/header.xml	(working copy)
@@ -1,4 +0,0 @@
-<header-menu>
- <item><a href="about.html">Om</a></item>
- <item><a href="http://wiki.apache.org/nutch/FAQ">Vanliga frågor (FAQ)</a></item>
-</header-menu>
Index: src/web/include/fi/header.xml
===================================================================
--- src/web/include/fi/header.xml	(revision 959954)
+++ src/web/include/fi/header.xml	(working copy)
@@ -1,4 +0,0 @@
-<header-menu>
- <item><a href="about.html">Projektista</a></item>
- <item><a href="http://wiki.apache.org/nutch/FAQ">FAQ</a></item>
-</header-menu>
Index: src/web/include/footer.html
===================================================================
--- src/web/include/footer.html	(revision 959954)
+++ src/web/include/footer.html	(working copy)
@@ -1,31 +0,0 @@
-<table width="635" border="0" cellpadding="0" cellspacing="0">
-  <tr>
-    <td width="140">&#160;</td>
-    <td width="20">&#160;</td>
-    <td width="475" align="center">
-    <span class="bodytext">
-    <br/>
-      <a href="../ca/">ca</a> |
-      <a href="../de/">de</a> |
-      <a href="../en/">en</a> |
-      <a href="../es/">es</a> |
-      <a href="../fi/">fi</a> |
-      <a href="../fr/">fr</a> |
-      <a href="../hu/">hu</a> |
-      <a href="../it/">it</a> |
-      <a href="../jp/">jp</a> |
-      <a href="../ms/">ms</a> |
-      <a href="../nl/">nl</a> |
-      <a href="../pl/">pl</a> |
-      <a href="../pt/">pt</a> |
-      <a href="../sh/">sh</a> |
-      <a href="../sr/">sr</a> |
-      <a href="../sv/">sv</a> |
-      <a href="../th/">th</a> |
-      <a href="../zh/">zh</a>
-    </span>
-    </td>
-  </tr>
-  <tr>
-  </tr>
-</table>
Index: src/web/include/en/header.xml
===================================================================
--- src/web/include/en/header.xml	(revision 959954)
+++ src/web/include/en/header.xml	(working copy)
@@ -1,4 +0,0 @@
-<header-menu>
- <item><a href="about.html">About</a></item>
- <item><a href="http://wiki.apache.org/nutch/FAQ">FAQ</a></item>
-</header-menu>
Index: src/web/include/style.html
===================================================================
--- src/web/include/style.html	(revision 959954)
+++ src/web/include/style.html	(working copy)
@@ -1,77 +0,0 @@
-<style type="text/css">
-
-.menuTd {
-  background-color: #F9F7F4;
-  height: 25px;
-}
-
-.menuTdhover {
-  background-color: #ECE5DC;
-  height: 25px;
-}
-
-.menuEntry {
-  font-family: Arial, Helvetica, sans-serif;
-  font-size: 12px;
-  color: #000000;
-  text-decoration: none;
-}
-
-.body {
-  background-color: #F9F7F4;
-}
-
-.bodytext {
-  font-family: Arial, Helvetica, sans-serif;
-  font-size: 12px;
-  color: #000000;
-  text-decoration: none;
-}
-
-.title {
-  font-family: Arial, Helvetica, sans-serif;
-  font-size: 26px;
-  color: #FF9900;
-  text-decoration: none;
-}
-
-.intro {
-  font-family: Arial, Helvetica, sans-serif;
-  font-size: 12px;
-  color: #FF9900;
-  text-decoration: none;
-}
-
-.orangeTd {
-  background-color: #FF9900
-}
-
-ul {
-  list-style-image: url(../img/reiter/ul.gif)
-}
-
-h3 {
-  font-family: Arial, Helvetica, sans-serif;
-  font-size: 16px;
-  color: #000000;
-}
-
-h4 {
-  font-family: Arial, Helvetica, sans-serif;
-  font-size: 14px;
-  color: #000000;
-}
-
-.url {
-  color: #996600;
-}
-
-.highlight {
-  font-weight: bold;
-}
-
-.ellipsis {
-  font-weight: bold;
-}
-
-</style>
Index: src/web/include/fr/header.xml
===================================================================
--- src/web/include/fr/header.xml	(revision 959954)
+++ src/web/include/fr/header.xml	(working copy)
@@ -1,4 +0,0 @@
-<header-menu>
- <item><a href="about.html">A propos</a></item>
- <item><a href="http://wiki.apache.org/nutch/FAQ">Questions fréquentes</a></item>
-</header-menu>
Index: src/web/include/es/header.xml
===================================================================
--- src/web/include/es/header.xml	(revision 959954)
+++ src/web/include/es/header.xml	(working copy)
@@ -1,4 +0,0 @@
-<header-menu>
- <item><a href="about.html">Acerca de</a></item>
- <item><a href="http://wiki.apache.org/nutch/FAQ">Preguntas frecuentes</a></item>
-</header-menu>
Index: src/web/include/nl/header.xml
===================================================================
--- src/web/include/nl/header.xml	(revision 959954)
+++ src/web/include/nl/header.xml	(working copy)
@@ -1,6 +0,0 @@
-<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
-  "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
-<header-menu>
- <item><a href="about.html">Over</a></item>
- <item><a href="http://wiki.apache.org/nutch/FAQ">Veelgestelde vragen (FAQ)</a></item>
-</header-menu>
Index: src/web/include/jp/header.xml
===================================================================
--- src/web/include/jp/header.xml	(revision 959954)
+++ src/web/include/jp/header.xml	(working copy)
@@ -1,4 +0,0 @@
-<header-menu>
- <item><a href="about.html">Nutchについて</a></item>
- <item><a href="http://wiki.apache.org/nutch/FAQ">よくある質問</a></item>
-</header-menu>
Index: src/web/include/sh/header.xml
===================================================================
--- src/web/include/sh/header.xml	(revision 959954)
+++ src/web/include/sh/header.xml	(working copy)
@@ -1,4 +0,0 @@
-<header-menu>
- <item><a href="about.html">O Nutch-u</a></item>
- <item><a href="http://wiki.apache.org/nutch/FAQ">Najčešća pitanja</a></item>
-</header-menu>
\ No newline at end of file
Index: src/web/include/pl/header.xml
===================================================================
--- src/web/include/pl/header.xml	(revision 959954)
+++ src/web/include/pl/header.xml	(working copy)
@@ -1,5 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<header-menu>
-	<item><a href="about.html">O Nutch</a></item>
-	<item><a href="http://wiki.apache.org/nutch/FAQ">Częste pytania</a></item>
-</header-menu>
Index: src/web/include/th/header.xml
===================================================================
--- src/web/include/th/header.xml	(revision 959954)
+++ src/web/include/th/header.xml	(working copy)
@@ -1,4 +0,0 @@
-<header-menu>
- <item><a href="about.html">เกี่ยวกับ</a></item>
- <item><a href="http://wiki.apache.org/nutch/FAQ">คำถามที่ถามบ่อย</a></item>
-</header-menu>
Index: src/web/include/it/header.xml
===================================================================
--- src/web/include/it/header.xml	(revision 959954)
+++ src/web/include/it/header.xml	(working copy)
@@ -1,4 +0,0 @@
-<header-menu>
- <item><a href="about.html">A proposito</a></item>
- <item><a href="http://wiki.apache.org/nutch/FAQ">Domande frequenti</a></item>
-</header-menu>
Index: src/web/include/hu/header.xml
===================================================================
--- src/web/include/hu/header.xml	(revision 959954)
+++ src/web/include/hu/header.xml	(working copy)
@@ -1,8 +0,0 @@
-<?xml version="1.0" encoding="ISO-8859-1"?>
-<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
-  "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
-
-<header-menu>
- <item><a href="about.html">A Nutch-rl</a></item>
- <item><a href="http://wiki.apache.org/nutch/FAQ">GYIK</a></item>
-</header-menu>
Index: src/web/log4j.properties
===================================================================
--- src/web/log4j.properties	(revision 959954)
+++ src/web/log4j.properties	(working copy)
@@ -1,7 +0,0 @@
-# log4j configuration used by the front-end container
-
-log4j.rootLogger=info,stdout
-log4j.threshhold=ALL
-log4j.appender.stdout=org.apache.log4j.ConsoleAppender
-log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
-log4j.appender.stdout.layout.ConversionPattern=%d{ISO8601} %-5p %c{1} - %m%n
Index: src/web/jsp/text.jsp
===================================================================
--- src/web/jsp/text.jsp	(revision 959954)
+++ src/web/jsp/text.jsp	(working copy)
@@ -1,71 +0,0 @@
-<%--
-  Licensed to the Apache Software Foundation (ASF) under one or more
-  contributor license agreements.  See the NOTICE file distributed with
-  this work for additional information regarding copyright ownership.
-  The ASF licenses this file to You under the Apache License, Version 2.0
-  (the "License"); you may not use this file except in compliance with
-  the License.  You may obtain a copy of the License at
-  
-  http://www.apache.org/licenses/LICENSE-2.0
-  
-  Unless required by applicable law or agreed to in writing, software
-  distributed under the License is distributed on an "AS IS" BASIS,
-  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-  See the License for the specific language governing permissions and
-  limitations under the License.
---%>
-<%@ page
-  session="false"
-
-  import="java.io.*"
-  import="java.util.*"
-
-  import="org.apache.nutch.searcher.*"
-  import="org.apache.nutch.parse.ParseText"
-  import="org.apache.hadoop.conf.Configuration"
-  import="org.apache.nutch.util.NutchConfiguration"
-
-%><%
-
-  // show the content of a hit as plain text
-  Configuration nutchConf = NutchConfiguration.get(application);
-  NutchBean bean = NutchBean.get(application, nutchConf);
-
-  bean.LOG.info("text request from " + request.getRemoteAddr());
-
-  Hit hit = new Hit(Integer.parseInt(request.getParameter("idx")),
-                    Integer.parseInt(request.getParameter("id")));
-  HitDetails details = bean.getDetails(hit);
-
-  String text = bean.getParseText(details).getText();
-  if (text.trim().equals(""))
-    text = null;
-
-  // 20041005, xing
-  // This "CharEncodingForConversion" thing is only pertinent to
-  // html parser (plugin parse-html) in current nutch. None of
-  // other parser plugins are into it. So we worry it later.
-
-%><base href="<%=details.getValue("url")%>">
-<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
-<%
-  out.flush();
-%>
-
-<%@ taglib uri="http://jakarta.apache.org/taglibs/i18n" prefix="i18n" %>
-<i18n:bundle baseName="org.nutch.jsp.text"/>
-<h2 style="{color: rgb(255, 153, 0)}"><i18n:message key="title"/></h2>
-
-<i18n:message key="note">
-  <i18n:messageArg value="<%=details.getValue("url")%>"/>
-</i18n:message>
-
-<hr>
-
-<% if (text != null) {%>
-<pre>
-<%= text %>
-</pre>
-<% } else { %>
-<i18n:message key="noText"/>
-<% } %>
Index: src/web/jsp/search.jsp
===================================================================
--- src/web/jsp/search.jsp	(revision 959954)
+++ src/web/jsp/search.jsp	(working copy)
@@ -1,357 +0,0 @@
-<%--
-  Licensed to the Apache Software Foundation (ASF) under one or more
-  contributor license agreements.  See the NOTICE file distributed with
-  this work for additional information regarding copyright ownership.
-  The ASF licenses this file to You under the Apache License, Version 2.0
-  (the "License"); you may not use this file except in compliance with
-  the License.  You may obtain a copy of the License at
-  
-  http://www.apache.org/licenses/LICENSE-2.0
-  
-  Unless required by applicable law or agreed to in writing, software
-  distributed under the License is distributed on an "AS IS" BASIS,
-  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-  See the License for the specific language governing permissions and
-  limitations under the License.
---%>
-<%@ page 
-  session="false"
-  contentType="text/html; charset=UTF-8"
-  pageEncoding="UTF-8"
-
-  import="java.io.*"
-  import="java.util.*"
-  import="java.net.*"
-  import="javax.servlet.http.*"
-  import="javax.servlet.*"
-
-  import="org.apache.nutch.html.Entities"
-  import="org.apache.nutch.metadata.Nutch"
-  import="org.apache.nutch.searcher.*"
-  import="org.apache.nutch.plugin.*"
-  import="org.apache.nutch.clustering.*"
-  import="org.apache.hadoop.conf.*"
-  import="org.apache.nutch.util.NutchConfiguration"
-%><%!
-  /**
-   * Number of hits to retrieve and cluster if clustering extension is available
-   * and clustering is on. By default, 100. Configurable via nutch-conf.xml.
-   */
-  private int HITS_TO_CLUSTER;
-
-  /**
-   * Maximum hits per page to be displayed.
-   */
-  private int MAX_HITS_PER_PAGE;
-
-  /**
-   * An instance of the clustering extension, if available.
-   */
-  private OnlineClusterer clusterer;
-  
-  /**
-   * Nutch configuration for this servlet.
-   */
-  private Configuration nutchConf;
-
-  /**
-   * Initialize search bean.
-   */
-  public void jspInit() {
-    super.jspInit();
-    
-    final ServletContext application = getServletContext(); 
-    nutchConf = NutchConfiguration.get(application);
-	  HITS_TO_CLUSTER = nutchConf.getInt("extension.clustering.hits-to-cluster", 100);
-    MAX_HITS_PER_PAGE = nutchConf.getInt("searcher.max.hits.per.page", -1);
-
-    try {
-      clusterer = new OnlineClustererFactory(nutchConf).getOnlineClusterer();
-    } catch (PluginRuntimeException e) {
-      super.log("Could not initialize online clusterer: " + e.toString());
-    }
-  }
-%>
-
-<%--
-// Uncomment this to enable query refinement.
-// Do the same to "refine-query.jsp" below.,
-<%@ include file="./refine-query-init.jsp" %>
---%>
-
-<%
-  // The Nutch bean instance is initialized through a ServletContextListener 
-  // that is setup in the web.xml file
-  NutchBean bean = NutchBean.get(application, nutchConf);
-  // set the character encoding to use when interpreting request values 
-  request.setCharacterEncoding("UTF-8");
-
-  bean.LOG.info("query request from " + request.getRemoteAddr());
-
-  // get query from request
-  String queryString = request.getParameter("query");
-  if (queryString == null)
-    queryString = "";
-  String htmlQueryString = Entities.encode(queryString);
-  
-  // a flag to make the code cleaner a bit.
-  boolean clusteringAvailable = (clusterer != null);
-
-  String clustering = "";
-  if (clusteringAvailable && "yes".equals(request.getParameter("clustering")))
-    clustering = "yes";
-
-  int start = 0;          // first hit to display
-  String startString = request.getParameter("start");
-  if (startString != null)
-    start = Integer.parseInt(startString);
-
-  int hitsPerPage = 10;          // number of hits to display
-  String hitsString = request.getParameter("hitsPerPage");
-  if (hitsString != null)
-    hitsPerPage = Integer.parseInt(hitsString);
-  if(MAX_HITS_PER_PAGE > 0 && hitsPerPage > MAX_HITS_PER_PAGE)
-    hitsPerPage = MAX_HITS_PER_PAGE;
-
-  int hitsPerSite = 2;                            // max hits per site
-  String hitsPerSiteString = request.getParameter("hitsPerSite");
-  if (hitsPerSiteString != null)
-    hitsPerSite = Integer.parseInt(hitsPerSiteString);
-
-  String sort = request.getParameter("sort");
-  boolean reverse =
-    sort!=null && "true".equals(request.getParameter("reverse"));
-
-  String params = "&hitsPerPage="+hitsPerPage
-     +(sort==null ? "" : "&sort="+sort+(reverse?"&reverse=true":""));
-
-  int hitsToCluster = HITS_TO_CLUSTER;            // number of hits to cluster
-
-  // get the lang from request
-  String queryLang = request.getParameter("lang");
-  if (queryLang == null) { queryLang = ""; }
-  Query query = Query.parse(queryString, queryLang, nutchConf);
-  bean.LOG.info("query: " + queryString);
-  bean.LOG.info("lang: " + queryLang);
-
-  String language =
-    ResourceBundle.getBundle("org.nutch.jsp.search", request.getLocale())
-    .getLocale().getLanguage();
-  String requestURI = HttpUtils.getRequestURL(request).toString();
-  String base = requestURI.substring(0, requestURI.lastIndexOf('/'));
-  String rss = "../opensearch?query="+htmlQueryString
-    +"&hitsPerSite="+hitsPerSite+"&lang="+queryLang+params;
-%><!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
-<%
-  // To prevent the character encoding declared with 'contentType' page
-  // directive from being overriden by JSTL (apache i18n), we freeze it
-  // by flushing the output buffer. 
-  // see http://java.sun.com/developer/technicalArticles/Intl/MultilingualJSP/
-  out.flush();
-%>
-<%@ taglib uri="http://jakarta.apache.org/taglibs/i18n" prefix="i18n" %>
-<i18n:bundle baseName="org.nutch.jsp.search"/>
-<html lang="<%= language %>">
-<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
-<head>
-<title>Nutch: <i18n:message key="title"/></title>
-<link rel="icon" href="img/favicon.ico" type="image/x-icon"/>
-<link rel="shortcut icon" href="img/favicon.ico" type="image/x-icon"/>
-<link rel="alternate" type="application/rss+xml" title="RSS" href="<%=rss%>"/>
-<jsp:include page="include/style.html"/>
-<base href="<%= base  + "/" + language %>/">
-<script type="text/javascript">
-<!--
-function queryfocus() { document.search.query.focus(); }
-// -->
-</script>
-</head>
-
-<body onLoad="queryfocus();">
-
-<jsp:include page="<%= language + \"/include/header.html\"%>"/>
-
- <form name="search" action="../search.jsp" method="get">
- <input name="query" size=44 value="<%=htmlQueryString%>">
- <input type="hidden" name="hitsPerPage" value="<%=hitsPerPage%>">
- <input type="hidden" name="lang" value="<%=language%>">
- <input type="submit" value="<i18n:message key="search"/>">
- <% if (clusteringAvailable) { %>
-   <input id="clustbox" type="checkbox" name="clustering" value="yes" <% if (clustering.equals("yes")) { %>CHECKED<% } %>>
-    <label for="clustbox"><i18n:message key="clustering"/></label>
- <% } %>
- <a href="help.html">help</a>
- </form>
-
-<%--
-// Uncomment this to enable query refinement.
-// Do the same to "refine-query-init.jsp" above.
-<%@ include file="./refine-query.jsp" %>
---%>
-
-<%
-   // how many hits to retrieve? if clustering is on and available,
-   // take "hitsToCluster", otherwise just get hitsPerPage
-   int hitsToRetrieve = (clusteringAvailable && clustering.equals("yes") ? hitsToCluster : hitsPerPage);
-
-   if (clusteringAvailable && clustering.equals("yes")) {
-     bean.LOG.info("Clustering is on, hits to retrieve: " + hitsToRetrieve);
-   }
-
-   // perform query
-    // NOTE by Dawid Weiss:
-    // The 'clustering' window actually moves with the start
-    // position.... this is good, bad?... ugly?....
-   Hits hits;
-   try{
-      query.getParams().initFrom(start + hitsToRetrieve, hitsPerSite, "site", sort, reverse);
-     hits = bean.search(query);
-   } catch (IOException e){
-     hits = new Hits(0,new Hit[0]);	
-   }
-   int end = (int)Math.min(hits.getLength(), start + hitsPerPage);
-   int length = end-start;
-   int realEnd = (int)Math.min(hits.getLength(), start + hitsToRetrieve);
-
-   Hit[] show = hits.getHits(start, realEnd-start);
-   HitDetails[] details = bean.getDetails(show);
-   Summary[] summaries = bean.getSummary(details, query);
-   bean.LOG.info("total hits: " + hits.getTotal());
-%>
-
-<i18n:message key="hits">
-  <i18n:messageArg value="<%=new Long((end==0)?0:(start+1))%>"/>
-  <i18n:messageArg value="<%=new Long(end)%>"/>
-  <i18n:messageArg value="<%=new Long(hits.getTotal())%>"/>
-</i18n:message>
-
-<%
-// be responsive
-out.flush();
-%>
-
-<br><br>
-
-<% if (clustering.equals("yes") && length != 0) { %>
-<table border=0 cellspacing="3" cellpadding="0">
-
-<tr>
-
-<td valign="top">
-
-<% } %>
-
-<%
-  for (int i = 0; i < length; i++) {      // display the hits
-    Hit hit = show[i];
-    HitDetails detail = details[i];
-    String title = detail.getValue("title");
-    String url = detail.getValue("url");
-    String id = "idx=" + hit.getIndexNo() + "&id=" + hit.getUniqueKey();
-    String summary = summaries[i].toHtml(true);
-    String caching = detail.getValue("cache");
-    boolean showSummary = true;
-    boolean showCached = true;
-    if (caching != null) {
-      showSummary = !caching.equals(Nutch.CACHING_FORBIDDEN_ALL);
-      showCached = !caching.equals(Nutch.CACHING_FORBIDDEN_NONE);
-    }
-
-    if (title == null || title.equals("")) {      // use url for docs w/o title
-      title = url;
-    }
-    %>
-    <b><a href="<%=url%>"><%=Entities.encode(title)%></a></b>
-    <%@ include file="more.jsp" %>
-    <% if (!"".equals(summary) && showSummary) { %>
-    <br><%=summary%>
-    <% } %>
-    <br>
-    <span class="url"><%=Entities.encode(url)%></span>
-    <%
-      if (showCached) {
-        %>(<a href="../cached.jsp?<%=id%>"><i18n:message key="cached"/></a>) <%
-    }
-    %>
-    (<a href="../explain.jsp?<%=id%>&query=<%=URLEncoder.encode(queryString, "UTF-8")%>&lang=<%=queryLang%>"><i18n:message key="explain"/></a>)
-    (<a href="../anchors.jsp?<%=id%>"><i18n:message key="anchors"/></a>)
-    <% if (hit.moreFromDupExcluded()) {
-    String more =
-    "query="+URLEncoder.encode("site:"+hit.getDedupValue()+" "+queryString, "UTF8")
-    +params+"&hitsPerSite="+0
-    +"&lang="+queryLang
-    +"&clustering="+clustering;%>
-    (<a href="../search.jsp?<%=more%>"><i18n:message key="moreFrom"/>
-     <%=hit.getDedupValue()%></a>)
-    <% } %>
-    <br><br>
-<% } %>
-
-<% if (clustering.equals("yes") && length != 0) { %>
-
-</td>
-
-<!-- clusters -->
-<td style="border-right: 1px dotted gray;" />&#160;</td>
-<td align="left" valign="top" width="25%">
-<%@ include file="cluster.jsp" %>
-</td>
-
-</tr>
-</table>
-
-<% } %>
-
-<%
-
-if ((hits.totalIsExact() && end < hits.getTotal()) // more hits to show
-    || (!hits.totalIsExact() && (hits.getLength() > start+hitsPerPage))) {
-%>
-    <form name="next" action="../search.jsp" method="get">
-    <input type="hidden" name="query" value="<%=htmlQueryString%>">
-    <input type="hidden" name="lang" value="<%=queryLang%>">
-    <input type="hidden" name="start" value="<%=end%>">
-    <input type="hidden" name="hitsPerPage" value="<%=hitsPerPage%>">
-    <input type="hidden" name="hitsPerSite" value="<%=hitsPerSite%>">
-    <input type="hidden" name="clustering" value="<%=clustering%>">
-    <input type="submit" value="<i18n:message key="next"/>">
-<% if (sort != null) { %>
-    <input type="hidden" name="sort" value="<%=sort%>">
-    <input type="hidden" name="reverse" value="<%=reverse%>">
-<% } %>
-    </form>
-<%
-    }
-
-if ((!hits.totalIsExact() && (hits.getLength() <= start+hitsPerPage))) {
-%>
-    <form name="showAllHits" action="../search.jsp" method="get">
-    <input type="hidden" name="query" value="<%=htmlQueryString%>">
-    <input type="hidden" name="lang" value="<%=queryLang%>">
-    <input type="hidden" name="hitsPerPage" value="<%=hitsPerPage%>">
-    <input type="hidden" name="hitsPerSite" value="0">
-    <input type="hidden" name="clustering" value="<%=clustering%>">
-    <input type="submit" value="<i18n:message key="showAllHits"/>">
-<% if (sort != null) { %>
-    <input type="hidden" name="sort" value="<%=sort%>">
-    <input type="hidden" name="reverse" value="<%=reverse%>">
-<% } %>
-    </form>
-<%
-    }
-%>
-
-<table bgcolor="3333ff" align="right">
-<tr><td bgcolor="ff9900"><a href="<%=rss%>"><font color="ffffff"><b>RSS</b>
-</font></a></td></tr>
-</table>
-
-<p>
-<a href="http://wiki.apache.org/nutch/FAQ">
-<img border="0" src="../img/poweredbynutch_01.gif">
-</a>
-
-<jsp:include page="/include/footer.html"/>
-
-</body>
-</html>
Index: src/web/jsp/cached.jsp
===================================================================
--- src/web/jsp/cached.jsp	(revision 959954)
+++ src/web/jsp/cached.jsp	(working copy)
@@ -1,110 +0,0 @@
-<%--
-  Licensed to the Apache Software Foundation (ASF) under one or more
-  contributor license agreements.  See the NOTICE file distributed with
-  this work for additional information regarding copyright ownership.
-  The ASF licenses this file to You under the Apache License, Version 2.0
-  (the "License"); you may not use this file except in compliance with
-  the License.  You may obtain a copy of the License at
-  
-  http://www.apache.org/licenses/LICENSE-2.0
-  
-  Unless required by applicable law or agreed to in writing, software
-  distributed under the License is distributed on an "AS IS" BASIS,
-  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-  See the License for the specific language governing permissions and
-  limitations under the License.
---%>
-<%@ page
-  session="false"
-  contentType="text/html; charset=UTF-8"
-  import="java.io.*"
-  import="java.util.*"
-
-  import="org.apache.nutch.searcher.*"
-  import="org.apache.nutch.parse.ParseData"
-  import="org.apache.nutch.metadata.Metadata"
-  import="org.apache.nutch.metadata.Nutch"
-  import="org.apache.hadoop.conf.Configuration"
-  import="org.apache.nutch.util.NutchConfiguration"
-%><%
-  Configuration nutchConf = NutchConfiguration.get(application);
-  NutchBean bean = NutchBean.get(application, nutchConf);
-  bean.LOG.info("cache request from " + request.getRemoteAddr());
-  Hit hit = new Hit(Integer.parseInt(request.getParameter("idx")),
-                    request.getParameter("id"));
-  HitDetails details = bean.getDetails(hit);
-  String id = "idx=" + hit.getIndexNo() + "&id=" + hit.getUniqueKey();
-
-  String language =
-    ResourceBundle.getBundle("org.nutch.jsp.cached", request.getLocale())
-    .getLocale().getLanguage();
-
-  Metadata metaData = bean.getParseData(details).getContentMeta();
-
-  String content = null;
-  String contentType = (String) metaData.get(Metadata.CONTENT_TYPE);
-  if (contentType.startsWith("text/html")) {
-    // FIXME : it's better to emit the original 'byte' sequence 
-    // with 'charset' set to the value of 'CharEncoding',
-    // but I don't know how to emit 'byte sequence' in JSP.
-    // out.getOutputStream().write(bean.getContent(details)) may work, 
-    // but I'm not sure.
-    String encoding = (String) metaData.get("CharEncodingForConversion"); 
-    if (encoding != null) {
-      try {
-        content = new String(bean.getContent(details), encoding);
-      }
-      catch (UnsupportedEncodingException e) {
-        // fallback to windows-1252
-        content = new String(bean.getContent(details), "windows-1252");
-      }
-    }
-    else 
-      content = new String(bean.getContent(details));
-  }
-%>
-<!--
-<base href="<%=details.getValue("url")%>">
--->
-<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
-<%
-  out.flush();
-%>
-<%@ taglib uri="http://jakarta.apache.org/taglibs/i18n" prefix="i18n" %>
-<i18n:bundle baseName="org.nutch.jsp.cached"/>
-<h2 style="{color: rgb(255, 153, 0)}"><i18n:message key="title"/></h2>
-<h3>
-<i18n:message key="page">
-  <i18n:messageArg value="<%=details.getValue(\"url\")%>"/>
-</i18n:message>
-</h3>
-<hr>
-<!-- 
-   FIXME: have to sanitize 'content' : e.g. removing unncessary part
-        of head elememt
--->
-<%
-   String caching = details.getValue("cache");
-   String url = details.getValue("url");
-   if (caching != null && !caching.equals(Nutch.CACHING_FORBIDDEN_NONE)) {
-%>
-Display of this content was administratively prohibited by the webmaster.
-You may visit the original page instead: <a href="<%=url%>"><%=url%></a>.
-<%
-     return;
-   }
-%>
-<% if (contentType.startsWith("text/html")) {%>
-
-<% if (content != null && !content.equals("")) {%>
-<%= content %>
-<% } else { %>
-<i18n:message key="noContent"/>
-<% } %>
-
-<% } else { %>
-
-The cached content has mime type "<%=contentType%>",
-click this <a href="./servlet/cached?<%=id%>">link</a> to download it directly.
-
-<% } %>
Index: src/web/jsp/index.jsp
===================================================================
--- src/web/jsp/index.jsp	(revision 959954)
+++ src/web/jsp/index.jsp	(working copy)
@@ -1,28 +0,0 @@
-<%--
-  Licensed to the Apache Software Foundation (ASF) under one or more
-  contributor license agreements.  See the NOTICE file distributed with
-  this work for additional information regarding copyright ownership.
-  The ASF licenses this file to You under the Apache License, Version 2.0
-  (the "License"); you may not use this file except in compliance with
-  the License.  You may obtain a copy of the License at
-  
-  http://www.apache.org/licenses/LICENSE-2.0
-  
-  Unless required by applicable law or agreed to in writing, software
-  distributed under the License is distributed on an "AS IS" BASIS,
-  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-  See the License for the specific language governing permissions and
-  limitations under the License.
---%>
-<%@ page
-  session="false"
-  import="java.io.*"
-  import="java.util.*"
-%><%
-  String language =
-    ResourceBundle.getBundle("org.nutch.jsp.search", request.getLocale())
-    .getLocale().getLanguage();
-  String requestURI = HttpUtils.getRequestURL(request).toString();
-  String base = requestURI.substring(0, requestURI.lastIndexOf('/'));
-  response.sendRedirect(language + "/");
-%>
Index: src/web/jsp/refine-query.jsp
===================================================================
--- src/web/jsp/refine-query.jsp	(revision 959954)
+++ src/web/jsp/refine-query.jsp	(working copy)
@@ -1,53 +0,0 @@
-<%--
-  Licensed to the Apache Software Foundation (ASF) under one or more
-  contributor license agreements.  See the NOTICE file distributed with
-  this work for additional information regarding copyright ownership.
-  The ASF licenses this file to You under the Apache License, Version 2.0
-  (the "License"); you may not use this file except in compliance with
-  the License.  You may obtain a copy of the License at
-  
-  http://www.apache.org/licenses/LICENSE-2.0
-  
-  Unless required by applicable law or agreed to in writing, software
-  distributed under the License is distributed on an "AS IS" BASIS,
-  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-  See the License for the specific language governing permissions and
-  limitations under the License.
---%>
-<%
-
-// 20041129, Mike Pan and John Xing
-// Displays query-refinement hypertext based on ontology.
-// Try to be simple here. No mixing with other features such as clustering.
-// Please check refine-query-init.jsp, which does necessary initialization.
-
-List refineList = new ArrayList();
-
-if (ontology != null) {
-  Iterator iter = ontology.subclasses(queryString);
-  while (iter.hasNext()) {
-    refineList.add((String)iter.next());
-  }
-}
-
-bean.LOG.info("Outputting refine query list");
-
-if (refineList.size() > 0) {
-%>
-<div>
-Refine your search:
-<%
-  for (int i=0; i<refineList.size(); i++) {
-    String searchTerm = (String) refineList.get(i);
-    String searchTermHTML = org.apache.nutch.html.Entities.encode(searchTerm);
-    String searchQuery = "query="+searchTermHTML;
-    String searchURL = "/search.jsp?"+ searchQuery;
-%>
-<a href="<%=searchURL%>"><%=searchTerm%></a> |
-<%
-  }
-%>
-</div><br>
-<%
-}
-%>
Index: src/web/jsp/refine-query-init.jsp
===================================================================
--- src/web/jsp/refine-query-init.jsp	(revision 959954)
+++ src/web/jsp/refine-query-init.jsp	(working copy)
@@ -1,44 +0,0 @@
-<%--
-  Licensed to the Apache Software Foundation (ASF) under one or more
-  contributor license agreements.  See the NOTICE file distributed with
-  this work for additional information regarding copyright ownership.
-  The ASF licenses this file to You under the Apache License, Version 2.0
-  (the "License"); you may not use this file except in compliance with
-  the License.  You may obtain a copy of the License at
-  
-  http://www.apache.org/licenses/LICENSE-2.0
-  
-  Unless required by applicable law or agreed to in writing, software
-  distributed under the License is distributed on an "AS IS" BASIS,
-  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-  See the License for the specific language governing permissions and
-  limitations under the License.
---%>
-<%
-
-// 20041129, Mike Pan and John Xing
-// Initiates Ontology ontology and loads in all owl files.
-// Any problem (e.g., missing owl file or exception) will have this plugin
-// siliently ignored.
-// Please check ./refine-query.jsp, which provides query-refinement hypertext.
-
-org.apache.nutch.ontology.Ontology ontology = null;
-
-// note: should we ignore plugin exceptions, or rethrow it below?
-// Rethrowing it effectively prevents the servlet class from
-// being loaded into the JVM. Need improvement in future.
-
-  try {
-    Configuration nutchConf = NutchConfiguration.get(application);
-    String urls = nutchConf.get("extension.ontology.urls");
-    ontology = new org.apache.nutch.ontology.OntologyFactory(nutchConf).getOntology();
-    if (urls==null || urls.trim().equals("")) {
-      // ignored siliently
-    } else {
-      ontology.load(urls.split("\\s+"));
-    }
-  } catch (Exception e) {
-    // ignored siliently 
-  }
-
-%>
Index: src/web/jsp/anchors.jsp
===================================================================
--- src/web/jsp/anchors.jsp	(revision 959954)
+++ src/web/jsp/anchors.jsp	(working copy)
@@ -1,86 +0,0 @@
-<%--
-  Licensed to the Apache Software Foundation (ASF) under one or more
-  contributor license agreements.  See the NOTICE file distributed with
-  this work for additional information regarding copyright ownership.
-  The ASF licenses this file to You under the Apache License, Version 2.0
-  (the "License"); you may not use this file except in compliance with
-  the License.  You may obtain a copy of the License at
-  
-  http://www.apache.org/licenses/LICENSE-2.0
-  
-  Unless required by applicable law or agreed to in writing, software
-  distributed under the License is distributed on an "AS IS" BASIS,
-  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-  See the License for the specific language governing permissions and
-  limitations under the License.
---%>
-<%@ page 
-  session="false"
-  contentType="text/html; charset=UTF-8"
-  pageEncoding="UTF-8"
-
-  import="java.io.*"
-  import="java.util.*"
-
-  import="org.apache.nutch.html.Entities"
-  import="org.apache.nutch.searcher.*"
-  import="org.apache.hadoop.conf.Configuration"
-  import="org.apache.nutch.util.NutchConfiguration"
-%><%
-  Configuration nutchConf = NutchConfiguration.get(application);
-  NutchBean bean = NutchBean.get(application, nutchConf);
-  // set the character encoding to use when interpreting request values 
-  request.setCharacterEncoding("UTF-8");
-  bean.LOG.info("anchors request from " + request.getRemoteAddr());
-  Hit hit = new Hit(Integer.parseInt(request.getParameter("idx")),
-                    request.getParameter("id"));
-  HitDetails details = bean.getDetails(hit);
-  String language =
-    ResourceBundle.getBundle("org.nutch.jsp.anchors", request.getLocale())
-    .getLocale().getLanguage();
-  String requestURI = HttpUtils.getRequestURL(request).toString();
-  String base = requestURI.substring(0, requestURI.lastIndexOf('/'));
-%><!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
-<%
-  // To prevent the character encoding declared with 'contentType' page
-  // directive from being overriden by JSTL (apache i18n), we freeze it
-  // by flushing the output buffer. 
-  // see http://java.sun.com/developer/technicalArticles/Intl/MultilingualJSP/
-  out.flush();
-%>
-<%@ taglib uri="http://jakarta.apache.org/taglibs/i18n" prefix="i18n" %>
-<i18n:bundle baseName="org.nutch.jsp.anchors"/>
-<html lang="<%= language %>">
-<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
-<head>
-<title>Nutch: <i18n:message key="title"/></title>
-<jsp:include page="/include/style.html"/>
-<base href="<%= base + "/" + language + "/" %>">
-</head>
-
-<body>
-
-<jsp:include page="<%= language + \"/include/header.html\"%>"/>
-
-<h3>
-<i18n:message key="page">
-  <i18n:messageArg value="<%=details.getValue(\"url\")%>"/>
-</i18n:message>
-</h3>
-
-<h3><i18n:message key="anchors"/></h3>
-
-<ul>
-<%
-  String[] anchors = bean.getAnchors(details);
-  if (anchors != null) {
-    for (int i = 0; i < anchors.length; i++) {
-%><li><%=Entities.encode(anchors[i])%>
-<%   } %>
-<% } %>
-</ul>
-     
-<jsp:include page="/include/footer.html"/>
-
-</body>     
-</html>
Index: src/web/jsp/explain.jsp
===================================================================
--- src/web/jsp/explain.jsp	(revision 959954)
+++ src/web/jsp/explain.jsp	(working copy)
@@ -1,81 +0,0 @@
-<%--
-  Licensed to the Apache Software Foundation (ASF) under one or more
-  contributor license agreements.  See the NOTICE file distributed with
-  this work for additional information regarding copyright ownership.
-  The ASF licenses this file to You under the Apache License, Version 2.0
-  (the "License"); you may not use this file except in compliance with
-  the License.  You may obtain a copy of the License at
-  
-  http://www.apache.org/licenses/LICENSE-2.0
-  
-  Unless required by applicable law or agreed to in writing, software
-  distributed under the License is distributed on an "AS IS" BASIS,
-  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-  See the License for the specific language governing permissions and
-  limitations under the License.
---%>
-<%@ page 
-  session="false"
-  contentType="text/html; charset=UTF-8"
-  pageEncoding="UTF-8" 
-
-  import="java.io.*"
-  import="java.util.*"
-  import="org.apache.nutch.searcher.*"
-  import="org.apache.hadoop.conf.Configuration"
-  import="org.apache.nutch.util.NutchConfiguration"
-%><%
-  Configuration nutchConf = NutchConfiguration.get(application);
-  NutchBean bean = NutchBean.get(application, nutchConf);
-  // set the character encoding to use when interpreting request values 
-  request.setCharacterEncoding("UTF-8");
-  bean.LOG.info("explain request from " + request.getRemoteAddr());
-  Hit hit = new Hit(Integer.parseInt(request.getParameter("idx")),
-                    request.getParameter("id"));
-  HitDetails details = bean.getDetails(hit);
-  // get the lang from request
-  String queryLang = request.getParameter("lang");
-  if (queryLang == null) { queryLang = ""; }
-  Query query = Query.parse(request.getParameter("query"), queryLang, nutchConf);
-  String language =
-    ResourceBundle.getBundle("org.nutch.jsp.explain", request.getLocale())
-    .getLocale().getLanguage();
-  String requestURI = HttpUtils.getRequestURL(request).toString();
-  String base = requestURI.substring(0, requestURI.lastIndexOf('/'));
-%><!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
-<%
-  // To prevent the character encoding declared with 'contentType' page
-  // directive from being overriden by JSTL (apache i18n), we freeze it
-  // by flushing the output buffer. 
-  // see http://java.sun.com/developer/technicalArticles/Intl/MultilingualJSP/
-  out.flush();
-%>
-<%@ taglib uri="http://jakarta.apache.org/taglibs/i18n" prefix="i18n" %>
-<i18n:bundle baseName="org.nutch.jsp.explain"/>
-<html lang="<%= language %>">
-<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
-<head>
-<title>Nutch: <i18n:message key="title"/></title>
-<jsp:include page="/include/style.html"/>
-<base href="<%= base  + "/" + language %>/">
-</head>
-
-<body>
-
-<jsp:include page="<%= language + \"/include/header.html\"%>"/>
-
-<h3><i18n:message key="page"/></h3>
-
-<%=bean.getDetails(hit).toHtml()%>
-
-<h3><i18n:message key="scoreForQuery">
-  <i18n:messageArg value="<%=query%>"/>
-</i18n:message>
-</h3>
-
-<%=bean.getExplanation(query, hit)%>
-
-<jsp:include page="/include/footer.html"/>
-
-</body>     
-</html>
Index: src/web/jsp/cluster.jsp
===================================================================
--- src/web/jsp/cluster.jsp	(revision 959954)
+++ src/web/jsp/cluster.jsp	(working copy)
@@ -1,105 +0,0 @@
-<%--
-  Licensed to the Apache Software Foundation (ASF) under one or more
-  contributor license agreements.  See the NOTICE file distributed with
-  this work for additional information regarding copyright ownership.
-  The ASF licenses this file to You under the Apache License, Version 2.0
-  (the "License"); you may not use this file except in compliance with
-  the License.  You may obtain a copy of the License at
-  
-  http://www.apache.org/licenses/LICENSE-2.0
-  
-  Unless required by applicable law or agreed to in writing, software
-  distributed under the License is distributed on an "AS IS" BASIS,
-  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-  See the License for the specific language governing permissions and
-  limitations under the License.
---%>
-<%
-
-// @author Dawid Weiss
-//
-// PERFORMANCE/USER INTERFACE NOTE:
-//
-// What I do here is merely a demonstration. In real life the clustering
-// process should be done in a separate "processing" stream, most likely
-// a separate HTML frame that the user's browser requests data to.
-// We don't want the user to wait with plain snippets until the clusters
-// are created.
-//
-// Also: clustering is resource consuming, so a cache of recent queries 
-// would be in place. Besides, such cache would also be beneficial for the
-// purpose of re-querying existing clusters (remember that the
-// clustering extension may be a heuristic returning a DIFFERENT set of
-// clusters for an identical input).
-// See www.vivisimo.com for details of how this can be done using frames, or
-// http://carrot.cs.put.poznan.pl for an example of a Javascript solution.
-
-// cluster the hits
-HitsCluster [] clusters = null;
-if (clusterer != null) {
-  final long clusteringStart = System.currentTimeMillis();
-  try {
-    clusters = clusterer.clusterHits( details, Summary.toStrings(summaries) );
-    final long clusteringDuration = System.currentTimeMillis() - clusteringStart;
-    bean.LOG.info("Clustering took: " + clusteringDuration + " milliseconds.");
-  } catch (Exception e) {
-    // failed to do clustering (see below)
-  }
-}
-
-if (clusterer == null) {
-  %>No clustering extension found.<%
-} else {
-  if (clusters == null) {
-    %>Unable to do clustering.<%
-  } else if (clusters.length == 0) {
-    %>No clusters found.<%
-  } else {
-    // display top N clusters and top Q documents inside them.
-    int N = 10;
-    int Q = 3;
-    int maxLabels = 2;
-    
-    int displayCounter = 0;
-    N = Math.min(N, clusters.length );
-
-    for (int clusterIndex = 0 ; clusterIndex < N ; clusterIndex++) {
-      HitsCluster cluster = clusters[ clusterIndex ];
-      String [] clusterLabels = cluster.getDescriptionLabels();
-      
-      // probably leave it on for now
-      //if (cluster.isJunkCluster()) continue;
-
-      // output cluster label.
-      %><div style="margin: 0px; padding: 0px; font-weight: bold;"><%
-      for (int k=0;k<maxLabels && k<clusterLabels.length;k++) {
-        if (k>0) out.print(", ");
-        out.print( Entities.encode(clusterLabels[k]) );
-      }
-      %></div><%
-       
-      // now output sample documents from the inside
-      HitDetails[] documents = cluster.getHits();
-      if (documents.length > 0) {
-        %><ul style="font-size: 90%; margin-top: .5em;"><%
-        for (int k = 0; k < Q && k < documents.length; k++) {
-          HitDetails detail = documents[ k ];
-          String title = detail.getValue("title");
-          String url = detail.getValue("url");
-          if (title == null || title.equals("")) title = url;
-          if (title.length() > 35) title = title.substring(0,35) + "...";
-          %>
-            <li><a href="<%=url%>"><%= Entities.encode(title) %></a></li>
-          <%
-        }
-        %></ul><%
-      }
-       
-      // ignore subclusters for now, ALTHOUGH HIERARCHICAL CLUSTERING
-      // METHODS DO EXIST AND ARE VERY USEFUL
-      // HitsCluster [] subclusters = cluster.getSubclusters();
-    }
-  }
-}
-
-%>
Index: src/web/jsp/more.jsp
===================================================================
--- src/web/jsp/more.jsp	(revision 959954)
+++ src/web/jsp/more.jsp	(working copy)
@@ -1,68 +0,0 @@
-<%--
-  Licensed to the Apache Software Foundation (ASF) under one or more
-  contributor license agreements.  See the NOTICE file distributed with
-  this work for additional information regarding copyright ownership.
-  The ASF licenses this file to You under the Apache License, Version 2.0
-  (the "License"); you may not use this file except in compliance with
-  the License.  You may obtain a copy of the License at
-  
-  http://www.apache.org/licenses/LICENSE-2.0
-  
-  Unless required by applicable law or agreed to in writing, software
-  distributed under the License is distributed on an "AS IS" BASIS,
-  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-  See the License for the specific language governing permissions and
-  limitations under the License.
---%>
-<%
-    // @author John Xing
-    // show meta info (currently type, size, date of last-modified)
-    // for each hit. These info are indexed by ./src/plugin/index-more.
-
-    // do not show unless we have something
-    boolean showMore = false;
-
-    // Content-Type
-    String primaryType = detail.getValue("primaryType");
-    String subType = detail.getValue("subType");
-
-    String contentType = subType;
-    if (contentType == null)
-      contentType = primaryType;
-    if (contentType != null) {
-      contentType = "[<span class=\"contentType\">" + contentType + "</span>]";
-      showMore = true;
-    } else {
-      contentType = "";
-    }
-
-    // Content-Length
-    String contentLength = detail.getValue("contentLength");
-    if (contentLength != null) {
-      contentLength = "(" + contentLength + " bytes)";
-      showMore = true;
-    } else {
-      contentLength = "";
-    }
-
-    // Last-Modified
-    String lastModified = detail.getValue("lastModified");
-    if (lastModified != null) {
-      Calendar cal = new GregorianCalendar();
-      cal.setTimeInMillis(new Long(lastModified).longValue());
-      lastModified = cal.get(Calendar.YEAR)
-                  + "." + (1+cal.get(Calendar.MONTH)) // it is 0-based
-                  + "." + cal.get(Calendar.DAY_OF_MONTH);
-      showMore = true;
-    } else {
-      lastModified = "";
-    }
-%>
-
-<% if (showMore) {
-    if ("text".equalsIgnoreCase(primaryType)) { %>
-    <br><font size=-1><nobr><%=contentType%> <%=contentLength%> <%=lastModified%></nobr></font>
-<%  } else { %>
-    <br><font size=-1><nobr><%=contentType%> <%=contentLength%> <%=lastModified%> - <a href="../text.jsp?<%=id%>"><i18n:message key="viewAsText"/></a></nobr></font>
-<%  }
-  } %>
Index: src/web/web.xml
===================================================================
--- src/web/web.xml	(revision 959954)
+++ src/web/web.xml	(working copy)
@@ -1,70 +0,0 @@
-<?xml version="1.0" encoding="ISO-8859-1"?>
-<!DOCTYPE web-app
-    PUBLIC "-//Sun Microsystems, Inc.//DTD Web Application 2.3//EN"
-    "http://java.sun.com/dtd/web-app_2_3.dtd">
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<web-app>
-
-<!-- order is very important here -->
-
-<listener>
-  <listener-class>org.apache.nutch.searcher.NutchBean$NutchBeanConstructor</listener-class>
-</listener>
-
-<servlet>
-  <servlet-name>Cached</servlet-name>
-  <servlet-class>org.apache.nutch.servlet.Cached</servlet-class>
-</servlet>
-
-<servlet>
-  <servlet-name>OpenSearch</servlet-name>
-  <servlet-class>org.apache.nutch.searcher.OpenSearchServlet</servlet-class>
-</servlet>
-
-<servlet>
-  <servlet-name>SearchServlet</servlet-name>
-  <servlet-class>org.apache.nutch.searcher.response.SearchServlet</servlet-class>
-</servlet>
-
-<servlet-mapping>
-  <servlet-name>Cached</servlet-name>
-  <url-pattern>/servlet/cached</url-pattern>
-</servlet-mapping>
-
-<servlet-mapping>
-  <servlet-name>OpenSearch</servlet-name>
-  <url-pattern>/opensearch</url-pattern>
-</servlet-mapping>
-
-<servlet-mapping>
-  <servlet-name>SearchServlet</servlet-name>
-  <url-pattern>/search</url-pattern>
-</servlet-mapping>
-
-<welcome-file-list>
-  <welcome-file>search.html</welcome-file>
-  <welcome-file>index.html</welcome-file>
-  <welcome-file>index.jsp</welcome-file>
-</welcome-file-list>
-
-<taglib>
-  <taglib-uri>http://jakarta.apache.org/taglibs/i18n</taglib-uri>
-  <taglib-location>/WEB-INF/taglibs-i18n.tld</taglib-location>
- </taglib>
-
-</web-app>
Index: src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java
===================================================================
--- src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java	(revision 959954)
+++ src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java	(working copy)
@@ -25,7 +25,6 @@
 import org.apache.nutch.indexer.IndexingException;
 import org.apache.nutch.indexer.IndexingFilter;
 import org.apache.nutch.indexer.NutchDocument;
-import org.apache.nutch.indexer.lucene.LuceneWriter;
 import org.apache.nutch.parse.Parse;
 
 /**
@@ -57,9 +56,4 @@
     return doc;
   }
 
-  public void addIndexBackendOptions(Configuration conf) {
-    LuceneWriter.addFieldOptions("anchor", LuceneWriter.STORE.NO,
-        LuceneWriter.INDEX.TOKENIZED, conf);
-  }
-
 }
Index: src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java
===================================================================
--- src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java	(revision 959954)
+++ src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java	(working copy)
@@ -23,7 +23,6 @@
 import org.apache.nutch.indexer.IndexingFilter;
 import org.apache.nutch.indexer.IndexingException;
 import org.apache.nutch.indexer.NutchDocument;
-import org.apache.nutch.indexer.lucene.LuceneWriter;
 import org.apache.hadoop.io.Text;
 import org.apache.nutch.parse.Parse;
 
@@ -60,11 +59,6 @@
     return doc;
   }
 
-  public void addIndexBackendOptions(Configuration conf) {
-    LuceneWriter.addFieldOptions("tag", LuceneWriter.STORE.YES,
-        LuceneWriter.INDEX.UNTOKENIZED, conf);
-  }
-  
   /* ----------------------------- *
    * <implementation:Configurable> *
    * ----------------------------- */
Index: src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagQueryFilter.java
===================================================================
--- src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagQueryFilter.java	(revision 959954)
+++ src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagQueryFilter.java	(working copy)
@@ -1,59 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.microformats.reltag;
-
-// Hadoop imports
-import org.apache.hadoop.conf.Configuration;
-
-// Nutch imports
-import org.apache.nutch.searcher.RawFieldQueryFilter;
-
-
-/**
- * Handles <code>"tag:"<code> query clauses.
- * 
- * @see <a href="http://www.microformats.org/wiki/rel-tag">
- *      http://www.microformats.org/wiki/rel-tag</a>
- * @author J&eacute;r&ocirc;me Charron
- */
-public class RelTagQueryFilter extends RawFieldQueryFilter {
-  
-  private Configuration conf;
-
-  public RelTagQueryFilter() {
-    super("tag", true, 1.0f);
-  }
-  
-  
-  /* ----------------------------- *
-   * <implementation:Configurable> *
-   * ----------------------------- */
-
-  public void setConf(Configuration conf) {
-    this.conf = conf;
-    setBoost(conf.getFloat("query.tag.boost", 1.0f));
-  }
-
-  public Configuration getConf() {
-    return this.conf;
-  }
-
-  /* ------------------------------ *
-   * </implementation:Configurable> *
-   * ------------------------------ */
-
-}
Index: src/plugin/microformats-reltag/plugin.xml
===================================================================
--- src/plugin/microformats-reltag/plugin.xml	(revision 959954)
+++ src/plugin/microformats-reltag/plugin.xml	(working copy)
@@ -45,17 +45,5 @@
                       class="org.apache.nutch.microformats.reltag.RelTagIndexingFilter"/>
    </extension>
 
-
-   <extension id="org.apache.nutch.microformats.reltag.RelTagQueryFilter"
-              name="Rel-Tag query filter"
-              point="org.apache.nutch.searcher.QueryFilter">
-      <implementation id="RelTagQueryFilter"
-                      class="org.apache.nutch.microformats.reltag.RelTagQueryFilter">
-        <parameter name="raw-fields" value="tag"/>
-      </implementation>
-      
-   </extension>
-
-
 </plugin>
 
Index: src/plugin/tld/src/java/org/apache/nutch/scoring/tld/TLDScoringFilter.java
===================================================================
--- src/plugin/tld/src/java/org/apache/nutch/scoring/tld/TLDScoringFilter.java	(revision 959954)
+++ src/plugin/tld/src/java/org/apache/nutch/scoring/tld/TLDScoringFilter.java	(working copy)
@@ -26,6 +26,7 @@
 import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.crawl.Inlinks;
 import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.indexer.NutchField;
 import org.apache.nutch.parse.Parse;
 import org.apache.nutch.parse.ParseData;
 import org.apache.nutch.protocol.Content;
@@ -52,12 +53,12 @@
       CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore)
       throws ScoringFilterException {
 
-    List<String> tlds = doc.getFieldValues("tld");
+    NutchField tlds = doc.getField("tld");
     float boost = 1.0f;
 
     if(tlds != null) {
-      for(String tld : tlds) {
-        DomainSuffix entry = tldEntries.get(tld);
+      for(Object tld : tlds.getValues()) {
+        DomainSuffix entry = tldEntries.get(tld.toString());
         if(entry != null)
           boost *= entry.getBoost();
       }
Index: src/plugin/tld/src/java/org/apache/nutch/indexer/tld/TLDIndexingFilter.java
===================================================================
--- src/plugin/tld/src/java/org/apache/nutch/indexer/tld/TLDIndexingFilter.java	(revision 959954)
+++ src/plugin/tld/src/java/org/apache/nutch/indexer/tld/TLDIndexingFilter.java	(working copy)
@@ -28,7 +28,6 @@
 import org.apache.nutch.indexer.IndexingException;
 import org.apache.nutch.indexer.IndexingFilter;
 import org.apache.nutch.indexer.NutchDocument;
-import org.apache.nutch.indexer.lucene.LuceneWriter;
 import org.apache.nutch.parse.Parse;
 import org.apache.nutch.util.URLUtil;
 import org.apache.nutch.util.domain.DomainSuffix;
@@ -65,10 +64,4 @@
   public Configuration getConf() {
     return this.conf;
   }
-
-  public void addIndexBackendOptions(Configuration conf) {
-    // store, no index
-    LuceneWriter.addFieldOptions("tld", LuceneWriter.STORE.YES,
-                                 LuceneWriter.INDEX.NO, conf);
-  }
 }
Index: src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java
===================================================================
--- src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java	(revision 959954)
+++ src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java	(working copy)
@@ -28,7 +28,6 @@
 import org.apache.nutch.indexer.IndexingFilter;
 import org.apache.nutch.indexer.IndexingException;
 import org.apache.nutch.indexer.NutchDocument;
-import org.apache.nutch.indexer.lucene.LuceneWriter;
 import org.apache.hadoop.io.Text;
 
 import org.apache.nutch.crawl.CrawlDatum;
@@ -94,40 +93,6 @@
     return doc;
   }
 
-  public void addIndexBackendOptions(Configuration conf) {
-
-    ///////////////////////////
-    //    add lucene options   //
-    ///////////////////////////
-
-    // host is un-stored, indexed and tokenized
-    LuceneWriter.addFieldOptions("host", LuceneWriter.STORE.NO,
-        LuceneWriter.INDEX.TOKENIZED, conf);
-
-    // site is un-stored, indexed and un-tokenized
-    LuceneWriter.addFieldOptions("site", LuceneWriter.STORE.NO,
-        LuceneWriter.INDEX.UNTOKENIZED, conf);
-
-    // url is both stored and indexed, so it's both searchable and returned
-    LuceneWriter.addFieldOptions("url", LuceneWriter.STORE.YES,
-        LuceneWriter.INDEX.TOKENIZED, conf);
-
-    // content is indexed, so that it's searchable, but not stored in index
-    LuceneWriter.addFieldOptions("content", LuceneWriter.STORE.NO,
-        LuceneWriter.INDEX.TOKENIZED, conf);
-
-    // anchors are indexed, so they're searchable, but not stored in index
-    LuceneWriter.addFieldOptions("anchor", LuceneWriter.STORE.NO,
-        LuceneWriter.INDEX.TOKENIZED, conf);
-
-    // title is indexed and stored so that it can be displayed
-    LuceneWriter.addFieldOptions("title", LuceneWriter.STORE.YES,
-        LuceneWriter.INDEX.TOKENIZED, conf);
-
-    LuceneWriter.addFieldOptions("cache", LuceneWriter.STORE.YES, LuceneWriter.INDEX.NO, conf);
-    LuceneWriter.addFieldOptions("tstamp", LuceneWriter.STORE.YES, LuceneWriter.INDEX.NO, conf);
-  }
-
   public void setConf(Configuration conf) {
     this.conf = conf;
     this.MAX_TITLE_LENGTH = conf.getInt("indexer.max.title.length", 100);
Index: src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageQueryFilter.java
===================================================================
--- src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageQueryFilter.java	(revision 959954)
+++ src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageQueryFilter.java	(working copy)
@@ -1,40 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.analysis.lang;
-
-import org.apache.nutch.searcher.RawFieldQueryFilter;
-import org.apache.hadoop.conf.Configuration;
-
-/** Handles "lang:" query clauses, causing them to search the "lang" field
- * indexed by LanguageIdentifier. */
-public class LanguageQueryFilter extends RawFieldQueryFilter {
-  private Configuration conf;
-
-  public LanguageQueryFilter() {
-    super("lang");
-  }
-  
-  public void setConf(Configuration conf) {
-    this.conf = conf;
-    setBoost(conf.getFloat("query.lang.boost", 0.0f));
-  }
-
-  public Configuration getConf() {
-    return this.conf;
-  }
-}
Index: src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java
===================================================================
--- src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java	(revision 959954)
+++ src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java	(working copy)
@@ -23,7 +23,6 @@
 import org.apache.nutch.indexer.IndexingFilter;
 import org.apache.nutch.indexer.IndexingException;
 import org.apache.nutch.indexer.NutchDocument;
-import org.apache.nutch.indexer.lucene.LuceneWriter;
 import org.apache.hadoop.io.Text;
 import org.apache.nutch.parse.Parse;
 import org.apache.nutch.metadata.Metadata;
@@ -95,11 +94,6 @@
     return doc;
   }
 
-  public void addIndexBackendOptions(Configuration conf) {
-    LuceneWriter.addFieldOptions("lang", LuceneWriter.STORE.YES,
-        LuceneWriter.INDEX.UNTOKENIZED, conf);
-  }
-  
   public void setConf(Configuration conf) {
     this.conf = conf;
     this.languageIdentifier = new LanguageIdentifier(conf);
Index: src/plugin/languageidentifier/plugin.xml
===================================================================
--- src/plugin/languageidentifier/plugin.xml	(revision 959954)
+++ src/plugin/languageidentifier/plugin.xml	(working copy)
@@ -45,16 +45,5 @@
                       class="org.apache.nutch.analysis.lang.LanguageIndexingFilter"/>
    </extension>
 
-
-   <extension id="org.apache.nutch.analysis.lang.LanguageQueryFilter"
-              name="Nutch Language Query Filter"
-              point="org.apache.nutch.searcher.QueryFilter">
-      <implementation id="LanguageQueryFilter"
-                      class="org.apache.nutch.analysis.lang.LanguageQueryFilter">
-        <parameter name="raw-fields" value="lang"/>
-      </implementation>
-   </extension>
-
-
 </plugin>
 
Index: src/plugin/response-xml/src/java/org/apache/nutch/searcher/response/xml/XMLResponseWriter.java
===================================================================
--- src/plugin/response-xml/src/java/org/apache/nutch/searcher/response/xml/XMLResponseWriter.java	(revision 959954)
+++ src/plugin/response-xml/src/java/org/apache/nutch/searcher/response/xml/XMLResponseWriter.java	(working copy)
@@ -1,283 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.searcher.response.xml;
-
-import java.io.ByteArrayOutputStream;
-import java.io.IOException;
-import java.text.SimpleDateFormat;
-import java.util.HashSet;
-import java.util.Set;
-
-import javax.servlet.http.HttpServletRequest;
-import javax.servlet.http.HttpServletResponse;
-import javax.xml.parsers.DocumentBuilderFactory;
-import javax.xml.transform.OutputKeys;
-import javax.xml.transform.Transformer;
-import javax.xml.transform.TransformerFactory;
-import javax.xml.transform.dom.DOMSource;
-import javax.xml.transform.stream.StreamResult;
-
-import org.apache.commons.lang.StringUtils;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.html.Entities;
-import org.apache.nutch.searcher.Hit;
-import org.apache.nutch.searcher.HitDetails;
-import org.apache.nutch.searcher.Summary;
-import org.apache.nutch.searcher.response.ResponseWriter;
-import org.apache.nutch.searcher.response.SearchResults;
-import org.w3c.dom.Attr;
-import org.w3c.dom.Document;
-import org.w3c.dom.Element;
-import org.w3c.dom.Node;
-
-/**
- * A ResponseWriter implementation that returns search results in XML format.
- */
-public class XMLResponseWriter
-  implements ResponseWriter {
-
-  private String contentType = null;
-  private Configuration conf;
-  private int maxAgeInSeconds;
-  private boolean prettyPrint;
-
-  /**
-   * Creates and returns a new node within the XML document.
-   * 
-   * @param doc The XML document.
-   * @param parent The parent Node.
-   * @param name The name of the new node.
-   * 
-   * @return The newly created node Element.
-   */
-  private static Element addNode(Document doc, Node parent, String name) {
-    Element child = doc.createElement(name);
-    parent.appendChild(child);
-    return child;
-  }
-
-  /**
-   * Creates and returns a new node within the XML document.  The node contains
-   * the text supplied as a child node.
-   * 
-   * @param doc The XML document.
-   * @param parent The parent Node.
-   * @param name The name of the new node.
-   * @param text A text string to append as a child node.
-   * 
-   * @return The newly created node Element.
-   */
-  private static void addNode(Document doc, Node parent, String name,
-    String text) {
-    Element child = doc.createElement(name);
-    child.appendChild(doc.createTextNode(getLegalXml(text)));
-    parent.appendChild(child);
-  }
-
-  /**
-   * Adds an attribute name and value to a node Element in the XML document.
-   * 
-   * @param doc The XML document.
-   * @param node The node Element on which to attach the attribute.
-   * @param name The name of the attribute.
-   * @param value The value of the attribute.
-   */
-  private static void addAttribute(Document doc, Element node, String name,
-    String value) {
-    Attr attribute = doc.createAttribute(name);
-    attribute.setValue(getLegalXml(value));
-    node.getAttributes().setNamedItem(attribute);
-  }
-
-  /**
-   * Transforms and returns the text string as legal XML text.
-   * 
-   * @param text The text to transform.
-   * 
-   * @return The text string in the form of legal XML text.
-   */
-  protected static String getLegalXml(String text) {
-    
-    if (text == null) {
-      return null;
-    }
-    StringBuffer buffer = null;
-    for (int i = 0; i < text.length(); i++) {
-      char c = text.charAt(i);
-      if (!isLegalXml(c)) {
-        if (buffer == null) {
-          buffer = new StringBuffer(text.length());
-          buffer.append(text.substring(0, i));
-        }
-      }
-      else {
-        if (buffer != null) {
-          buffer.append(c);
-        }
-      }
-    }
-    return (buffer != null) ? buffer.toString() : text;
-  }
-
-  /**
-   * Determines if the character is a legal XML character.
-   * 
-   * @param c The character to check.
-   * 
-   * @return True if the character is legal xml, false otherwise.
-   */
-  private static boolean isLegalXml(final char c) {
-    return c == 0x9 || c == 0xa || c == 0xd || (c >= 0x20 && c <= 0xd7ff)
-      || (c >= 0xe000 && c <= 0xfffd) || (c >= 0x10000 && c <= 0x10ffff);
-  }
-
-  public void setContentType(String contentType) {
-    this.contentType = contentType;
-  }
-
-  public Configuration getConf() {
-    return conf;
-  }
-
-  public void setConf(Configuration conf) {
-    this.conf = conf;
-    this.maxAgeInSeconds = conf.getInt("searcher.response.maxage", 86400);
-    this.prettyPrint = conf.getBoolean("searcher.response.prettyprint", true);
-  }
-
-  public void writeResponse(SearchResults results, HttpServletRequest request,
-    HttpServletResponse response)
-    throws IOException {
-
-    try {
-      
-      // create the xml document and add the results and search nodes
-      DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
-      Document xmldoc = factory.newDocumentBuilder().newDocument();
-      Element resEl = addNode(xmldoc, xmldoc, "results");
-      Element searchEl = addNode(xmldoc, resEl, "search");
-      
-      // add common nodes
-      String query = results.getQuery();
-      addNode(xmldoc, searchEl, "query", query);
-      addNode(xmldoc, searchEl, "totalhits",
-        String.valueOf(results.getTotalHits()));
-      String lang = results.getLang();
-      if (lang != null) {
-        addNode(xmldoc, searchEl, "lang", lang);
-      }
-      String sort = results.getSort();
-      if (sort != null) {
-        addNode(xmldoc, searchEl, "sort", sort);
-      }
-      addNode(xmldoc, searchEl, "reverse", results.isReverse() ? "true"
-        : "false");
-      addNode(xmldoc, searchEl, "start", String.valueOf(results.getStart()));
-      addNode(xmldoc, searchEl, "end", String.valueOf(results.getEnd()));
-      addNode(xmldoc, searchEl, "rows", String.valueOf(results.getRows()));
-      addNode(xmldoc, searchEl, "totalhits",
-        String.valueOf(results.getTotalHits()));
-      addNode(xmldoc, searchEl, "withSummary",
-        String.valueOf(results.isWithSummary()));
-
-      String[] searchFields = results.getFields();
-      Set<String> fieldSet = new HashSet<String>();
-      if (searchFields != null && searchFields.length > 0) {
-        addNode(xmldoc, searchEl, "fields", StringUtils.join(searchFields, ","));
-        for (int i = 0; i < searchFields.length; i++) {
-          fieldSet.add(searchFields[i]);
-        }
-      }
-
-      // add documents
-      Element documents = addNode(xmldoc, resEl, "documents");
-      HitDetails[] details = results.getDetails();
-      Hit[] hits = results.getHits();
-      Summary[] summaries = results.getSummaries();
-      for (int i = 0; i < details.length; i++) {
-
-        // every document has an indexno and an indexdocno
-        Element document = addNode(xmldoc, documents, "document");
-        addAttribute(xmldoc, document, "indexno",
-          String.valueOf(hits[i].getIndexNo()));
-        addAttribute(xmldoc, document, "indexkey",
-          String.valueOf(hits[i].getUniqueKey()));
-        
-        // don't add summaries not including summaries
-        if (summaries != null && results.isWithSummary()) {
-          String encSumm = Entities.encode(summaries[i].toString());
-          addNode(xmldoc, document, "summary", encSumm);
-        }
-
-        // add the fields from hit details
-        Element fields = addNode(xmldoc, document, "fields");
-        HitDetails detail = details[i];
-        for (int j = 0; j < detail.getLength(); j++) {
-          String fieldName = detail.getField(j);
-          String[] fieldValues = detail.getValues(fieldName);
-          
-          // if we specified fields to return, only return those fields
-          if (fieldSet.size() == 0 || fieldSet.contains(fieldName)) {
-            Element field = addNode(xmldoc, fields, "field");
-            addAttribute(xmldoc, field, "name", fieldName);
-            for (int k = 0; k < fieldValues.length; k++) {
-              String encFieldVal = Entities.encode(fieldValues[k]);
-              addNode(xmldoc, field, "value", encFieldVal);
-            }
-          }
-        }
-      }
-
-      // get the xml source and a transformer to print it out
-      DOMSource source = new DOMSource(xmldoc);
-      TransformerFactory transFactory = TransformerFactory.newInstance();
-      Transformer transformer = transFactory.newTransformer();
-      
-      // pretty printing can be set through configuration
-      if (prettyPrint) {
-        transformer.setOutputProperty("indent", "yes");
-        transformer.setOutputProperty(OutputKeys.INDENT, "yes");
-        transformer.setOutputProperty(
-          "{http://xml.apache.org/xslt}indent-amount", "2");
-      }
-      
-      // write out the content to a byte array
-      ByteArrayOutputStream baos = new ByteArrayOutputStream();
-      StreamResult result = new StreamResult(baos);
-      transformer.transform(source, result);
-      baos.flush();
-      baos.close();
-
-      // cache control headers
-      SimpleDateFormat sdf = new SimpleDateFormat(
-        "E, d MMM yyyy HH:mm:ss 'GMT'");
-      long relExpiresInMillis = System.currentTimeMillis()
-        + (1000 * maxAgeInSeconds);
-      response.setContentType(contentType);
-      response.setHeader("Cache-Control", "max-age=" + maxAgeInSeconds);
-      response.setHeader("Expires", sdf.format(relExpiresInMillis));
-      
-      // write out the content to the response
-      response.getOutputStream().write(baos.toByteArray());
-      response.flushBuffer();
-    }
-    catch (Exception e) {
-      throw new IOException(e);
-    }
-
-  }
-}
Index: src/plugin/response-xml/plugin.xml
===================================================================
--- src/plugin/response-xml/plugin.xml	(revision 959954)
+++ src/plugin/response-xml/plugin.xml	(working copy)
@@ -1,46 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<plugin
-   id="response-xml"
-   name="XML Response Writer Plug-in"
-   version="1.0.0"
-   provider-name="nutch.org">
-
-   <runtime>
-      <library name="response-xml.jar">
-         <export name="*"/>
-      </library>    
-   </runtime>
-
-   <requires>
-      <import plugin="nutch-extensionpoints"/>
-   </requires>
-
-   <extension id="org.apache.nutch.searcher.response"
-              name="ResponseWriter"
-              point="org.apache.nutch.searcher.response.ResponseWriter">
-
-      <implementation id="org.apache.nutch.searcher.response.xml.XMLResponseWriter"
-        class="org.apache.nutch.searcher.response.xml.XMLResponseWriter">
-        <parameter name="responseType" value="xml"/>
-        <parameter name="contentType" value="text/xml"/>
-      </implementation>
-
-   </extension>
-
-</plugin>
Index: src/plugin/response-xml/build.xml
===================================================================
--- src/plugin/response-xml/build.xml	(revision 959954)
+++ src/plugin/response-xml/build.xml	(working copy)
@@ -1,22 +0,0 @@
-<?xml version="1.0"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<project name="response-xml" default="jar-core">
-
-  <import file="../build-plugin.xml"/>
-
-</project>
Index: src/plugin/build.xml
===================================================================
--- src/plugin/build.xml	(revision 959954)
+++ src/plugin/build.xml	(working copy)
@@ -26,14 +26,11 @@
   <!-- Build & deploy all the plugin jars.                    -->
   <!-- ====================================================== -->
   <target name="deploy">
-     <ant dir="clustering-carrot2" target="deploy"/>
      <ant dir="creativecommons" target="deploy"/>
      <ant dir="feed" target="deploy"/>
      <ant dir="index-basic" target="deploy"/>
      <ant dir="index-anchor" target="deploy"/>
      <ant dir="index-more" target="deploy"/>
-     <ant dir="field-basic" target="deploy"/>
-     <ant dir="field-boost" target="deploy"/>
      <ant dir="languageidentifier" target="deploy"/>
      <ant dir="lib-http" target="deploy"/>
      <ant dir="lib-lucene-analyzers" target="deploy"/>
@@ -42,7 +39,6 @@
      <ant dir="lib-xml" target="deploy"/>
      <ant dir="microformats-reltag" target="deploy"/>
      <ant dir="nutch-extensionpoints" target="deploy"/>
-     <ant dir="ontology" target="deploy"/>
      <ant dir="protocol-file" target="deploy"/>
      <ant dir="protocol-ftp" target="deploy"/>
      <ant dir="protocol-http" target="deploy"/>
@@ -53,18 +49,9 @@
      <ant dir="parse-swf" target="deploy"/>
      <ant dir="parse-tika" target="deploy"/>
      <ant dir="parse-zip" target="deploy"/>
-     <ant dir="query-basic" target="deploy"/>
-     <ant dir="query-more" target="deploy"/>
-     <ant dir="query-site" target="deploy"/>
-     <ant dir="query-custom" target="deploy"/>
-     <ant dir="query-url" target="deploy"/>
-     <ant dir="response-json" target="deploy"/>
-     <ant dir="response-xml" target="deploy"/>
      <ant dir="scoring-opic" target="deploy"/>
      <ant dir="scoring-link" target="deploy"/>
-     <ant dir="summary-basic" target="deploy"/>
      <ant dir="subcollection" target="deploy"/>
-     <ant dir="summary-lucene" target="deploy"/>
      <ant dir="tld" target="deploy"/>
      <ant dir="urlfilter-automaton" target="deploy"/>
      <ant dir="urlfilter-domain" target="deploy" />
@@ -86,7 +73,6 @@
      <ant dir="index-more" target="test"/>
      <ant dir="languageidentifier" target="test"/>
      <ant dir="lib-http" target="test"/>
-     <ant dir="ontology" target="test"/>
      <ant dir="protocol-httpclient" target="test"/>
      <!--ant dir="parse-ext" target="test"/-->
      <ant dir="parse-rss" target="test"/>
@@ -94,7 +80,6 @@
      <ant dir="parse-swf" target="test"/>
      <ant dir="parse-tika" target="test"/>
      <ant dir="parse-zip" target="test"/>
-     <ant dir="query-url" target="test"/>
      <ant dir="subcollection" target="test"/>
      <ant dir="urlfilter-automaton" target="test"/>
      <ant dir="urlfilter-domain" target="test" />
@@ -110,16 +95,11 @@
   <!-- Clean all of the plugins.                              -->
   <!-- ====================================================== -->
   <target name="clean">
-    <ant dir="analysis-de" target="clean"/>
-    <ant dir="analysis-fr" target="clean"/>
-    <ant dir="clustering-carrot2" target="clean"/>
     <ant dir="creativecommons" target="clean"/>
     <ant dir="feed" target="clean"/>
     <ant dir="index-basic" target="clean"/>
     <ant dir="index-anchor" target="clean"/>
     <ant dir="index-more" target="clean"/>
-    <ant dir="field-basic" target="clean"/>
-    <ant dir="field-boost" target="clean"/>  	
     <ant dir="languageidentifier" target="clean"/>
     <ant dir="lib-commons-httpclient" target="clean"/>
     <ant dir="lib-http" target="clean"/>
@@ -129,7 +109,6 @@
     <ant dir="lib-xml" target="clean"/>
     <ant dir="microformats-reltag" target="clean"/>
     <ant dir="nutch-extensionpoints" target="clean"/>
-    <ant dir="ontology" target="clean"/>
     <ant dir="protocol-file" target="clean"/>
     <ant dir="protocol-ftp" target="clean"/>
     <ant dir="protocol-http" target="clean"/>
@@ -140,18 +119,9 @@
     <ant dir="parse-swf" target="clean"/>
     <ant dir="parse-tika" target="clean"/>
     <ant dir="parse-zip" target="clean"/>
-    <ant dir="query-basic" target="clean"/>
-    <ant dir="query-more" target="clean"/>
-    <ant dir="query-site" target="clean"/>
-    <ant dir="query-url" target="clean"/>
-    <ant dir="query-custom" target="clean"/>
-    <ant dir="response-json" target="clean"/>
-    <ant dir="response-xml" target="clean"/>
     <ant dir="scoring-opic" target="clean"/>
     <ant dir="scoring-link" target="clean"/>
     <ant dir="subcollection" target="clean"/>
-    <ant dir="summary-basic" target="clean"/>
-    <ant dir="summary-lucene" target="clean"/>
     <ant dir="tld" target="clean"/>
     <ant dir="urlfilter-automaton" target="clean"/>
     <ant dir="urlfilter-domain" target="clean" />
Index: src/plugin/query-more/src/java/org/apache/nutch/searcher/more/DateQueryFilter.java
===================================================================
--- src/plugin/query-more/src/java/org/apache/nutch/searcher/more/DateQueryFilter.java	(revision 959954)
+++ src/plugin/query-more/src/java/org/apache/nutch/searcher/more/DateQueryFilter.java	(working copy)
@@ -1,106 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.searcher.more;
-
-import org.apache.nutch.searcher.Query;
-import org.apache.nutch.searcher.Query.Clause;
-import org.apache.nutch.searcher.QueryFilter;
-import org.apache.nutch.searcher.QueryException;
-
-import org.apache.hadoop.conf.Configuration;
-
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-
-import org.apache.lucene.search.BooleanClause;
-import org.apache.lucene.search.BooleanQuery;
-import org.apache.lucene.search.TermRangeQuery;
-
-import java.util.regex.Pattern;
-import java.util.regex.Matcher;
-
-
-/**
- * Handles "date:" query clauses, causing them to search the field "date"
- * indexed by MoreIndexingFilter.java
- *
- * @author John Xing
- */
-public class DateQueryFilter implements QueryFilter {
-
-  public static final Log LOG = LogFactory.getLog(DateQueryFilter.class);
-
-  private static final String FIELD_NAME = "date";
-
-  // query syntax is defined as date:yyyymmdd-yyyymmdd
-  private static final Pattern pattern = Pattern.compile("^(\\d{8})-(\\d{8})$");
-
-  private Configuration conf;
-    
-  public BooleanQuery filter(Query input, BooleanQuery output)
-    throws QueryException {
-
-    // examine each clause in the Nutch query
-    Clause[] clauses = input.getClauses();
-    
-    for (int i = 0; i <clauses.length; i++) { 
-      Clause c = clauses[i];
-      
-      //skip if not date clauses
-      if (!c.getField().equals(FIELD_NAME)) 
-        continue;
-            
-      String x = c.getTerm().toString();
-       
-      Matcher matcher = pattern.matcher(x);
-      if (!matcher.matches()) {
-        throw new QueryException("Wrong query syntax "+FIELD_NAME+":"+x);
-      }
-
-      // do it as lucene RangeQuery
-      String xLower = matcher.group(1);
-      String xUpper = matcher.group(2);
-
-      // inclusive
-      TermRangeQuery rangeQuery = new TermRangeQuery(
-    		  c.getField(), xLower, xUpper, true, true);
-
-      rangeQuery.setBoost(0.0f);                  // trigger filterization
-          
-      output.add(rangeQuery,
-          (c.isProhibited()
-              ? BooleanClause.Occur.MUST_NOT
-              : (c.isRequired()
-                  ? BooleanClause.Occur.MUST
-                  : BooleanClause.Occur.SHOULD
-                 )
-           ));
-             
-    }
-
-    return output;
-  }
-
-  public void setConf(Configuration conf) {
-    this.conf = conf;
-  }
-
-  public Configuration getConf() {
-    return this.conf;
-  }
-}
Index: src/plugin/query-more/src/java/org/apache/nutch/searcher/more/TypeQueryFilter.java
===================================================================
--- src/plugin/query-more/src/java/org/apache/nutch/searcher/more/TypeQueryFilter.java	(revision 959954)
+++ src/plugin/query-more/src/java/org/apache/nutch/searcher/more/TypeQueryFilter.java	(working copy)
@@ -1,45 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.searcher.more;
-
-import org.apache.nutch.searcher.RawFieldQueryFilter;
-import org.apache.hadoop.conf.Configuration;
-
-/**
- * Handles "type:" query clauses, causing them to search the field
- * indexed by MoreIndexingFilter.
- *
- * @author John Xing
- */
-
-public class TypeQueryFilter extends RawFieldQueryFilter {
-  private Configuration conf;
-
-  public TypeQueryFilter() {
-    super("type");
-  }
-
-  public void setConf(Configuration conf) {
-    this.conf = conf;
-    setBoost(conf.getFloat("query.type.boost", 0.0f));
-  }
-
-  public Configuration getConf() {
-    return this.conf;
-  }
-}
Index: src/plugin/query-more/src/java/org/apache/nutch/searcher/more/package.html
===================================================================
--- src/plugin/query-more/src/java/org/apache/nutch/searcher/more/package.html	(revision 959954)
+++ src/plugin/query-more/src/java/org/apache/nutch/searcher/more/package.html	(working copy)
@@ -1,5 +0,0 @@
-<html>
-<body>
-<p>A more query plugin.</p>
-</body>
-</html>
Index: src/plugin/query-more/plugin.xml
===================================================================
--- src/plugin/query-more/plugin.xml	(revision 959954)
+++ src/plugin/query-more/plugin.xml	(working copy)
@@ -1,54 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<plugin
-   id="query-more"
-   name="More Query Filter"
-   version="1.0.0"
-   provider-name="nutch.org">
-
-   <runtime>
-      <library name="query-more.jar">
-         <export name="*"/>
-      </library>
-   </runtime>
-
-   <requires>
-      <import plugin="nutch-extensionpoints"/>
-   </requires>
-
-   <extension id="org.apache.nutch.searcher.more"
-              name="Nutch More Query Filter"
-              point="org.apache.nutch.searcher.QueryFilter">
-      <implementation id="TypeQueryFilter"
-                      class="org.apache.nutch.searcher.more.TypeQueryFilter">
-        <parameter name="raw-fields" value="type"/>
-      </implementation>
-      
-   </extension>
-
-   <extension id="org.apache.nutch.searcher.more"
-              name="Nutch More Query Filter"
-              point="org.apache.nutch.searcher.QueryFilter">
-      <implementation id="DateQueryFilter"
-                      class="org.apache.nutch.searcher.more.DateQueryFilter">
-        <parameter name="raw-fields" value="date"/>
-      </implementation>
-      
-   </extension>
-
-</plugin>
Index: src/plugin/query-more/build.xml
===================================================================
--- src/plugin/query-more/build.xml	(revision 959954)
+++ src/plugin/query-more/build.xml	(working copy)
@@ -1,22 +0,0 @@
-<?xml version="1.0"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<project name="query-more" default="jar-core">
-
-  <import file="../build-plugin.xml"/>
-
-</project>
Index: src/plugin/summary-lucene/lib/lucene-highlighter-3.0.1.jar
===================================================================
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Index: src/plugin/summary-lucene/src/java/org/apache/nutch/summary/lucene/LuceneSummarizer.java
===================================================================
--- src/plugin/summary-lucene/src/java/org/apache/nutch/summary/lucene/LuceneSummarizer.java	(revision 959954)
+++ src/plugin/summary-lucene/src/java/org/apache/nutch/summary/lucene/LuceneSummarizer.java	(working copy)
@@ -1,121 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.summary.lucene;
-
-// JDK imports
-import java.io.StringReader;
-
-// Hadoop imports
-import org.apache.hadoop.conf.Configuration;
-
-// Lucene imports
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.search.highlight.Formatter;
-import org.apache.lucene.search.highlight.Highlighter;
-import org.apache.lucene.search.highlight.QueryScorer;
-import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
-import org.apache.lucene.search.highlight.WeightedTerm;
-import org.apache.lucene.search.highlight.WeightedSpanTerm;
-
-// Nutch imports
-import org.apache.nutch.analysis.NutchDocumentAnalyzer;
-import org.apache.nutch.searcher.Query;
-import org.apache.nutch.searcher.Summarizer;
-import org.apache.nutch.searcher.Summary;
-import org.apache.nutch.searcher.Summary.Ellipsis;
-import org.apache.nutch.searcher.Summary.Fragment;
-import org.apache.nutch.searcher.Summary.Highlight;
-
-
-/** Implements hit summarization. */
-public class LuceneSummarizer implements Summarizer {
-  
-  private final static String SEPARATOR = "###";
-  private final static Formatter FORMATTER =
-          new SimpleHTMLFormatter(SEPARATOR, SEPARATOR);
-
-  /** Converts text to tokens. */
-  private Analyzer analyzer = null;
-  private Configuration conf = null;
-  
-  public LuceneSummarizer() { }
-  
-  private LuceneSummarizer(Configuration conf) {
-    setConf(conf);
-  }
-  
-  
-  /* ----------------------------- *
-   * <implementation:Configurable> *
-   * ----------------------------- */
-  
-  public Configuration getConf() {
-    return conf;
-  }
-  
-  public void setConf(Configuration conf) {
-    this.conf = conf;
-    this.analyzer = new NutchDocumentAnalyzer(conf);
-  }
-  
-  /* ------------------------------ *
-   * </implementation:Configurable> *
-   * ------------------------------ */
-  
-  
-  /* --------------------------- *
-   * <implementation:Summarizer> *
-   * --------------------------- */
-  
-  public Summary getSummary(String text, Query query) {
-
-    String[] terms = query.getTerms();
-    WeightedSpanTerm[] weighted = new WeightedSpanTerm[terms.length];
-    for (int i=0; i<terms.length; i++) {
-      weighted[i] = new WeightedSpanTerm(1.0f, terms[i]);
-    }
-    Highlighter highlighter = new Highlighter(FORMATTER, new QueryScorer(weighted));
-    TokenStream tokens = analyzer.tokenStream("content", new StringReader(text));
-    Summary summary = new Summary();
-    try {
-      // TODO : The max number of fragments (3) should be configurable
-      String[] result = highlighter.getBestFragments(tokens, text, 3);
-      for (int i=0; i<result.length; i++) {
-        String[] parts = result[i].split(SEPARATOR);
-        boolean highlight = false;
-        for (int j=0; j<parts.length; j++) {
-          if (highlight) {
-            summary.add(new Highlight(parts[j]));
-          } else {
-            summary.add(new Fragment(parts[j]));
-          }
-          highlight = !highlight;
-        }
-        summary.add(new Ellipsis());
-      }
-    } catch (Exception e) {
-      // Nothing to do...
-    }
-    return summary;
-  }
-
-  /* ---------------------------- *
-   * </implementation:Summarizer> *
-   * ---------------------------- */
-  
-}
Index: src/plugin/summary-lucene/src/java/org/apache/nutch/summary/lucene/package.html
===================================================================
--- src/plugin/summary-lucene/src/java/org/apache/nutch/summary/lucene/package.html	(revision 959954)
+++ src/plugin/summary-lucene/src/java/org/apache/nutch/summary/lucene/package.html	(working copy)
@@ -1,7 +0,0 @@
-<html>
-<body>
-<p>
-A Lucene Highlighter based summarizer implementation.
-</p>
-</body>
-</html>
Index: src/plugin/summary-lucene/plugin.xml
===================================================================
--- src/plugin/summary-lucene/plugin.xml	(revision 959954)
+++ src/plugin/summary-lucene/plugin.xml	(working copy)
@@ -1,44 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<plugin
-   id="summary-lucene"
-   name="Lucene Highlighter Summary Plug-in"
-   version="1.0.0"
-   provider-name="org.apache.nutch">
-
-   <runtime>
-      <library name="summary-lucene.jar">
-         <export name="*"/>
-      </library>
-      <library name="lucene-highlighter-3.0.1.jar"/>
-   </runtime>
-
-   <requires>
-      <import plugin="nutch-extensionpoints"/>
-   </requires>
-
-   <extension id="org.apache.nutch.summary.basic"
-              name="Lucene Highlighter Summarizer"
-              point="org.apache.nutch.searcher.Summarizer">
-
-      <implementation id="Basic Summarizer"
-                      class="org.apache.nutch.summary.lucene.LuceneSummarizer"/>
-
-   </extension>
-
-</plugin>
Index: src/plugin/summary-lucene/build.xml
===================================================================
--- src/plugin/summary-lucene/build.xml	(revision 959954)
+++ src/plugin/summary-lucene/build.xml	(working copy)
@@ -1,22 +0,0 @@
-<?xml version="1.0"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<project name="summary-lucene" default="jar-core">
-
-  <import file="../build-plugin.xml"/>
-
-</project>
Index: src/plugin/query-site/src/java/org/apache/nutch/searcher/site/SiteQueryFilter.java
===================================================================
--- src/plugin/query-site/src/java/org/apache/nutch/searcher/site/SiteQueryFilter.java	(revision 959954)
+++ src/plugin/query-site/src/java/org/apache/nutch/searcher/site/SiteQueryFilter.java	(working copy)
@@ -1,40 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.searcher.site;
-
-import org.apache.nutch.searcher.RawFieldQueryFilter;
-import org.apache.hadoop.conf.Configuration;
-
-/** Handles "site:" query clauses, causing them to search the field indexed by
- * SiteIndexingFilter. */
-public class SiteQueryFilter extends RawFieldQueryFilter {
-  private Configuration conf;
-
-  public SiteQueryFilter() {
-    super("site");
-  }
-  
-  public void setConf(Configuration conf) {
-    this.conf = conf;
-    setBoost(conf.getFloat("query.site.boost", 0.0f));
-  }
-
-  public Configuration getConf() {
-    return this.conf;
-  }
-}
Index: src/plugin/query-site/plugin.xml
===================================================================
--- src/plugin/query-site/plugin.xml	(revision 959954)
+++ src/plugin/query-site/plugin.xml	(working copy)
@@ -1,44 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<plugin
-   id="query-site"
-   name="Site Query Filter"
-   version="1.0.0"
-   provider-name="nutch.org">
-
-   <runtime>
-      <library name="query-site.jar">
-         <export name="*"/>
-      </library>
-   </runtime>
-
-   <requires>
-      <import plugin="nutch-extensionpoints"/>
-   </requires>
-
-   <extension id="org.apache.nutch.searcher.site.SiteQueryFilter"
-              name="Nutch Site Query Filter"
-              point="org.apache.nutch.searcher.QueryFilter">
-      <implementation id="SiteQueryFilter"
-                      class="org.apache.nutch.searcher.site.SiteQueryFilter">
-        <parameter name="raw-fields" value="site"/>
-      </implementation>
-      
-   </extension>
-
-</plugin>
Index: src/plugin/query-site/build.xml
===================================================================
--- src/plugin/query-site/build.xml	(revision 959954)
+++ src/plugin/query-site/build.xml	(working copy)
@@ -1,22 +0,0 @@
-<?xml version="1.0"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<project name="query-site" default="jar-core">
-
-  <import file="../build-plugin.xml"/>
-
-</project>
Index: src/plugin/field-boost/src/java/org/apache/nutch/indexer/field/boost/BoostFieldFilter.java
===================================================================
--- src/plugin/field-boost/src/java/org/apache/nutch/indexer/field/boost/BoostFieldFilter.java	(revision 959954)
+++ src/plugin/field-boost/src/java/org/apache/nutch/indexer/field/boost/BoostFieldFilter.java	(working copy)
@@ -1,84 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.indexer.field.boost;
-
-import java.util.ArrayList;
-import java.util.List;
-
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
-import org.apache.nutch.indexer.IndexingException;
-import org.apache.nutch.indexer.field.FieldFilter;
-import org.apache.nutch.indexer.field.FieldType;
-import org.apache.nutch.indexer.field.FieldWritable;
-import org.apache.nutch.indexer.field.Fields;
-
-/**
- * A field filter that indexes fields of content type Boost or type Computation.
- * 
- * Boost fields are aggregated together to create a global score for a single 
- * Lucene document in the index.  An example of a Boost fields would be the 
- * LinkRank score.
- */
-public class BoostFieldFilter
-  implements FieldFilter {
-
-  public static final Log LOG = LogFactory.getLog(BoostFieldFilter.class);
-  private Configuration conf;
-
-  public void setConf(Configuration conf) {
-    this.conf = conf;
-  }
-
-  public Configuration getConf() {
-    return this.conf;
-  }
-
-  public Document filter(String url, Document doc, List<FieldWritable> fields)
-    throws IndexingException {
-
-    List<String> comps = new ArrayList<String>();
-    float boost = 0.0f;
-
-    for (FieldWritable field : fields) {
-
-      // save the boost factor as unindexed fields, to show different scoring
-      FieldType type = field.getType();
-      if (type == FieldType.BOOST) {
-        float fieldBoost = field.getBoost();
-        boost += fieldBoost;
-        doc.add(new Field(Fields.BOOSTFACTOR, field.getValue() + ": "
-          + fieldBoost, Field.Store.YES, Field.Index.NO));
-      }
-      else if (type == FieldType.COMPUTATION) {
-        comps.add(field.getValue());
-      }
-    }
-
-    // set the boost for the document and save it in the index
-    doc.setBoost(boost);
-    doc.add(new Field(Fields.BOOST, Float.toString(boost), Field.Store.YES,
-      Field.Index.NO));
-    
-    
-    return doc;
-  }
-
-}
Index: src/plugin/field-boost/plugin.xml
===================================================================
--- src/plugin/field-boost/plugin.xml	(revision 959954)
+++ src/plugin/field-boost/plugin.xml	(working copy)
@@ -1,41 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<plugin
-   id="field-boost"
-   name="Boost Field Filter"
-   version="1.0.0"
-   provider-name="nutch.org">
-
-   <runtime>
-      <library name="field-boost.jar">
-         <export name="*"/>
-      </library>
-   </runtime>
-
-   <requires>
-      <import plugin="nutch-extensionpoints"/>
-   </requires>
-
-   <extension id="org.apache.nutch.indexer.field.boost"
-              name="Nutch Boost Field Filter"
-              point="org.apache.nutch.indexer.field.FieldFilter">
-      <implementation id="BoostFieldFilter"
-        class="org.apache.nutch.indexer.field.boost.BoostFieldFilter"/>
-   </extension>
-
-</plugin>
Index: src/plugin/field-boost/build.xml
===================================================================
--- src/plugin/field-boost/build.xml	(revision 959954)
+++ src/plugin/field-boost/build.xml	(working copy)
@@ -1,22 +0,0 @@
-<?xml version="1.0"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<project name="field-boost" default="jar-core">
-
-  <import file="../build-plugin.xml"/>
-
-</project>
Index: src/plugin/analysis-de/src/java/org/apache/nutch/analysis/de/GermanAnalyzer.java
===================================================================
--- src/plugin/analysis-de/src/java/org/apache/nutch/analysis/de/GermanAnalyzer.java	(revision 959954)
+++ src/plugin/analysis-de/src/java/org/apache/nutch/analysis/de/GermanAnalyzer.java	(working copy)
@@ -1,50 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.analysis.de;
-
-// JDK imports
-import java.io.Reader;
-
-// Lucene imports
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.util.Version;
-
-// Nutch imports
-import org.apache.nutch.analysis.NutchAnalyzer;
-
-
-/**
- * A simple German Analyzer that wraps the Lucene one.
- * @author Jerome Charron
- */
-public class GermanAnalyzer extends NutchAnalyzer {
-    
-    private final static Analyzer ANALYZER = 
-            new org.apache.lucene.analysis.de.GermanAnalyzer(Version.LUCENE_CURRENT);
-
-    
-    /** Creates a new instance of FrenchAnalyzer */
-    public GermanAnalyzer() { }
-
-
-    public TokenStream tokenStream(String fieldName, Reader reader) {
-        return ANALYZER.tokenStream(fieldName, reader);
-    }
-
-}
Index: src/plugin/analysis-de/plugin.xml
===================================================================
--- src/plugin/analysis-de/plugin.xml	(revision 959954)
+++ src/plugin/analysis-de/plugin.xml	(working copy)
@@ -1,46 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<plugin
-   id="analysis-de"
-   name="German Analysis Plug-in"
-   version="1.0.0"
-   provider-name="org.apache.nutch">
-
-   <runtime>
-      <library name="analysis-de.jar">
-         <export name="*"/>
-      </library>
-   </runtime>
-
-   <requires>
-      <import plugin="nutch-extensionpoints"/>
-      <import plugin="lib-lucene-analyzers"/>
-   </requires>
-
-   <extension id="org.apache.nutch.analysis.de"
-              name="GermanAnalyzer"
-              point="org.apache.nutch.analysis.NutchAnalyzer">
-
-      <implementation id="org.apache.nutch.analysis.de.GermanAnalyzer"
-                      class="org.apache.nutch.analysis.de.GermanAnalyzer">
-        <parameter name="lang" value="de"/>
-      </implementation>
-
-   </extension>
-
-</plugin>
Index: src/plugin/analysis-de/build.xml
===================================================================
--- src/plugin/analysis-de/build.xml	(revision 959954)
+++ src/plugin/analysis-de/build.xml	(working copy)
@@ -1,33 +0,0 @@
-<?xml version="1.0"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<project name="analysis-de" default="jar-core">
-
-  <import file="../build-plugin.xml"/>
-  
-  <target name="deps-jar">
-    <ant target="jar" inheritall="false" dir="../lib-lucene-analyzers"/>
-  </target>
-
-  <!-- Add compilation dependencies to classpath -->
-  <path id="plugin.deps">
-    <fileset dir="${nutch.root}/build">
-      <include name="**/lib-lucene-analyzers/*.jar" />
-    </fileset>
-  </path>
-
-</project>
Index: src/plugin/summary-basic/src/java/org/apache/nutch/summary/basic/BasicSummarizer.java
===================================================================
--- src/plugin/summary-basic/src/java/org/apache/nutch/summary/basic/BasicSummarizer.java	(revision 959954)
+++ src/plugin/summary-basic/src/java/org/apache/nutch/summary/basic/BasicSummarizer.java	(working copy)
@@ -1,429 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.summary.basic;
-
-// JDK imports
-import java.io.BufferedReader;
-import java.io.File;
-import java.io.FileReader;
-import java.io.IOException;
-import java.io.StringReader;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.Comparator;
-import java.util.Enumeration;
-import java.util.HashSet;
-import java.util.List;
-import java.util.SortedSet;
-import java.util.TreeSet;
-import java.util.Vector;
-
-// Hadoop imports
-import org.apache.hadoop.conf.Configuration;
-
-// Lucene imports
-import org.apache.lucene.analysis.Token;
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.*;
-
-// Nutch imports
-import org.apache.nutch.analysis.NutchDocumentAnalyzer;
-import org.apache.nutch.searcher.Query;
-import org.apache.nutch.searcher.Summarizer;
-import org.apache.nutch.searcher.Summary;
-import org.apache.nutch.searcher.Summary.Ellipsis;
-import org.apache.nutch.searcher.Summary.Fragment;
-import org.apache.nutch.searcher.Summary.Highlight;
-import org.apache.nutch.util.NutchConfiguration;
-
-
-/** Implements hit summarization. */
-public class BasicSummarizer implements Summarizer {
-  
-  private int sumContext = 5;
-  private int sumLength = 20;
-  private Analyzer analyzer = null;
-  private Configuration conf = null;
-  
-  private final static Comparator ORDER_COMPARATOR = new Comparator() {
-    public int compare(Object o1, Object o2) {
-      return ((Excerpt) o1).getOrder() - ((Excerpt) o2).getOrder();
-    }
-  };
-  
-  private final static Comparator SCORE_COMPARATOR = new Comparator() {
-    public int compare(Object o1, Object o2) {
-      Excerpt excerpt1 = (Excerpt) o1;
-      Excerpt excerpt2 = (Excerpt) o2;
-
-      if (excerpt1 == null && excerpt2 != null) {
-        return -1;
-      } else if (excerpt1 != null && excerpt2 == null) {
-        return 1;
-      } else if (excerpt1 == null && excerpt2 == null) {
-        return 0;
-      }
-
-      int numToks1 = excerpt1.numUniqueTokens();
-      int numToks2 = excerpt2.numUniqueTokens();
-
-      if (numToks1 < numToks2) {
-        return -1;
-      } else if (numToks1 == numToks2) {
-        return excerpt1.numFragments() - excerpt2.numFragments();
-      } else {
-        return 1;
-      }
-    }
-  };
-
-  
-  public BasicSummarizer() { }
-  
-  private BasicSummarizer(Configuration conf) {
-    setConf(conf);
-  }
-  
-  
-  /* ----------------------------- *
-   * <implementation:Configurable> *
-   * ----------------------------- */
-  
-  public Configuration getConf() {
-    return conf;
-  }
-  
-  public void setConf(Configuration conf) {
-    this.conf = conf;
-    this.analyzer = new NutchDocumentAnalyzer(conf);
-    this.sumContext = conf.getInt("searcher.summary.context", 5);
-    this.sumLength = conf.getInt("searcher.summary.length", 20);
-  }
-  
-  /* ------------------------------ *
-   * </implementation:Configurable> *
-   * ------------------------------ */
-  
-  
-  /* --------------------------- *
-   * <implementation:Summarizer> *
-   * --------------------------- */
-  
-  public Summary getSummary(String text, Query query) {
-    
-    // Simplistic implementation.  Finds the first fragments in the document
-    // containing any query terms.
-    //
-    // TODO: check that phrases in the query are matched in the fragment
-    
-    Token[] tokens = getTokens(text);             // parse text to token array
-    
-    if (tokens.length == 0)
-      return new Summary();
-    
-    String[] terms = query.getTerms();
-    HashSet highlight = new HashSet();            // put query terms in table
-    for (int i = 0; i < terms.length; i++)
-      highlight.add(terms[i]);
-    
-    // A list to store document's excerpts.
-    // (An excerpt is a Vector full of Fragments and Highlights)
-    List excerpts = new ArrayList();
-    
-    //
-    // Iterate through all terms in the document
-    //
-    int lastExcerptPos = 0;
-    for (int i = 0; i < tokens.length; i++) {
-      //
-      // If we find a term that's in the query...
-      //
-      if (highlight.contains(tokens[i].term())) {
-        //
-        // Start searching at a point SUM_CONTEXT terms back,
-        // and move SUM_CONTEXT terms into the future.
-        //
-        int startToken = (i > sumContext) ? i - sumContext : 0;
-        int endToken = Math.min(i + sumContext, tokens.length);
-        int offset = tokens[startToken].startOffset();
-        int j = startToken;
-        
-        //
-        // Iterate from the start point to the finish, adding
-        // terms all the way.  The end of the passage is always
-        // SUM_CONTEXT beyond the last query-term.
-        //
-        Excerpt excerpt = new Excerpt(i);
-        if (i != 0) {
-          excerpt.add(new Summary.Ellipsis());
-        }
-        
-        //
-        // Iterate through as long as we're before the end of
-        // the document and we haven't hit the max-number-of-items
-        // -in-a-summary.
-        //
-        while ((j < endToken) && (j - startToken < sumLength)) {
-          //
-          // Now grab the hit-element, if present
-          //
-          Token t = tokens[j];
-          if (highlight.contains(t.term())) {
-            excerpt.addToken(t.term());
-            excerpt.add(new Fragment(text.substring(offset, t.startOffset())));
-            excerpt.add(new Highlight(text.substring(t.startOffset(),t.endOffset())));
-            offset = t.endOffset();
-            endToken = Math.min(j + sumContext, tokens.length);
-          }
-          
-          j++;
-        }
-        
-        lastExcerptPos = endToken;
-        
-        //
-        // We found the series of search-term hits and added
-        // them (with intervening text) to the excerpt.  Now
-        // we need to add the trailing edge of text.
-        //
-        // So if (j < tokens.length) then there is still trailing
-        // text to add.  (We haven't hit the end of the source doc.)
-        // Add the words since the last hit-term insert.
-        //
-        if (j < tokens.length) {
-          excerpt.add(new Fragment(text.substring(offset,tokens[j].endOffset())));
-        }
-        
-        //
-        // Remember how many terms are in this excerpt
-        //
-        excerpt.setNumTerms(j - startToken);
-        
-        //
-        // Store the excerpt for later sorting
-        //
-        excerpts.add(excerpt);
-        
-        //
-        // Start SUM_CONTEXT places away.  The next
-        // search for relevant excerpts begins at i-SUM_CONTEXT
-        //
-        i = j + sumContext;
-      }
-    }
-    
-    // Sort the excerpts based on their score
-    Collections.sort(excerpts, SCORE_COMPARATOR);
-    
-    //
-    // If the target text doesn't appear, then we just
-    // excerpt the first SUM_LENGTH words from the document.
-    //
-    if (excerpts.size() == 0) {
-      Excerpt excerpt = new Excerpt(0);
-      int excerptLen = Math.min(sumLength, tokens.length);
-      lastExcerptPos = excerptLen;
-      
-      excerpt.add(new Fragment(text.substring(tokens[0].startOffset(), tokens[excerptLen-1].startOffset())));
-      excerpt.setNumTerms(excerptLen);
-      excerpts.add(excerpt);
-    }
-    
-    //
-    // Now choose the best items from the excerpt set.
-    // Stop when we have enought excerpts to build our Summary.
-    //
-    double tokenCount = 0;
-    int numExcerpt = excerpts.size()-1;
-    List bestExcerpts = new ArrayList();
-    while (tokenCount <= sumLength && numExcerpt >= 0) {
-      Excerpt excerpt = (Excerpt) excerpts.get(numExcerpt--);
-      bestExcerpts.add(excerpt);
-      tokenCount += excerpt.getNumTerms();
-    }    
-    // Sort the best excerpts based on their natural order
-    Collections.sort(bestExcerpts, ORDER_COMPARATOR);
-    
-    //
-    // Now build our Summary from the best the excerpts.
-    //
-    tokenCount = 0;
-    numExcerpt = 0;
-    Summary s = new Summary();
-    while (tokenCount <= sumLength && numExcerpt < bestExcerpts.size()) {
-      Excerpt excerpt = (Excerpt) bestExcerpts.get(numExcerpt++);
-      double tokenFraction = (1.0 * excerpt.getNumTerms()) / excerpt.numFragments();
-      for (Enumeration e = excerpt.elements(); e.hasMoreElements(); ) {
-        Fragment f = (Fragment) e.nextElement();
-        // Don't add fragments if it takes us over the max-limit
-        if (tokenCount + tokenFraction <= sumLength) {
-          s.add(f);
-        }
-        tokenCount += tokenFraction;
-      }
-    }
-    
-    if (tokenCount > 0 && lastExcerptPos < tokens.length)
-      s.add(new Ellipsis());
-    return s;
-  }
-  
-  /* ---------------------------- *
-   * </implementation:Summarizer> *
-   * ---------------------------- */
-  
-  /** Maximun number of tokens inspect in a summary . */
-  private static final int token_deep = 2000;
-
-  /**
-   * Class Excerpt represents a single passage found in the document, with some
-   * appropriate regions highlit.
-   */
-  class Excerpt {
-    Vector passages = new Vector();
-    SortedSet tokenSet = new TreeSet();
-    int numTerms = 0;
-    int order = 0;
-    
-    /**
-     */
-    public Excerpt(int order) {
-      this.order = order;
-    }
-    
-    /**
-     */
-    public void addToken(String token) {
-      tokenSet.add(token);
-    }
-    
-    /**
-     * Return how many unique toks we have
-     */
-    public int numUniqueTokens() {
-      return tokenSet.size();
-    }
-    
-    /**
-     * How many fragments we have.
-     */
-    public int numFragments() {
-      return passages.size();
-    }
-    
-    public void setNumTerms(int numTerms) {
-      this.numTerms = numTerms;
-    }
-    
-    public int getOrder() {
-      return order;
-    }
-    
-    public int getNumTerms() {
-      return numTerms;
-    }
-    
-    /**
-     * Add a frag to the list.
-     */
-    public void add(Fragment fragment) {
-      passages.add(fragment);
-    }
-    
-    /**
-     * Return an Enum for all the fragments
-     */
-    public Enumeration elements() {
-      return passages.elements();
-    }
-  }
-  
-  
-  private Token[] getTokens(String text) {
-    ArrayList<Token> result = new ArrayList<Token>();
-    TokenStream ts = analyzer.tokenStream("content", new StringReader(text));
-    TermAttribute termAtt = ts.getAttribute(TermAttribute.class);
-    OffsetAttribute offsetAtt = ts.getAttribute(OffsetAttribute.class);
-    PositionIncrementAttribute posIncrAtt = ts.getAttribute(PositionIncrementAttribute.class);
-    TypeAttribute typeAtt = ts.getAttribute(TypeAttribute.class);
-    try {
-      while (result.size() < token_deep && ts.incrementToken()) {
-        final Token token = new Token(
-            termAtt.termBuffer(), 0, termAtt.termLength(), 
-            offsetAtt.startOffset(), offsetAtt.endOffset());
-        token.setType(typeAtt.type());
-        token.setPositionIncrement(posIncrAtt.getPositionIncrement());
-        result.add(token);
-      }
-    } catch (IOException e) {
-      // Ignore (?)
-    }
-
-    try {
-      ts.close();
-    } catch (IOException e) {
-      // ignore
-    }
-    return (Token[]) result.toArray(new Token[result.size()]);
-  }
-  
-  /**
-   * Tests Summary-generation.  User inputs the name of a
-   * text file and a query string
-   */
-  public static void main(String argv[]) throws IOException {
-    // Test arglist
-    if (argv.length < 2) {
-      System.out.println("Usage: java org.apache.nutch.searcher.Summarizer <textfile> <queryStr>");
-      return;
-    }
-    
-    Configuration conf = NutchConfiguration.create();
-    Summarizer s = new BasicSummarizer(conf);
-    
-    //
-    // Parse the args
-    //
-    File textFile = new File(argv[0]);
-    StringBuffer queryBuf = new StringBuffer();
-    for (int i = 1; i < argv.length; i++) {
-      queryBuf.append(argv[i]);
-      queryBuf.append(" ");
-    }
-    
-    //
-    // Load the text file into a single string.
-    //
-    StringBuffer body = new StringBuffer();
-    BufferedReader in = new BufferedReader(new FileReader(textFile));
-    try {
-      System.out.println("About to read " + textFile + " from " + in);
-      String str = in.readLine();
-      while (str != null) {
-        body.append(str);
-        str = in.readLine();
-      }
-    } finally {
-      in.close();
-    }
-    
-    // Convert the query string into a proper Query
-    Query query = Query.parse(queryBuf.toString(), conf);
-    System.out.println("Summary: '" + s.getSummary(body.toString(), query) + "'");
-  }
-}
Index: src/plugin/summary-basic/src/java/org/apache/nutch/summary/basic/package.html
===================================================================
--- src/plugin/summary-basic/src/java/org/apache/nutch/summary/basic/package.html	(revision 959954)
+++ src/plugin/summary-basic/src/java/org/apache/nutch/summary/basic/package.html	(working copy)
@@ -1,7 +0,0 @@
-<html>
-<body>
-<p>
-A basic summarizer implementation.
-</p>
-</body>
-</html>
Index: src/plugin/summary-basic/plugin.xml
===================================================================
--- src/plugin/summary-basic/plugin.xml	(revision 959954)
+++ src/plugin/summary-basic/plugin.xml	(working copy)
@@ -1,43 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<plugin
-   id="summary-basic"
-   name="Basic Summarizer Plug-in"
-   version="1.0.0"
-   provider-name="org.apache.nutch">
-
-   <runtime>
-      <library name="summary-basic.jar">
-         <export name="*"/>
-      </library>
-   </runtime>
-
-   <requires>
-      <import plugin="nutch-extensionpoints"/>
-   </requires>
-
-   <extension id="org.apache.nutch.summary.basic"
-              name="Basic Summarizer"
-              point="org.apache.nutch.searcher.Summarizer">
-
-      <implementation id="Basic Summarizer"
-                      class="org.apache.nutch.summary.basic.BasicSummarizer"/>
-
-   </extension>
-
-</plugin>
Index: src/plugin/summary-basic/build.xml
===================================================================
--- src/plugin/summary-basic/build.xml	(revision 959954)
+++ src/plugin/summary-basic/build.xml	(working copy)
@@ -1,22 +0,0 @@
-<?xml version="1.0"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<project name="summary-basic" default="jar-core">
-
-  <import file="../build-plugin.xml"/>
-
-</project>
Index: src/plugin/query-basic/src/java/org/apache/nutch/searcher/basic/BasicQueryFilter.java
===================================================================
--- src/plugin/query-basic/src/java/org/apache/nutch/searcher/basic/BasicQueryFilter.java	(revision 959954)
+++ src/plugin/query-basic/src/java/org/apache/nutch/searcher/basic/BasicQueryFilter.java	(working copy)
@@ -1,239 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.searcher.basic;
-
-import org.apache.lucene.search.BooleanClause;
-import org.apache.lucene.search.BooleanQuery;
-import org.apache.lucene.search.PhraseQuery;
-import org.apache.lucene.search.TermQuery;
-
-import java.util.ArrayList;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Map;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-import org.apache.nutch.analysis.NutchDocumentAnalyzer;
-import org.apache.nutch.analysis.CommonGrams;
-
-import org.apache.nutch.searcher.QueryFilter;
-import org.apache.nutch.searcher.Query;
-import org.apache.nutch.searcher.Query.*;
-import org.apache.hadoop.conf.Configuration;
-
-/** The default query filter.  Query terms in the default query field are
- * expanded to search the url, anchor and content document fields.
- * Additional fields can be added by specifying parameters of the form : query.basic.(fieldname).boost
- * to the configuration files (see nutch-default.xml for an example).Such fields will be used in the clauses
- * generated by the BasicQueryFilter e.g. for a user query A B, it generates +(field1:A field2:A ...) +(field1:B field2:B....).
- * If you don't want the additional fields to be included in the clauses you will need to implement a custom query filter for it.
- **/
-public class BasicQueryFilter implements QueryFilter {
-    
-  private static final int  URL_BOOST       = 0;
-  private static final int  ANCHOR_BOOST    = 1;
-  private static final int  CONTENT_BOOST   = 2;
-  private static final int  TITLE_BOOST     = 3;
-  private static final int  HOST_BOOST      = 4;
-
-  private static int SLOP = Integer.MAX_VALUE;
-
-  private float PHRASE_BOOST;
-
-  private String[] FIELDS =
-  { "url", "anchor", "content", "title", "host" };
-
-  private float[] FIELD_BOOSTS = new float[5];
-
-  /**
-   * Set the boost factor for url matches, relative to content and anchor
-   * matches
-   */
-  public void setUrlBoost(float boost) { FIELD_BOOSTS[URL_BOOST] = boost; }
-
-  /** Set the boost factor for title/anchor matches, relative to url and
-   * content matches. */
-  public void setAnchorBoost(float boost) { FIELD_BOOSTS[ANCHOR_BOOST] = boost; }
-
-  /** Set the boost factor for sloppy phrase matches relative to unordered term
-   * matches. */
-  public void setPhraseBoost(float boost) { PHRASE_BOOST = boost; }
-
-  /** Set the maximum number of terms permitted between matching terms in a
-   * sloppy phrase match. */
-  public void setSlop(int slop) { SLOP = slop; }
-
-  private Configuration conf;
-
-  public BooleanQuery filter(Query input, BooleanQuery output) {
-    addTerms(input, output);
-    addSloppyPhrases(input, output);
-    return output;
-  }
-
-  private void addTerms(Query input, BooleanQuery output) {
-    Clause[] clauses = input.getClauses();
-    for (int i = 0; i < clauses.length; i++) {
-      Clause c = clauses[i];
-
-      if (!c.getField().equals(Clause.DEFAULT_FIELD))
-        continue;                                 // skip non-default fields
-
-      BooleanQuery out = new BooleanQuery();
-      for (int f = 0; f < FIELDS.length; f++) {
-
-        Clause o = c;
-        if (c.isPhrase()) {                         // optimize phrase clauses
-          String[] opt = new CommonGrams(getConf()).optimizePhrase(c.getPhrase(), FIELDS[f]);
-          if (opt.length==1) {
-            o = new Clause(new Term(opt[0]), c.isRequired(), c.isProhibited(), getConf());
-          } else {
-            o = new Clause(new Phrase(opt), c.isRequired(), c.isProhibited(), getConf());
-          }
-        }
-
-        out.add(o.isPhrase()
-                ? exactPhrase(o.getPhrase(), FIELDS[f], FIELD_BOOSTS[f])
-                : termQuery(FIELDS[f], o.getTerm(), FIELD_BOOSTS[f]),
-                BooleanClause.Occur.SHOULD);
-      }
-      output.add(out, (c.isProhibited()
-              ? BooleanClause.Occur.MUST_NOT
-              : (c.isRequired()
-                  ? BooleanClause.Occur.MUST
-                  : BooleanClause.Occur.SHOULD
-                )));
-    }
-  }
-
-  private void addSloppyPhrases(Query input, BooleanQuery output) {
-    Clause[] clauses = input.getClauses();
-    for (int f = 0; f < FIELDS.length; f++) {
-
-      PhraseQuery sloppyPhrase = new PhraseQuery();
-      sloppyPhrase.setBoost(FIELD_BOOSTS[f] * PHRASE_BOOST);
-      sloppyPhrase.setSlop("anchor".equals(FIELDS[f])
-                           ? NutchDocumentAnalyzer.INTER_ANCHOR_GAP
-                           : SLOP);
-      int sloppyTerms = 0;
-
-      for (int i = 0; i < clauses.length; i++) {
-        Clause c = clauses[i];
-        
-        if (!c.getField().equals(Clause.DEFAULT_FIELD))
-          continue;                               // skip non-default fields
-        
-        if (c.isPhrase())                         // skip exact phrases
-          continue;
-
-        if (c.isProhibited())                     // skip prohibited terms
-          continue;
-        
-        sloppyPhrase.add(luceneTerm(FIELDS[f], c.getTerm()));
-        sloppyTerms++;
-      }
-
-      if (sloppyTerms > 1)
-        output.add(sloppyPhrase, BooleanClause.Occur.SHOULD);
-    }
-  }
-
-
-  private org.apache.lucene.search.Query
-        termQuery(String field, Term term, float boost) {
-    TermQuery result = new TermQuery(luceneTerm(field, term));
-    result.setBoost(boost);
-    return result;
-  }
-
-  /** Utility to construct a Lucene exact phrase query for a Nutch phrase. */
-  private org.apache.lucene.search.Query
-       exactPhrase(Phrase nutchPhrase,
-                   String field, float boost) {
-    Term[] terms = nutchPhrase.getTerms();
-    PhraseQuery exactPhrase = new PhraseQuery();
-    for (int i = 0; i < terms.length; i++) {
-      exactPhrase.add(luceneTerm(field, terms[i]));
-    }
-    exactPhrase.setBoost(boost);
-    return exactPhrase;
-  }
-
-  /** Utility to construct a Lucene Term given a Nutch query term and field. */
-  private static org.apache.lucene.index.Term luceneTerm(String field,
-                                                         Term term) {
-    return new org.apache.lucene.index.Term(field, term.toString());
-  }
-
-  public void setConf(Configuration conf) {
-    this.conf = conf;
-    this.FIELD_BOOSTS[URL_BOOST] = conf.getFloat("query.url.boost", 4.0f);
-    this.FIELD_BOOSTS[ANCHOR_BOOST] = conf.getFloat("query.anchor.boost", 2.0f);
-    this.FIELD_BOOSTS[CONTENT_BOOST] = conf.getFloat("query.content.boost", 1.0f);
-    this.FIELD_BOOSTS[TITLE_BOOST] = conf.getFloat("query.title.boost", 1.5f);
-    this.FIELD_BOOSTS[HOST_BOOST] = conf.getFloat("query.host.boost", 2.0f);
-    this.PHRASE_BOOST = conf.getFloat("query.phrase.boost", 1.0f);
-    findAdditionalFields(conf);
-  }
-
-  public Configuration getConf() {
-    return this.conf;
-  }
-  
-  /** Searches for parameters of the form : query.basic.(fieldname).boost
-   * and adds the fielname to the list of default fields.
-   **/
-  private void findAdditionalFields(Configuration conf) {
-    // get additional fields specified in parameters
-    Pattern pat = Pattern.compile("query\\.basic\\.(.+)\\.boost");
-    Iterator confEntriesIterator = conf.iterator(); 
-    List existingFields = java.util.Arrays.asList(FIELDS);  
-    ArrayList tempfieldNames = new ArrayList();
-    ArrayList tempfieldBoosts = new ArrayList();
-    while (confEntriesIterator.hasNext()){
-      Map.Entry entry = (Map.Entry) confEntriesIterator.next();
-      String key = entry.getKey().toString();
-      Matcher match = pat.matcher(key);
-      if (!match.matches())continue;
-      String fieldName = match.group(1);
-      if (fieldName!=null){
-        // check whether it matches one of the fields which are used by default
-        if (existingFields.contains(fieldName)) continue;
-        // reserved keyword
-        if (fieldName.equals("phrase")) continue;
-        float boostCustomField = conf.getFloat(key, 2.0f);
-        tempfieldNames.add(fieldName);
-        tempfieldBoosts.add(Float.valueOf(boostCustomField));
-      }
-    }
-    if (tempfieldNames.size()==0) return;
-    // store additional fields names and boost values in corresponding fields
-    String[] tempNames = new String[5+tempfieldNames.size()];
-    float[] tempBoosts = new float[5+tempfieldNames.size()];
-    System.arraycopy(FIELDS, 0,tempNames, 0, 5);
-    System.arraycopy(this.FIELD_BOOSTS, 0,tempBoosts, 0, 5);
-    for (int newF=0; newF  < tempfieldNames.size();newF++){
-      tempNames[5+newF]=(String) tempfieldNames.get(newF);
-      tempBoosts[5+newF]= ((Float)tempfieldBoosts.get(newF)).floatValue();
-    }
-    // replace original fields
-    this.FIELDS = tempNames;
-    this.FIELD_BOOSTS = tempBoosts;
-  }
-}
Index: src/plugin/query-basic/plugin.xml
===================================================================
--- src/plugin/query-basic/plugin.xml	(revision 959954)
+++ src/plugin/query-basic/plugin.xml	(working copy)
@@ -1,44 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<plugin
-   id="query-basic"
-   name="Basic Query Filter"
-   version="1.0.0"
-   provider-name="nutch.org">
-
-   <runtime>
-      <library name="query-basic.jar">
-         <export name="*"/>
-      </library>
-   </runtime>
-
-   <requires>
-      <import plugin="nutch-extensionpoints"/>
-   </requires>
-
-   <extension id="org.apache.nutch.searcher.basic"
-              name="Nutch Basic Query Filter"
-              point="org.apache.nutch.searcher.QueryFilter">
-      <implementation id="BasicQueryFilter"
-                      class="org.apache.nutch.searcher.basic.BasicQueryFilter">
-        <parameter name="fields" value="DEFAULT"/>
-      </implementation>
-      
-   </extension>
-
-</plugin>
Index: src/plugin/query-basic/build.xml
===================================================================
--- src/plugin/query-basic/build.xml	(revision 959954)
+++ src/plugin/query-basic/build.xml	(working copy)
@@ -1,22 +0,0 @@
-<?xml version="1.0"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<project name="query-basic" default="jar-core">
-
-  <import file="../build-plugin.xml"/>
-
-</project>
Index: src/plugin/analysis-fr/src/java/org/apache/nutch/analysis/fr/FrenchAnalyzer.java
===================================================================
--- src/plugin/analysis-fr/src/java/org/apache/nutch/analysis/fr/FrenchAnalyzer.java	(revision 959954)
+++ src/plugin/analysis-fr/src/java/org/apache/nutch/analysis/fr/FrenchAnalyzer.java	(working copy)
@@ -1,50 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.analysis.fr;
-
-// JDK imports
-import java.io.Reader;
-
-// Lucene imports
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.util.Version;
-
-// Nutch imports
-import org.apache.nutch.analysis.NutchAnalyzer;
-
-
-/**
- * A simple French Analyzer that wraps the Lucene one.
- * @author Jerome Charron
- */
-public class FrenchAnalyzer extends NutchAnalyzer {
-    
-    private final static Analyzer ANALYZER = 
-            new org.apache.lucene.analysis.fr.FrenchAnalyzer(Version.LUCENE_CURRENT);
-
-    
-    /** Creates a new instance of FrenchAnalyzer */
-    public FrenchAnalyzer() { }
-
-
-    public TokenStream tokenStream(String fieldName, Reader reader) {
-        return ANALYZER.tokenStream(fieldName, reader);
-    }
-
-}
Index: src/plugin/analysis-fr/plugin.xml
===================================================================
--- src/plugin/analysis-fr/plugin.xml	(revision 959954)
+++ src/plugin/analysis-fr/plugin.xml	(working copy)
@@ -1,46 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<plugin
-   id="analysis-fr"
-   name="French Analysis Plug-in"
-   version="1.0.0"
-   provider-name="org.apache.nutch">
-
-   <runtime>
-      <library name="analysis-fr.jar">
-         <export name="*"/>
-      </library>
-   </runtime>
-
-   <requires>
-      <import plugin="nutch-extensionpoints"/>
-      <import plugin="lib-lucene-analyzers"/>
-   </requires>
-
-   <extension id="org.apache.nutch.analysis.fr"
-              name="French Analyzer"
-              point="org.apache.nutch.analysis.NutchAnalyzer">
-
-      <implementation id="FrenchAnalyzer"
-                      class="org.apache.nutch.analysis.fr.FrenchAnalyzer">
-        <parameter name="lang" value="fr"/>
-      </implementation>
-
-   </extension>
-
-</plugin>
Index: src/plugin/analysis-fr/build.xml
===================================================================
--- src/plugin/analysis-fr/build.xml	(revision 959954)
+++ src/plugin/analysis-fr/build.xml	(working copy)
@@ -1,34 +0,0 @@
-<?xml version="1.0"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<project name="analysis-fr" default="jar-core">
-
-  <import file="../build-plugin.xml"/>
-
-  <!-- Build compilation dependencies -->
-  <target name="deps-jar">
-    <ant target="jar" inheritall="false" dir="../lib-lucene-analyzers"/>
-  </target>
-
-  <!-- Add compilation dependencies to classpath -->
-  <path id="plugin.deps">
-    <fileset dir="${nutch.root}/build">
-      <include name="**/lib-lucene-analyzers/*.jar" />
-    </fileset>
-  </path>
-
-</project>
Index: src/plugin/nutch-extensionpoints/plugin.xml
===================================================================
--- src/plugin/nutch-extensionpoints/plugin.xml	(revision 959954)
+++ src/plugin/nutch-extensionpoints/plugin.xml	(working copy)
@@ -25,22 +25,10 @@
    Please not that plugins can define extension points as well to be extendable.-->
 
 <extension-point
-      id="org.apache.nutch.clustering.OnlineClusterer"
-      name="Nutch Online Search Results Clustering Plugin"/>
-
-<extension-point
-      id="org.apache.nutch.indexer.field.FieldFilter"
-      name="Nutch Field Filter"/>
-      
-<extension-point
       id="org.apache.nutch.indexer.IndexingFilter"
       name="Nutch Indexing Filter"/>
 
 <extension-point
-      id="org.apache.nutch.ontology.Ontology"
-      name="Ontology Model Loader"/>
-
-<extension-point
       id="org.apache.nutch.parse.Parser"
       name="Nutch Content Parser"/>
  
@@ -53,10 +41,6 @@
       name="Nutch Protocol"/>
 
 <extension-point
-      id="org.apache.nutch.searcher.QueryFilter"
-      name="Nutch Query Filter"/>
-
-<extension-point
       id="org.apache.nutch.net.URLFilter"
       name="Nutch URL Filter"/>
 
@@ -65,18 +49,6 @@
       name="Nutch URL Normalizer"/>
 
 <extension-point
-      id="org.apache.nutch.analysis.NutchAnalyzer"
-      name="Nutch Analysis"/>
-
-<extension-point
-      id="org.apache.nutch.searcher.response.ResponseWriter"
-      name="Nutch Search Results Response Writer"/>
-      
-<extension-point
-      id="org.apache.nutch.searcher.Summarizer"
-      name="Nutch Summarizer"/>
-
-<extension-point
       id="org.apache.nutch.scoring.ScoringFilter"
       name="Nutch Scoring"/>
 
Index: src/plugin/clustering-carrot2/lib/Jama-1.0.2.jar
===================================================================
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Index: src/plugin/clustering-carrot2/lib/carrot2-filter-lingo.jar
===================================================================
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Index: src/plugin/clustering-carrot2/lib/violinstrings-1.0.2.jar
===================================================================
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Index: src/plugin/clustering-carrot2/lib/carrot2-util-common.jar
===================================================================
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Index: src/plugin/clustering-carrot2/lib/jama.LICENSE
===================================================================
--- src/plugin/clustering-carrot2/lib/jama.LICENSE	(revision 959954)
+++ src/plugin/clustering-carrot2/lib/jama.LICENSE	(working copy)
@@ -1,6 +0,0 @@
-Copyright Notice This software is a cooperative product of The MathWorks and the 
-National Institute of Standards and Technology (NIST) which has been released to 
-the public domain. Neither The MathWorks nor NIST assumes any responsibility 
-whatsoever for its use by other parties, and makes no guarantees, expressed or 
-implied, about its quality, reliability, or any other characteristic. 
-
Index: src/plugin/clustering-carrot2/lib/violinstrings.LICENSE
===================================================================
--- src/plugin/clustering-carrot2/lib/violinstrings.LICENSE	(revision 959954)
+++ src/plugin/clustering-carrot2/lib/violinstrings.LICENSE	(working copy)
@@ -1,26 +0,0 @@
-Copyright (c) Michael Schmeling 1998, 2000 - All Rights Reserved
-
-Permission is hereby granted, free of charge, to any person obtaining a
-copy of this software and associated documentation files (the "Software"),
-to deal in the Software without restriction, including without limitation
-the rights to use, copy, modify, merge, publish, distribute, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, provided that the above copyright notice(s) and this
-permission notice appear in all copies of the Software and that both the
-above copyright notice(s) and this permission notice appear in supporting
-documentation.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
-IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE
-BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES,
-OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
-WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
-ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
-SOFTWARE.
-
-Except as contained in this notice, the name of a copyright holder shall
-not be used in advertising or otherwise to promote the sale, use or other
-dealings in this Software without prior written authorization of the
-copyright holder.
Index: src/plugin/clustering-carrot2/lib/commons-pool-1.3.jar
===================================================================
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Index: src/plugin/clustering-carrot2/lib/carrot2.LICENSE
===================================================================
--- src/plugin/clustering-carrot2/lib/carrot2.LICENSE	(revision 959954)
+++ src/plugin/clustering-carrot2/lib/carrot2.LICENSE	(working copy)
@@ -1,40 +0,0 @@
-
-Carrot2 Project
-
-Copyright (C) 2002-2006, Dawid Weiss, Stanisaw Osiski.
-Portions (C) Contributors listed in "carrot2.CONTRIBUTORS" file.
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without modification,
-are permitted provided that the following conditions are met:
-
-- Redistributions of  source code must  retain the above  copyright notice, this
-  list of conditions and the following disclaimer. 
-
-- Redistributions in binary form must reproduce the above copyright notice, this
-  list of conditions and the following  disclaimer in  the documentation  and/or
-  other materials provided with the distribution. 
-
-- Neither the name  of the Poznan University  of Technology, Poznan, Poland  nor
-  the names  of  its contributors may  be used  to endorse  or promote  products 
-  derived from this software without specific prior written permission.
-
-- We request that  you include in the  end-user documentation provided with  the
-  redistribution and/or in the software itself  an acknowledgement equivalent to
-  the following: "This  product  includes  software  developed  by  the  Carrot2
-  Project."
-  
-- No algorithms or technical solutions in the project may be patented or claimed
-  proprietary.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"  AND
-ANY EXPRESS OR  IMPLIED WARRANTIES, INCLUDING,  BUT NOT LIMITED  TO, THE IMPLIED
-WARRANTIES  OF  MERCHANTABILITY  AND  FITNESS  FOR  A  PARTICULAR  PURPOSE   ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE  FOR
-ANY DIRECT, INDIRECT, INCIDENTAL,  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL  DAMAGES
-(INCLUDING, BUT  NOT LIMITED  TO, PROCUREMENT  OF SUBSTITUTE  GOODS OR SERVICES;
-LOSS OF USE, DATA, OR PROFITS;  OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND  ON
-ANY  THEORY  OF  LIABILITY,  WHETHER  IN  CONTRACT,  STRICT  LIABILITY,  OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY  OUT OF THE USE  OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
Index: src/plugin/clustering-carrot2/lib/snowball.LICENSE
===================================================================
--- src/plugin/clustering-carrot2/lib/snowball.LICENSE	(revision 959954)
+++ src/plugin/clustering-carrot2/lib/snowball.LICENSE	(working copy)
@@ -1,26 +0,0 @@
-
-Copyright (c) 2001, Dr Martin Porter
-(for the Java developments) Copyright (c) 2002, Richard Boulton. 
-
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without modification, 
-are permitted provided that the following conditions are met:
-
-Redistributions of source code must retain the above copyright notice, this list 
-of conditions and the following disclaimer. Redistributions in binary form must 
-reproduce the above copyright notice, this list of conditions and the following 
-disclaimer in the documentation and/or other materials provided with the 
-distribution. Neither the name of the <ORGANIZATION> nor the names of its 
-contributors may be used to endorse or promote products derived from this 
-software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY 
-THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED 
-WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 
-MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT 
-SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 
-INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 
-PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 
-LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE 
-OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 
-ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
Index: src/plugin/clustering-carrot2/lib/commons-collections-3.2.jar
===================================================================
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Index: src/plugin/clustering-carrot2/lib/commons-pool.LICENSE
===================================================================
--- src/plugin/clustering-carrot2/lib/commons-pool.LICENSE	(revision 959954)
+++ src/plugin/clustering-carrot2/lib/commons-pool.LICENSE	(working copy)
@@ -1,60 +0,0 @@
-/*
- * $Revision: 206 $
- * $Date: 2004-06-19 18:26:22 +0200 (Sat, 19 Jun 2004) $
- *
- * ====================================================================
- *
- * The Apache Software License, Version 1.1
- *
- * Copyright (c) 1999-2003 The Apache Software Foundation.  All rights
- * reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *
- * 3. The end-user documentation included with the redistribution, if
- *    any, must include the following acknowledgement:
- *       "This product includes software developed by the
- *        Apache Software Foundation - http://www.apache.org/"
- *    Alternately, this acknowledgement may appear in the software itself,
- *    if and wherever such third-party acknowledgements normally appear.
- *
- * 4. The names "The Jakarta Project", "Commons", and "Apache Software
- *    Foundation" must not be used to endorse or promote products derived
- *    from this software without prior written permission. For written
- *    permission, please contact apache@apache.org.
- *
- * 5. Products derived from this software may not be called "Apache"
- *    nor may "Apache" appear in their names without prior written
- *    permission of the Apache Software Foundation.
- *
- * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
- * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
- * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
- * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
- * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
- * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- * ====================================================================
- *
- * This software consists of voluntary contributions made by many
- * individuals on behalf of the Apache Software Foundation.  For more
- * information on the Apache Software Foundation, please see
- * http://www.apache.org/
- *
- */
- 
\ No newline at end of file
Index: src/plugin/clustering-carrot2/lib/carrot2-snowball-stemmers.jar
===================================================================
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Index: src/plugin/clustering-carrot2/lib/carrot2-local-core.jar
===================================================================
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Index: src/plugin/clustering-carrot2/lib/carrot2.CONTRIBUTORS
===================================================================
--- src/plugin/clustering-carrot2/lib/carrot2.CONTRIBUTORS	(revision 959954)
+++ src/plugin/clustering-carrot2/lib/carrot2.CONTRIBUTORS	(working copy)
@@ -1,15 +0,0 @@
-# 
-# Carrot2 Contributors
-#
-# [*] -- not active.
-#
-# First name, surname name; Duties; Active from; Institution
-
-Dawid Weiss; Project administrator, various components, core; 2002; Poland
-Stanisław, Osiński; Lingo clustering component, ODP Input; 2003; Poland
-Karol Gołembniak, Irmina Masłowska; HAOG clustering component; 2006; Poznan University of Technology; Poland
-
-Michał, Wróblewski [*]; AHC clustering components; 2003; Poznan University of Technology, Poland
-Paweł, Kowalik [*]; Inductive search engine wrapper; 2003; Poznan University of Technology, Poland
-Steven, Schockaert [*]; Fuzzy Ants clustering component; 2004; University of Gent, Belgium
-Lang, Ngo Chi [*]; Fuzzy Rough set clustering component; 2004; Warsaw University, Poland
Index: src/plugin/clustering-carrot2/lib/carrot2-util-tokenizer.jar
===================================================================
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Index: src/plugin/clustering-carrot2/src/test/org/apache/nutch/clustering/carrot2/TestClusterer.java
===================================================================
--- src/plugin/clustering-carrot2/src/test/org/apache/nutch/clustering/carrot2/TestClusterer.java	(revision 959954)
+++ src/plugin/clustering-carrot2/src/test/org/apache/nutch/clustering/carrot2/TestClusterer.java	(working copy)
@@ -1,169 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.clustering.carrot2;
-
-import java.io.InputStream;
-import java.util.ArrayList;
-
-import javax.xml.parsers.DocumentBuilder;
-import javax.xml.parsers.DocumentBuilderFactory;
-
-import junit.framework.TestCase;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.clustering.HitsCluster;
-import org.apache.nutch.searcher.HitDetails;
-import org.w3c.dom.Document;
-import org.w3c.dom.Element;
-import org.w3c.dom.Node;
-import org.w3c.dom.NodeList;
-
-/**
- * A test case for the Carrot2-based clusterer plugin to Nutch.
- */
-public class TestClusterer extends TestCase {
-  private Clusterer c;
-  
-  public TestClusterer(String testName) {
-    super(testName);
-  }
-  
-  protected void setUp() throws Exception {
-    c = new Clusterer();
-    c.setConf(new Configuration());
-  }
-  
-  /**
-   * The clusterer should not fail on empty input, returning
-   * an empty array of {@link HitsCluster}.
-   */
-  public void testEmptyInput() {
-    final HitDetails [] hitDetails = new HitDetails[0];
-    final String [] descriptions = new String [0];
-    final HitsCluster [] clusters = c.clusterHits(hitDetails, descriptions);
-    assertTrue(clusters != null && clusters.length == 0);
-  }
-
-  /**
-   * Tests the clusterer on some cached data.
-   */
-  public void testOnCachedData() throws Exception {
-    final DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
-    final DocumentBuilder parser = factory.newDocumentBuilder();
-    final InputStream is = getClass().getResourceAsStream("test-input.xml");
-    assertNotNull("test-input.xml not found", is);
-    final Document document = parser.parse(is);
-    is.close();
-
-    final Element data = document.getDocumentElement();
-    final NodeList docs = data.getElementsByTagName("document");
-    
-    final ArrayList summaries = new ArrayList();
-    final ArrayList hitDetails = new ArrayList();
-
-    assertTrue(docs.getLength() > 0);
-    for (int i = 0; i < docs.getLength(); i++) {
-      final Element doc = (Element) docs.item(i);
-      assertTrue(doc.getNodeType() == Node.ELEMENT_NODE);
-      final Element urlElement = (Element) doc.getElementsByTagName("url").item(0);
-      final Element snippetElement = (Element) doc.getElementsByTagName("snippet").item(0);
-      final Element titleElement = (Element) doc.getElementsByTagName("title").item(0);
-
-      summaries.add(toText(titleElement) + " " + toText(snippetElement));
-      hitDetails.add(new HitDetails(
-          new String [] {"url"}, 
-          new String [] {toText(urlElement)}));
-    }
-
-    HitsCluster [] clusters = c.clusterHits(
-        (HitDetails[]) hitDetails.toArray(new HitDetails[hitDetails.size()]),
-        (String[]) summaries.toArray(new String[summaries.size()]));
-    
-    // There should be SOME clusters in the input... words distribution
-    // should not be random because some words have higher probability.
-    assertTrue(clusters != null);
-    assertTrue("Clusters expected, but not found.", clusters.length > 0);
-
-    // Check hit references inside clusters.
-    for (int i = 0; i < clusters.length; i++) {
-      assertTrue(clusters[i].getHits().length > 0);
-    }
-
-    /*
-    // Dump cluster content if you need to.
-    System.out.println("Clusters: " + clusters.length);
-    for (int i = 0; i < clusters.length; i++) {
-      dump(0, clusters[i]);
-    }
-    */
-  }
-  
-  /**
-   * Converts a {@link Element} to plain text.
-   */
-  private String toText(Element snippetElement) {
-    final StringBuffer buffer = new StringBuffer();
-    final NodeList list = snippetElement.getChildNodes();
-    for (int i = 0; i < list.getLength(); i++) {
-      Node n = list.item(i);
-      if (n.getNodeType() == Node.TEXT_NODE) {
-        buffer.append(n.getNodeValue());
-      } else if (n.getNodeType() == Node.CDATA_SECTION_NODE) {
-        n.getNodeValue();
-      } else throw new RuntimeException("Unexpected nested element when converting to text.");
-    }
-    return buffer.toString();
-  }
-
-  /**
-   * Dumps the content of {@link HitsCluster} to system output stream. 
-   */
-  private void dump(int level, HitsCluster cluster) {
-    String [] labels = cluster.getDescriptionLabels();
-    for (int indent = 0; indent<level; indent++) {
-      System.out.print( "   " );
-    }
-    System.out.print(">> ");
-    if (cluster.isJunkCluster()) System.out.print("(Junk) ");
-    System.out.print("CLUSTER: ");
-    for (int i=0;i<labels.length;i++) {
-      System.out.print( labels[i] + "; " );
-    }
-    System.out.println();
-    
-    HitsCluster [] subclusters = cluster.getSubclusters();
-    if (subclusters != null) {
-      for (int i=0;i<subclusters.length;i++) {
-        dump(level + 1, subclusters[i]);
-      }
-    }
-    
-    // dump documents.
-    HitDetails [] hits = cluster.getHits();
-    if (hits != null) {
-      for (int i=0;i<hits.length;i++ ) {
-        for (int indent = 0; indent<level; indent++) {
-          System.out.print( "   " );
-        }
-        System.out.print( hits[i].getValue("url") );
-        System.out.print( "; " );
-        System.out.println( hits[i].getValue("title") );
-      }
-    }
-  }
-}
Index: src/plugin/clustering-carrot2/src/test/org/apache/nutch/clustering/carrot2/test-input.xml
===================================================================
--- src/plugin/clustering-carrot2/src/test/org/apache/nutch/clustering/carrot2/test-input.xml	(revision 959954)
+++ src/plugin/clustering-carrot2/src/test/org/apache/nutch/clustering/carrot2/test-input.xml	(working copy)
@@ -1,303 +0,0 @@
-<searchresult>
-<query requested-results="100">data mining</query>
-<document id="0">	<url>http://www.kdnuggets.com/</url>
-	<title>KD Nuggets</title>
-	<snippet>Newsletter on the data mining and knowledge industries, offering information on data mining, knowledge discovery, text mining, and web mining software, courses, jobs, publications, and meetings.</snippet>
-</document><document id="1">	<url>http://en.wikipedia.org/wiki/Data_mining</url>
-	<title>Data Mining - Wikipedia</title>
-	<snippet>Article about knowledge-discovery in databases (KDD), the practice of automatically searching large stores of data for patterns.</snippet>
-</document><document id="2">	<url>http://www.thearling.com/</url>
-	<title>Thearling.com</title>
-	<snippet>Kurt Thearling&apos;s site dedicated to sharing information about data mining, the automated extraction of hidden predictive information from databases, and other analytic technologies.</snippet>
-</document><document id="3">	<url>http://www.the-data-mine.com/</url>
-	<title>The Data Mine</title>
-	<snippet>Provides information about data mining also known as knowledge discovery in databases (KDD) or simply knowledge discovery. List software, events, organizations, and people working in data mining.</snippet>
-</document><document id="4">	<url>http://www.data-miners.com/</url>
-	<title>Data Miners</title>
-	<snippet>Data mining consultancy; services include predictive modeling, consulting, and seminars.</snippet>
-</document><document id="5">	<url>http://www.dmg.org/</url>
-	<title>DMG</title>
-	<snippet>The Laboratory for Advanced Computing develops technologies for high performance computing, high performance networking, internet computing, data mining and related areas. ... Data Mining Group. DMG. DMG Menu ... The Data Mining Group (DMG) is an independent, vendor led group which develops data mining standards, such as the ...</snippet>
-</document><document id="6">	<url>http://www.twocrows.com/glossary.htm</url>
-	<title>Two Crows: Data mining glossary</title>
-	<snippet>Data mining terms concisely defined. ... factor in assessing the success of data mining. When applied to data, accuracy refers to the rate of ... For example, a data mining software system may have an API which ...</snippet>
-</document><document id="7">	<url>http://www.monografias.com/trabajos/datamining/datamining.shtml</url>
-	<title>Data Mining - Monografias.com</title>
-	<snippet>... Data Mining, la extracción de información oculta y predecible de grandes bases de ... de Información (Data Warehouse). Las herramientas de Data Mining predicen futuras tendencias y ...</snippet>
-</document><document id="8">	<url>http://www.ccsu.edu/datamining/resources.html</url>
-	<title>CCSU - Data Mining</title>
-	<snippet>Data Mining Resources. Resources. Groups. Data Sets. Papers on Data Mining. Commercial. Register at</snippet>
-</document><document id="9">	<url>http://www-db.stanford.edu/~ullman/mining/mining.html</url>
-	<title>Jeff Ullman&apos;s Data Mining Lecture Notes</title>
-	<snippet>Offers an introduction to various data mining applications and techniques: association-rule mining, low-support/high correlation, query flocks, searching the Web, web mining, and clustering.</snippet>
-</document><document id="10">	<url>http://www.statsoft.com/textbook/stdatmin.html</url>
-	<title>Electronic Statistics Textbook: Data Mining Techniques</title>
-	<snippet>Outlines the crucial concepts in data mining, defines the data warehousing process, and offers examples of computational and graphical exploratory data analysis techniques.</snippet>
-</document><document id="11">	<url>http://www.autonlab.org/tutorials</url>
-	<title>Statistical Data Mining Tutorials</title>
-	<snippet>Includes a set of tutorials on many aspects of statistical data mining, including the foundations of probability, the foundations of statistical data analysis, and most of the classic machine learning and data mining algorithms.</snippet>
-</document><document id="12">	<url>http://www.sas.com/technologies/data_mining</url>
-	<title>SAS | Data and Text Mining</title>
-	<snippet>... of information, the potential would be enormous. With data mining, the possibilities are endless ... almost upon its introduction, our data mining technology continues to receive rave ...</snippet>
-</document><document id="13">	<url>http://www.almaden.ibm.com/cs/quest</url>
-	<title>IBM Research | Almaden Research Center | test</title>
-	<snippet>... Privacy-preserving data mining - preserves privacy at the individual level, while still allowing accurate data mining models at the aggregate level ...</snippet>
-</document><document id="14">	<url>http://www.oracle.com/technology/products/bi/odm/</url>
-	<title>Oracle Data Mining</title>
-	<snippet>... user interface for Oracle Data Mining that helps data analysts mine their Oracle data to find valuable ... With Oracle Data Miner and Oracle Data Mining, the data never leaves the ...</snippet>
-</document><document id="15">	<url>http://www.cs.waikato.ac.nz/~ml/weka/book.html</url>
-	<title>Data Mining: Practical Machine Learning Tools and Techniques</title>
-	<snippet>Data Mining: Practical Machine Learning Tools and Techniques (Second Edition) Morgan Kaufmann. June 2005. 525 pages. Paper. ISBN 0-12-088407-0. Comments ... What&apos;s it all about? 1.1 Data mining and machine learning ...</snippet>
-</document><document id="16">	<url>http://www.ccsu.edu/datamining</url>
-	<title>Data Mining @ CCSU</title>
-	<snippet>Offers degrees and certificates in data mining. Allows students to explore cutting-edge data mining techniques and applications: market basket analysis, decision trees, neural networks, machine learning, web mining, and data modeling.</snippet>
-</document><document id="17">	<url>http://searchcrm.techtarget.com/sDefinition/0,,sid11_gci211901,00.html</url>
-	<title>data mining - a Whatis.com definition - see also: data miner, data analysis</title>
-	<snippet>... whatis.com: searchCRM.com Definitions - data mining ... about the future (This area of data mining is known as predictive analytics.) Data mining techniques are used in ...</snippet>
-</document><document id="18">	<url>http://www.ccsu.edu/datamining/master.html</url>
-	<title>CCSU - Data Mining</title>
-	<snippet>Master of Science Degree. Accredited by the State of Connecticut Department of Higher Education. ... Details on how to apply to the Master of Science in data mining may be found here ... the Master of Science in Data Mining should download the revised Planned Program ...</snippet>
-</document><document id="19">	<url>http://www.statserv.com/datamining.html</url>
-	<title>St@tServ - About Data Mining</title>
-	<snippet>... What is Data Mining ? &quot; Data mining is the process of discovering meaningful new correlations, patterns ... Gartner Group). &quot; Data mining is the exploration and analysis, by automatic ...</snippet>
-</document><document id="20">	<url>http://www.data-mine.com/</url>
-	<title>Data Mining Technologies, Inc.</title>
-	<snippet>Provides software and consulting for data mining.</snippet>
-</document><document id="21">	<url>http://www.the-data-mine.com/bin/view/Misc/DataMiningBooksAndPapers</url>
-	<title>Data Mining - Data Mining Books And Papers</title>
-	<snippet>... Mastering Data Mining Michael J. A. Berry, Gordon S ... method=&quot;POST&quot; action=&quot;http://buybox.amazon.com/o/dt/assoc/handle-buy-box=0471331236&quot;&amp;gt; Data Mining Techniques Michael J ...</snippet>
-</document><document id="22">	<url>http://www.computerworld.com/databasetopics/businessintelligence/datamining</url>
-	<title>Computerworld Data Mining</title>
-	<snippet>This special topic page focuses on data mining software and business intelligence tools. ... Latest on Data Mining. Q&amp;A: CA&apos;s new CTO discusses development, recruiting ... View more on Data Mining. Data Mining Feature. Group files complaint against &apos;adware&apos; firm ...</snippet>
-</document><document id="23">	<url>http://datamining.typepad.com/data_mining/</url>
-	<title>Data Mining</title>
-	<snippet>Current Reading. On the Stack. January 29, 2006. The Strength of BlogAnalytics. A while back, I wrote about how dangerous trend mining over blogs could be in the wrong hands. ... Data Mining. About. Weblogs ... company providing non-trivial analytics over blog data - or any other data for that mater - has already solved this ...</snippet>
-</document><document id="24">	<url>http://www.wessex.ac.uk/conferences/2002/datamining02</url>
-	<title>DATA MINING 2002 - Post Conference Report</title>
-	<snippet>... Third International Conference on Data Mining Methods and Databases for Engineering, Finance and ... The third international conference on Data Mining took place recently in Bologna ...</snippet>
-</document><document id="25">	<url>http://www.thearling.com/text/dmwhite/dmwhite.htm</url>
-	<title>An Introduction to Data Mining</title>
-	<snippet>... Data mining, the extraction of hidden predictive information from large databases, is a ... important information in their data warehouses. Data mining tools predict future trends ...</snippet>
-</document><document id="26">	<url>http://www.spss.com/datamine</url>
-	<title>Data Mining Software, Data Mining Applications and Data Mining Solutions</title>
-	<snippet>Data Mining at SPSS. Your source for data mining software, data mining tools, data mining applications and data mining solutions ... Most analysts separate data mining software into two groups: data mining tools and data mining applications. Data mining tools provide ...</snippet>
-</document><document id="27">	<url>http://www.onlamp.com/pub/a/onlamp/2004/04/08/datamining_email.html</url>
-	<title>ONLamp.com: Data Mining Email</title>
-	<snippet>Robert Bernier demonstrates how to store data from emails into a database, where you can use data-mining techniques to analyze it. ... What is data mining anyway? Data mining is a class of database applications that look for hidden patterns in a group of data ...</snippet>
-</document><document id="28">	<url>http://www.aaai.org/AITopics/html/mining.html</url>
-	<title>Data Mining and Discovery</title>
-	<snippet>AI Topics provides basic, understandable information and helpful resources concerning artificial intelligence, with an emphasis on material available online. ... Data Mining and Discovery. (a subtopic of Machine Learning ... Data mining is an AI powered tool that can discover useful information within a database that can then be used ...</snippet>
-</document><document id="29">	<url>http://www.research.microsoft.com/dmx/</url>
-	<title>Data Management, Exploration and Mining- Home</title>
-	<snippet>The Data Management Exploration and Mining Group (DMX). ... break down with massive data sets. Therefore, we aim at exploiting data mining techniques, i.e ... Our research effort in data mining focuses on ensuring that traditional techniques ...</snippet>
-</document><document id="30">	<url>http://www.dmreview.com/</url>
-	<title>DMReview</title>
-	<snippet>An issues and solutions publication that focuses on data warehousing as well as client/server and object technology for the enterprise.</snippet>
-</document><document id="31">	<url>http://www.megaputer.com/</url>
-	<title>Megaputer Intelligence</title>
-	<snippet>Manufactures multi-strategy data mining and text mining software solutions.</snippet>
-</document><document id="32">	<url>http://databases.about.com/od/datamining</url>
-	<title>Data Mining and Data Warehousing</title>
-	<snippet>The Net&apos;s best collection of data mining and data warehousing links from your About.com guide. From data mining tutorials to data warehousing techniques, you&apos;ll find it all! ... Benefits of Outsourcing Data Warehouse and Data Mining. Many organizations are seeking ...</snippet>
-</document><document id="33">	<url>http://www.pcc.qub.ac.uk/tec/courses/datamining/stu_notes/dm_book_1.html</url>
-	<title>Data Mining Student Notes, QUB</title>
-	<snippet>Data Mining. An Introduction. Student Notes. Ruth Dilly. Parallel Computer Centre. Queens University Belfast. Version 2.0. December1995 ... 1 - Data mining. 1.1 - What is data mining? 1.2 - Data mining background. 1.2.1 - Inductive learning ...</snippet>
-</document><document id="34">	<url>http://itmanagement.webopedia.com/TERM/D/data_mining.html</url>
-	<title>data mining - Webopedia.com</title>
-	<snippet>Search for more IT management terms . . . data mining. A class of database applications that look for hidden patterns in a group of data that can be used to predict future behavior. ... For example, data mining software can help retail companies find customers with common interests ... that presents data in new ways. True data mining software doesn&apos;t just change the ...</snippet>
-</document><document id="35">	<url>http://www.twocrows.com/</url>
-	<title>Two Crows Corporation</title>
-	<snippet>Dedicated to the development, marketing, sales and support of tools for knowledge discovery to make data mining accessible and easy to use.</snippet>
-</document><document id="36">	<url>http://databases.about.com/library/weekly/aa100700a.htm</url>
-	<title>Data Mining: An Introduction</title>
-	<snippet>Data mining allows you to find the needles hidden in your haystacks of data. Learn how to use these advanced techniques to meet your business objectives. ... heard a good deal about data mining -- the database industry&apos;s latest buzzword ... of automated statistical analysis (or &quot;data mining&quot;) techniques, businesses are discovering new ...</snippet>
-</document><document id="37">	<url>http://www.kdnuggets.com/software</url>
-	<title>Software for Data Mining and Knowledge Discovery</title>
-	<snippet>This is a directory of general-purpose data mining software. To suggest an entry, email to . See also domain-specific data-mining solutions.</snippet>
-</document><document id="38">	<url>http://www.anderson.ucla.edu/faculty/jason.frand/teacher/technologies/palace/datamining.htm</url>
-	<title>Data Mining: What is Data Mining?</title>
-	<snippet>Outlines what knowledge discovery, the process of analyzing data from different perspectives and summarizing it into useful information, can do and how it works.</snippet>
-</document><document id="39">	<url>http://www.megaputer.com/products/pa/index.php3</url>
-	<title>Data Mining Software</title>
-	<snippet>Megaputer offers data mining, text mining, and web data mining software tools for e-commerce, database marketing, and CRM; seminars, training and consulting on data mining. Customer ... and versatile suite of advanced data mining tools. PolyAnalyst incorporates the latest ... discovery to analyze both structured and unstructured data. The PolyAnalyst platform offers ...</snippet>
-</document><document id="40">	<url>http://www.sims.berkeley.edu/~hearst/papers/acl99/acl99-tdm.html</url>
-	<title>Untangling Text Data Mining</title>
-	<snippet>... Untangling Text Data Mining. Marti A. Hearst. School of Information Management &amp;amp; Systems ... The possibilities for data mining from large text collections are virtually untapped ...</snippet>
-</document><document id="41">	<url>http://www.megaputer.com/dm/dm101.php3</url>
-	<title>What is Data Mining</title>
-	<snippet>Megaputer offers data mining, text mining, and web data mining software tools for e-commerce, database marketing, and CRM; seminars, training and consulting on data mining. Customer ... in order to make informed business decisions. Data mining automates the process of finding relationships and patterns in ... In these situations data mining is your only real option ...</snippet>
-</document><document id="42">	<url>http://www.ncbi.nih.gov/Tools</url>
-	<title>NCBI Tools for Bioinformatics Research</title>
-	<snippet>... Tools for Data Mining. PubMed. Entrez. BLAST. OMIM. Books ... results of analyses that have been done on the sequence data. The amount and type of information presented depend ...</snippet>
-</document><document id="43">	<url>http://www.computerworld.com/databasetopics/businessintelligence/story/0,10801,103726,00.html?source=x10</url>
-	<title>Explainer: Data mining - Computerworld</title>
-	<snippet>Often used for predictive modeling, data mining is a subset of business intelligence that can help organizations better understand relationships among variables. ... into usable shape, however, requires sophisticated data mining tools. The same technology that police ... retailers, are ideal candidates for data mining technology. Wal-Mart Stores Inc ...</snippet>
-</document><document id="44">	<url>http://www.dmbenchmarking.com/</url>
-	<title>Data Mining Benchmarking Association (DMBA)</title>
-	<snippet>Association of companies and organizations working to identify &quot;best in class&quot; data mining processes through benchmarking studies.</snippet>
-</document><document id="45">	<url>http://datamining.typepad.com/</url>
-	<title>Data Mining</title>
-	<snippet>Current Reading. On the Stack. January 30, 2006. Fact versus Opinion. Information overload overload is becoming a serious problem for me. ... Data Mining. About. Weblogs ... company providing non-trivial analytics over blog data - or any other data for that mater - has already solved this ...</snippet>
-</document><document id="46">	<url>http://www.wessex.ac.uk/conferences/2005/data05</url>
-	<title>DATA MINING 2005</title>
-	<snippet>... International Conference on Data Mining, Text Mining and their Business Applications ... Conference on Data Mining, Text Mining and their Business Applications (Data Mining ...</snippet>
-</document><document id="47">	<url>http://www.galaxy.gmu.edu/stats/syllabi/DMLIST.html</url>
-	<title>URL&apos;s for Data Mining</title>
-	<snippet>URL&apos;s for Data Mining. The following URL&apos;s are some links to a variety of Data Mining webpages. They are not in any particular order. Actually, they are in the order I discovered (mined) them.</snippet>
-</document><document id="48">	<url>http://www.pcai.com/web/ai_info/data_warehouse_mining.html</url>
-	<title>PC AI - Data Warehouse and Data Mining</title>
-	<snippet>... Data Mining. Overview: Data mining or knowledge discovery is becoming more important as more and ... To Distributed Computing. Data Warehouse and Data Mining Information on the Internet ...</snippet>
-</document><document id="49">	<url>http://www.gr-fx.com/graf-fx.htm</url>
-	<title>Data Mining</title>
-	<snippet>... databases with graphs and queries using a technique called Data Mining. It is also a quick way to ... learn how to use another data mining product. All you have to ...</snippet>
-</document><document id="50">	<url>http://www.dwinfocenter.org/</url>
-	<title>Data Warehousing Information Center</title>
-	<snippet>Provides information on tools and techniques to design, build, maintain, and retrieve information from a data warehouse.</snippet>
-</document><document id="51">	<url>http://www.siam.org/meetings/sdm02</url>
-	<title>SIAM International Conference on Data Mining</title>
-	<snippet>Co-Sponsored by AHPCRC and University of Illinois at Chicago ... The field of data mining draws upon extensive work in areas such as; statistics ... presentation of recent results in data mining, including; applications, algorithms, software, and ...</snippet>
-</document><document id="52">	<url>http://www.oclc.org/research/projects/mining/</url>
-	<title>Data mining [OCLC - Projects]</title>
-	<snippet>Describes the goals, methodology, and timing of the Data mining project. ... Data mining. DCMI Registry DSpace Harvesting Economics of Digital Preservation Electronic Theses and Dissertations ... this end, the OCLC Research Data-Mining Research Area will focus on ...</snippet>
-</document><document id="53">	<url>http://www.stat.rutgers.edu/~madigan/datamining</url>
-	<title>Data Mining</title>
-	<snippet>... DATA MINING SPECIAL TOPICS CLASS ... will be using a draft version of Principles of Data Mining , by Hand, Mannila, and Smyth (MIT Press, forthcoming), as ...</snippet>
-</document><document id="54">	<url>http://dmoz.org/Computers/Software/Databases/Data_Mining</url>
-	<title>Open Directory - Computers: Software: Databases: Data Mining</title>
-	<snippet>the entire directory only in Databases/Data_Mining. See also: ... About.com on Data Mining - About.com presents a collection of original feature articles, net ... room dedicated to data mining and data warehousing topics. The Data Mine - Launched ...</snippet>
-</document><document id="55">	<url>http://www.investorhome.com/mining.htm</url>
-	<title>Investor Home - Data Mining</title>
-	<snippet>... Data Mining. The rapid evolution of computer technology in the last few decades has provided ... and consequences of &quot;data mining.&quot; Data mining involves searching through databases for ...</snippet>
-</document><document id="56">	<url>http://www.sas.com/technologies/analytics/datamining</url>
-	<title>SAS | Data and Text Mining</title>
-	<snippet>... of information, the potential would be enormous. With data mining, the possibilities are endless ... almost upon its introduction, our data mining technology continues to receive rave ...</snippet>
-</document><document id="57">	<url>http://www.wessex.ac.uk/conferences/2003/datamining03</url>
-	<title>Data Mining 2003</title>
-	<snippet>... Data Mining 2003. Fourth International Conference on Data Mining Including Building Applications for CRM ...</snippet>
-</document><document id="58">	<url>http://datamining.itsc.uah.edu/</url>
-	<title>ITSC Data Mining Solutions Center</title>
-	<snippet>... The ITSC Data Mining Solutions Center is the focal point for data mining research, development and services at ...</snippet>
-</document><document id="59">	<url>http://www.webopedia.com/TERM/D/data_mining.html</url>
-	<title>What is data mining? - A Word Definition From the Webopedia Computer Dictionary</title>
-	<snippet>This page describes the term data mining and lists other pages on the Web where you can find additional information. ... For example, data mining software can help retail companies find customers with common interests ... that presents data in new ways. True data mining software doesn&apos;t just change the ...</snippet>
-</document><document id="60">	<url>http://research.microsoft.com/dmx/DataMining/default.aspx</url>
-	<title>Data Mining Project</title>
-	<snippet>Search: All Research OnlineAll Microsoft.com. Data Mining: Efficient Data Exploration and Modeling. Overview. Goal ... will focus on exploiting data mining for advanced data summarization and also enable tighter ... database querying and data mining. Scalable Data Mining Algorithms: We are exploring ...</snippet>
-</document><document id="61">	<url>http://www.fas.org/sgp/crs/intel/RL31798.pdf</url>
-	<title>Data Mining: An Overview</title>
-	<snippet>... assessing risk, and product. retailing, data mining involves the use of data analysis tools to discover ... homeland security, data mining is often viewed as a potential means to ...</snippet>
-</document><document id="62">	<url>http://www.statsoftinc.com/</url>
-	<title>Data Mining, Statistical Analysis, Quality Control - STATISTICA Software</title>
-	<snippet>Statsoft is the creator of STATISTICA, the most comprehensive suite of data mining and statistical analysis software. ... StatSoft logo, STATISTICA, SEWSS, SEDAS, Data Miner, SEPATH and GTrees are trademarks ... more information on STATISTICA, data mining, data analysis, statistical analysis &amp;amp; enterprise ...</snippet>
-</document><document id="63">	<url>http://www.insightful.com/</url>
-	<title>Insightful Corporation</title>
-	<snippet>The developer of the technical calculation application Mathcad, as well as developer and provider of a variety of other software tools for users of PCs, Macintosh computers, and UNIX workstations.</snippet>
-</document><document id="64">	<url>http://www.ncdm.uic.edu/</url>
-	<title>National Center for Data Mining (NCDM) - University of Illinois at Chicago</title>
-	<snippet>Conducts research in: scaling algorithms, applications and systems to massive data sets, developing algorithms, applications, and systems for mining distributed data, and establishing standard languages, protocols, and services for data mining and predictive modeling.</snippet>
-</document><document id="65">	<url>http://www.computerworld.com/hardwaretopics/hardware/desktops/story/0,10801,43509,00.html</url>
-	<title>Data Mining - Computerworld</title>
-	<snippet>Data mining is a process that finds relationships and patterns within a large amount of data stored in a database. The process uses tools based on algorithms to sift through mounds of data to find relationships. ... What has data mining done for Dick&apos;s Supermarkets ... What&apos;s the basis of a good data mining program? You have to establish the integrity of your data because that&apos;s ...</snippet>
-</document><document id="66">	<url>http://www.the-data-mine.com/bin/view/Software/WebHome</url>
-	<title>Data Mining - Web Home (Software)</title>
-	<snippet>... To find Data Mining Software, check the Web Index, use Web Search or check the most recent changes (Web Changes ... Misc. General Data Mining Information - Introductions, Tutorials etc ...</snippet>
-</document><document id="67">	<url>http://www.rulequest.com/</url>
-	<title>Rulequest Research</title>
-	<snippet>Provides software tools for data mining and knowledge discovery in databases.</snippet>
-</document><document id="68">	<url>http://www.bos.frb.org/economic/nerr/rr2000/q3/mining.htm</url>
-	<title>Regional Review: Mining Data</title>
-	<snippet>Mining Data. Quarter 3, 2000. by Miriam Wasserman. SCENE 1: It&apos;s late November 1999. The Celtics are struggling with their second lineup. ... They both include the use of data-mining computer technology to search for patterns in data ... player&apos;s potential is maximized. Although data mining by itself is not going to get ...</snippet>
-</document><document id="69">	<url>http://www.cisl.ucar.edu/hps/GROUPS/dm/dm.html</url>
-	<title>Data Mining Resources</title>
-	<snippet>... and Zantige, D. Data Mining, Harlow, UK: Addison-Wesley, 1996. Berry, M.J.A. and Linoff, G., Data Mining Techniques for Marketing, Sales, and Customer Support, New York, NY: John ...</snippet>
-</document><document id="70">	<url>http://www.wessex.ac.uk/conferences/2004/datamining04</url>
-	<title>DATA MINING 2004</title>
-	<snippet>... Fifth International Conference on Data Mining, Text Mining and their Business Applications ... 5th International Conference on Data Mining, Text Mining and their Business Applications ...</snippet>
-</document><document id="71">	<url>http://www.amazon.com/exec/obidos/tg/detail/-/1558605525?v=glance</url>
-	<title>Amazon.com: Data Mining: Practical Machine Learning Tools and Techniques with Java Implementations (The Morgan ... </title>
-	<snippet>... Topics covered: Data mining and machine learning basics, sample datasets and applications for data mining ... in the synthesis of data mining, data analysis, information theory and ...</snippet>
-</document><document id="72">	<url>http://www.sas.com/technologies/analytics/datamining/miner</url>
-	<title>SAS | SAS Enterprise Miner</title>
-	<snippet>... Miner streamlines the entire data mining process from data access to model deployment by ... It provides a powerful, complete data mining solution with unparalleled model development ...</snippet>
-</document><document id="73">	<url>http://ocw.mit.edu/OcwWeb/Sloan-School-of-Management/15-062Data-MiningSpring2003/CourseHome</url>
-	<title>MIT OpenCourseWare | Sloan School of Management | 15.062 Data Mining, Spring 2003 | Home</title>
-	<snippet>... marts specifically intended for management decision support. Data mining is a rapidly growing field that is ... The field of data mining has evolved from the disciplines of statistics ...</snippet>
-</document><document id="74">	<url>http://www.data-mining-guide.net/</url>
-	<title>Data Mining Software | Guide to Data Mining Software &amp; Concepts</title>
-	<snippet>What is Data Mining? Data Mining is the process of analyzing large data sets in order to find patterns that can help to isolate key variables to build predictive models for management decision making. ... In essence, data mining helps businesses to optimize their processes so that ...</snippet>
-</document><document id="75">	<url>http://www.cse.ohio-state.edu/~srini/694Z</url>
-	<title>CIS 694Z: Introduction to Data Mining</title>
-	<snippet>... discovery process, key data mining techniques, efficient high performance mining algorithms, exposure to applications of data mining (bioinformatics and intrusion detection ...</snippet>
-</document><document id="76">	<url>http://www.gao.gov/new.items/d05866.pdf</url>
-	<title>GAO-05-866 Data Mining: Agencies Have Taken Key Steps to Protect Privacy in Selected Efforts, but Significant ... </title>
-	<snippet>... The five data mining efforts we reviewed are used by federal agencies to ... individual privacy rights are being appropriately protected. Data mining—a technique for ...</snippet>
-</document><document id="77">	<url>http://datamining.typepad.com/data_mining/2005/08/rumour_mull.html</url>
-	<title>Data Mining: Rumour Mull</title>
-	<snippet>... Data Mining. About. Weblogs ... for 2005-08-15 from Emergence Marketing. Data Mining: Rumour Mull Interesting analysis of the Technorati takeover rumour ...</snippet>
-</document><document id="78">	<url>http://www.crm2day.com/data_mining</url>
-	<title>CRM Today - Data Mining &amp; Data Warehousing</title>
-	<snippet>... Abstract: The field of data mining, like statistics, concerns itself with ... at the connection between data mining and statistics, and ask ourselves whether data mining is &quot;statistical ...</snippet>
-</document><document id="79">	<url>http://www.kdnuggets.com/meetings</url>
-	<title>Meetings and Conferences in Data Mining and Knowledge Discovery</title>
-	<snippet>Meetings and Conferences in Data Mining, Knowledge Discovery, Genomic Mining, and Web Mining. March 7: Proposals due for. March 7: Proposals due for. 23-24 Oct, M2006, SAS 9th annual Data Mining Technology Conference, Las Vegas, NV, USA. ... with The second workshop on Algorithmic Techniques for Data Mining 2006 (ATDM 2006 ...</snippet>
-</document><document id="80">	<url>http://www.siam.org/meetings/sdm01</url>
-	<title>First SIAM International Conference on Data Mining</title>
-	<snippet>Registration. is Closed. Advances in information technology and data collection methods have led to the availability of large data sets in commercial enterprises and in a wide variety of scientific and engineering disciplines. ... The field of data mining draws upon extensive work in areas such as statistics ... presentation of recent results in data mining, including applications, algorithms, software, and ...</snippet>
-</document><document id="81">	<url>http://crm.ittoolbox.com/topics/t.asp?t=520&amp;p=520&amp;h1=520</url>
-	<title>CRM Analytical Data Mining</title>
-	<snippet>... Quality&apos; Model (Line56)- Learning from the past; data mining and Service Quality provide roadmaps, but CRM ... trade-off analysis. Data Mining in Depth: Data Mining and Privacy (DM ...</snippet>
-</document><document id="82">	<url>http://www.statoo.com/sections/Datamining/</url>
-	<title>Statoo Consulting, Statistical Consulting + Data Analysis + Data Mining Services, Lausanne, Switzerland</title>
-	<snippet>Statoo Consulting is a vendor independent Swiss consulting firm specialized in statistical consulting and training, data analysis, data mining, analytical CRM and bioinformatics services. ... Statistical Consulting + Data Analysis + Data Mining Services. Lausanne, Switzerland. Séminaire de méthodologie en data mining statistique, 6-8 Mars, 2006, Paris, France ...</snippet>
-</document><document id="83">	<url>http://www.cio.com/research/data/</url>
-	<title>Knowledge Management - Data Storage &amp; Mining - Warehouse, OLAP, glossary resources - Knowledge Management RC - CIO</title>
-	<snippet>CIO Data Storage &amp;amp; Mining Research Center is a compilation of articles, case studies, organizations, conferences, glossary of terms, and white papers related to data storage, mining/OLAP, and data warehousing.</snippet>
-</document><document id="84">	<url>http://www.thearling.com/dmintro/dmintro.htm</url>
-	<title>An Introduction to Data Mining by Kurt Thearling</title>
-	<snippet>7-Mar-03: An Introduction to Data Mining</snippet>
-</document><document id="85">	<url>http://www.stayfreemagazine.org/archives/14/datamining.html</url>
-	<title>Data Mining</title>
-	<snippet>... is arguably at the cutting edge of &quot;data mining&quot;: a new kind of information analysis that ... positively timid by comparison. Data mining uses artificial intelligence software to hunt ...</snippet>
-</document><document id="86">	<url>http://www.siam.org/meetings/sdm05</url>
-	<title>SIAM 2005 Data Mining Conference</title>
-	<snippet>... The field of data mining draws upon extensive work in areas ... and high-performance data mining. Distributed data mining. Scalable algorithms. Integration: mining, warehousing and OLAP ...</snippet>
-</document><document id="87">	<url>http://www.jcp.org/en/jsr/detail?id=73</url>
-	<title>The Java Community Process(SM) Program - JSRs: Java Specification Requests - detail JSR# 73</title>
-	<snippet>... and maintain data and metadata supporting data mining models, data scoring, and data mining results serving J2EE ... agreed upon, standard API for data mining. By using JDMAPI ...</snippet>
-</document><document id="88">	<url>http://www.megaputer.com/dm/index.php3</url>
-	<title>Data Mining Introduction</title>
-	<snippet>Megaputer offers data mining, text mining, and web data mining software tools for e-commerce, database marketing, and CRM; seminars, training and consulting on data mining. Customer ... Data Mining. What is data mining? PolyAnalyst Machine Learning Algorithms ... &quot;Data Mining is the process of identifying valid, novel, potentially useful, and ultimately comprehensible ...</snippet>
-</document><document id="89">	<url>http://www.healthcare-informatics.com/issues/2004/04_04/hagland.htm</url>
-	<title>Healthcare Informatics: Data Mining</title>
-	<snippet>... Data Mining. Stronger computer tools allow deeper analysis of medical research, patient care and ... well the tremendous potential of data mining--using software programs for pattern ...</snippet>
-</document><document id="90">	<url>http://www.dmreview.com/article_sub.cfm?articleId=1010449</url>
-	<title>Volume Analytics: Duo-Mining: Combining Data and Text Mining</title>
-	<snippet>... As standalone capabilities, the pattern-finding technologies of data mining and text mining have been around for years ... of all, what are data mining and text mining? They are similar ...</snippet>
-</document><document id="91">	<url>http://www.itworld.com/App/110/050805datamining</url>
-	<title>ITworld.com - Data mining</title>
-	<snippet>... it into usable shape, however, requires sophisticated data mining tools. The same technology that police departments ... How does data mining work? Data mining is a subset of business ...</snippet>
-</document><document id="92">	<url>http://www.statsoft.com/textbook/glosd.html</url>
-	<title>daniell (or equal weight) window. in time series, the daniell ...</title>
-	<snippet>... Data Mining. StatSoft defines data mining as an analytic process designed to ... information, see Data Mining. Data Preparation Phase. In Data Mining, the input data are often &quot;noisy ...</snippet>
-</document><document id="93">	<url>http://oracle.ittoolbox.com/topics/t.asp?t=427&amp;p=427&amp;h1=427</url>
-	<title>Oracle Business Intelligence Data Mining</title>
-	<snippet>... Sub-topic definition: Data Mining is a method of searching data with mathematical algorithms to identify ... the product evaluation process for Data Mining software. Oracle-BI-l - The ...</snippet>
-</document><document id="94">	<url>http://www.time.com/time/globalbusiness/article/0,9171,1101021223-400017,00.html?cnn=yes</url>
-	<title>TIME.com: Data Miners -- Dec. 23, 2002 -- Page 1</title>
-	<snippet>New software instantly connects key bits of data that once eluded teams of researchers ... The data-mining algorithms of ClearForest, based in New York City, are at work within both ... And these days, data-mining software, combined with technologies that connect disparate ...</snippet>
-</document><document id="95">	<url>http://www.sqlserverdatamining.com/</url>
-	<title>SQL Server Data Mining</title>
-	<snippet>sql server | data mining. Happy Birthday to SQLServerDataMining.com! ... .com with the mission to let the world know about the data mining functionality in SQL Server and help them use it ...</snippet>
-</document><document id="96">	<url>http://www.kdd.org/</url>
-	<title>Knowledge Discovery and Data Mining Foundation</title>
-	<snippet>Have you heard about ACM SIGKDD, the newly formed society for knowledge discovery and data mining? Click here to see the brand new ACM SIGKDD web page. KnowledgeDiscovery &amp;amp; Data Mining ... starting point for exploring Internet resources in knowledge discovery and data mining ...</snippet>
-</document><document id="97">	<url>http://www.knightsbridge.com/solutions/client/professional/requirements/mining.php</url>
-	<title>Data Mining</title>
-	<snippet>... Data mining is a powerful data warehousing technology to assist users with the abundance ... that they have collected. Data mining uses sophisticated statistical analyses and modeling ...</snippet>
-</document><document id="98">	<url>http://www.comp.nus.edu.sg/~dm2</url>
-	<title>DM II - Data Mining II</title>
-	<snippet>The DM-II system has two downloadable tools: CBA (v2.1) and IAS. CBA (v2.1) (Last Modify June, 25, 2001) is a data mining tool developed at School of Computing, National University of Singapore. ... Integrating Classification and Association Rule Mining&quot; (KDD-98). Further improvements were made ...</snippet>
-</document><document id="99">	<url>http://www.thearling.com/text/dmviz/modelviz.htm</url>
-	<title>Visualizing Data Mining Models</title>
-	<snippet>... Visualizing Data Mining Models. by Kurt Thearling, Barry Becker, Dennis DeCoste, Bill Mawby ... is going on. Since data mining usually involves extracting &quot;hidden&quot; information from ...</snippet>
-</document></searchresult>
\ No newline at end of file
Index: src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/NutchDocument.java
===================================================================
--- src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/NutchDocument.java	(revision 959954)
+++ src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/NutchDocument.java	(working copy)
@@ -1,65 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.clustering.carrot2;
-
-import org.apache.nutch.searcher.HitDetails;
-import org.carrot2.core.clustering.RawDocument;
-import org.carrot2.core.clustering.RawDocumentBase;
-
-/**
- * An adapter class that implements {@link RawDocument} required for Carrot2.  
- */
-public class NutchDocument extends RawDocumentBase {
-  /**
-   * Integer identifier of this document. We need a subclass of 
-   * {@link java.lang.Object}, so this should do.
-   */
-  private final Integer id;
-  
-  /**
-   * Creates a new document with the given id, <code>summary</code> and wrapping
-   * a <code>details</code> hit details.
-   */
-  public NutchDocument(int id, HitDetails details, String summary, String defaultLanguage) {
-    super(details.getValue("url"), details.getValue("title"), summary);
-
-    // Handle document language -- attempt to extract it from the details,
-    // otherwise set to the default.
-    String lang = details.getValue("lang");
-    if (lang == null) {
-      // No default language. Take the default from the configuration file.
-      lang = defaultLanguage;
-    }
-
-    // Use this language for the snippet. Truncate longer ISO codes
-    // to only include two-letter language code.
-    if (lang.length() > 2) {
-      lang = lang.substring(0, 2);
-    }
-    lang = lang.toLowerCase();    
-    super.setProperty(RawDocument.PROPERTY_LANGUAGE, lang);
-
-    this.id = Integer.valueOf(id);
-  }
-
-  /*
-   * @see com.dawidweiss.carrot.core.local.clustering.RawDocument#getId()
-   */
-  public Object getId() {
-    return id;
-  }
-}
\ No newline at end of file
Index: src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/NutchInputComponent.java
===================================================================
--- src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/NutchInputComponent.java	(revision 959954)
+++ src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/NutchInputComponent.java	(working copy)
@@ -1,108 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.clustering.carrot2;
-
-import java.util.Map;
-import java.util.Set;
-
-import org.apache.nutch.searcher.HitDetails;
-import org.carrot2.core.LocalInputComponentBase;
-import org.carrot2.core.ProcessingException;
-import org.carrot2.core.RequestContext;
-import org.carrot2.core.clustering.RawDocumentsConsumer;
-import org.carrot2.core.clustering.RawDocumentsProducer;
-
-/**
- * An input component that ignores the query passed from the
- * controller and instead looks for data stored in the request context.
- * This enables us to reuse the same physical component implementation
- * for data that has already been acquired from Nutch.
- */
-public class NutchInputComponent extends LocalInputComponentBase {
-  public final static String NUTCH_INPUT_HIT_DETAILS_ARRAY
-    = "NUTCH_INPUT_HIT_DETAILS_ARRAY";
-
-  public final static String NUTCH_INPUT_SUMMARIES_ARRAY 
-    = "NUTCH_INPUT_SUMMARIES_ARRAY";
-
-  /** Capabilities required from the next component in the chain */
-  private final static Set SUCCESSOR_CAPABILITIES = toSet(RawDocumentsConsumer.class);
-
-  /** This component's capabilities */
-  private final static Set COMPONENT_CAPABILITIES = toSet(RawDocumentsProducer.class);
-
-  /**
-   * Default language code for hits that don't have their own.
-   */
-  private String defaultLanguage;
-
-  /**
-   * Creates an input component with the given default language code.
-   */
-  public NutchInputComponent(String defaultLanguage) {
-    this.defaultLanguage = defaultLanguage;
-  }
-
-  /*
-   * @see com.dawidweiss.carrot.core.local.LocalInputComponent#setQuery(java.lang.String)
-   */
-  public void setQuery(String query) {
-      // ignore the query; data will be provided from the request context.
-  }
-
-  /**
-   * A callback hook that starts the processing.
-   */
-  public void startProcessing(RequestContext context) throws ProcessingException {
-    // let successor components know that the processing has started.
-    super.startProcessing(context);
-    
-    // get the information about documents from the context.
-    final Map params = context.getRequestParameters();
-    final HitDetails [] details = (HitDetails[]) params.get(NUTCH_INPUT_HIT_DETAILS_ARRAY);
-    final String [] summaries = (String[]) params.get(NUTCH_INPUT_SUMMARIES_ARRAY);
-
-    if (details == null)
-      throw new ProcessingException("Details array must not be null.");
-
-    if (summaries == null)
-      throw new ProcessingException("Summaries array must not be null.");
-
-    if (summaries.length != details.length)
-      throw new ProcessingException("Summaries and details must be of the same length.");
-    
-    // produce 'documents' for successor components.
-    final RawDocumentsConsumer consumer = (RawDocumentsConsumer) next;
-    for (int i = 0; i < summaries.length; i++) {
-      consumer.addDocument(new NutchDocument(i, details[i], summaries[i], defaultLanguage));
-    }
-  }
-
-  /**
-   * Returns the capabilities provided by this component.
-   */
-  public Set getComponentCapabilities() {
-    return COMPONENT_CAPABILITIES;
-  }
-    
-  /**
-   * Returns the capabilities required from the successor component.
-   */
-  public Set getRequiredSuccessorCapabilities() {
-    return SUCCESSOR_CAPABILITIES;
-  }
-}
Index: src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/Clusterer.java
===================================================================
--- src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/Clusterer.java	(revision 959954)
+++ src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/Clusterer.java	(working copy)
@@ -1,330 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.clustering.carrot2;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.HashMap;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Map;
-
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-import org.apache.hadoop.conf.Configurable;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.log4j.Level;
-import org.apache.log4j.Logger;
-import org.apache.nutch.clustering.HitsCluster;
-import org.apache.nutch.clustering.OnlineClusterer;
-import org.apache.nutch.searcher.HitDetails;
-import org.carrot2.core.DuplicatedKeyException;
-import org.carrot2.core.InitializationException;
-import org.carrot2.core.LocalComponent;
-import org.carrot2.core.LocalComponentFactory;
-import org.carrot2.core.LocalControllerBase;
-import org.carrot2.core.LocalProcess;
-import org.carrot2.core.LocalProcessBase;
-import org.carrot2.core.MissingComponentException;
-import org.carrot2.core.MissingProcessException;
-import org.carrot2.core.ProcessingResult;
-import org.carrot2.core.clustering.RawCluster;
-import org.carrot2.core.controller.ControllerHelper;
-import org.carrot2.core.controller.LoaderExtensionUnknownException;
-import org.carrot2.core.impl.ArrayOutputComponent;
-import org.carrot2.core.linguistic.Language;
-import org.carrot2.filter.lingo.local.LingoLocalFilterComponent;
-import org.carrot2.util.tokenizer.languages.AllKnownLanguages;
-
-
-
-/**
- * This plugin provides an implementation of {@link OnlineClusterer} 
- * extension using clustering components of the Carrot2 project
- * (<a href="http://www.carrot2.org">http://www.carrot2.org</a>).
- * 
- * <p>This class hardcodes an equivalent of the following Carrot2 process:
- * <pre><![CDATA[
- * <local-process id="yahoo-lingo">
- *   <name>Yahoo Search API -- Lingo Classic Clusterer</name>
- * 
- *   <input  component-key="input-nutch" />
- *   <filter component-key="filter-lingo" />
- *   <output component-key="output-clustersConsumer" />
- * </local-process>
- * ]]></pre>
- */
-public class Clusterer implements OnlineClusterer, Configurable {
-  /** Default language property name. */
-  private final static String CONF_PROP_DEFAULT_LANGUAGE =
-    "extension.clustering.carrot2.defaultLanguage";
-
-  /** Recognizable languages property name. */
-  private final static String CONF_PROP_LANGUAGES =
-    "extension.clustering.carrot2.languages";
-
-  /** Internal clustering process ID in Carrot2 LocalController */
-  private final static String PROCESS_ID = "nutch-lingo";
-  
-  public static final Log logger = LogFactory.getLog(Clusterer.class);  
-
-  /** The LocalController instance used for clustering */
-  private LocalControllerBase controller;
-
-  /** Nutch configuration. */
-  private Configuration conf;
-
-  /** 
-   * Default language for hits. English by default, but may be changed
-   * via a property in Nutch configuration. 
-   */
-  private String defaultLanguage = "en";
-
-  /** 
-   * A list of recognizable languages..
-   * English only by default, but configurable via Nutch configuration.
-   */
-  private String [] languages = new String [] {defaultLanguage};
-
-  /**
-   * An empty public constructor for making new instances
-   * of the clusterer.
-   */
-  public Clusterer() {
-    // Don't forget to call {@link #setConf(Configuration)}.
-  }
-
-  /**
-   * See {@link OnlineClusterer} for documentation.
-   */
-  public HitsCluster [] clusterHits(HitDetails [] hitDetails, String [] descriptions) {
-    if (this.controller == null) {
-      logger.error("initialize() not called.");
-      return new HitsCluster[0];
-    }
-
-    final Map requestParams = new HashMap();
-    requestParams.put(NutchInputComponent.NUTCH_INPUT_HIT_DETAILS_ARRAY,
-      hitDetails);
-    requestParams.put(NutchInputComponent.NUTCH_INPUT_SUMMARIES_ARRAY,
-      descriptions);
-
-    try {
-      // The input component takes Nutch's results so we don't need the query argument.
-      final ProcessingResult result = 
-        controller.query(PROCESS_ID, "no-query", requestParams);
-
-      final ArrayOutputComponent.Result output =
-        (ArrayOutputComponent.Result) result.getQueryResult();
-
-      final List outputClusters = output.clusters;
-      final HitsCluster [] clusters = new HitsCluster[ outputClusters.size() ];
-
-      int j = 0;
-      for (Iterator i = outputClusters.iterator(); i.hasNext(); j++) {
-        RawCluster rcluster = (RawCluster) i.next();
-        clusters[j] = new HitsClusterAdapter(rcluster, hitDetails);
-      }
-
-      // invoke Carrot2 process here.
-      return clusters;
-    } catch (MissingProcessException e) {
-      throw new RuntimeException("Missing clustering process.", e);
-    } catch (Exception e) {
-      throw new RuntimeException("Unidentified problems with the clustering.", e);
-    }
-  }
-
-  /**
-   * Implementation of {@link Configurable}
-   */
-  public void setConf(Configuration conf) {
-    this.conf = conf;
-
-    // Configure default language and other component settings.
-    if (conf.get(CONF_PROP_DEFAULT_LANGUAGE) != null) {
-      // Change the default language.
-      this.defaultLanguage = conf.get(CONF_PROP_DEFAULT_LANGUAGE);
-    } 
-    if (conf.getStrings(CONF_PROP_LANGUAGES) != null) {
-      this.languages = conf.getStrings(CONF_PROP_LANGUAGES);
-    }
-
-    if (logger.isInfoEnabled()) {
-      logger.info("Default language: " + defaultLanguage);
-      logger.info("Enabled languages: " + Arrays.asList(languages));
-    }
-
-    initialize();
-  }
-
-  /**
-   * Implementation of {@link Configurable}
-   */
-  public Configuration getConf() {
-    return conf;
-  }
-  
-  /**
-   * Initialize clustering processes and Carrot2 components.
-   */
-  private synchronized void initialize() {
-    // Initialize language list, temporarily switching off logging
-    // of warnings. This is a bit of a hack, but we don't want to
-    // redistribute the entire Carrot2 distro and this prevents
-    // nasty ClassNotFound warnings.
-    final Logger c2Logger = Logger.getLogger("org.carrot2");
-    final Level original = c2Logger.getLevel();
-    c2Logger.setLevel(Level.ERROR);
-    AllKnownLanguages.getLanguageCodes();
-    c2Logger.setLevel(original);
-
-    // Initialize the controller.    
-    controller = new LocalControllerBase();
-
-    final Configuration nutchConf = getConf();
-    final String processResource = nutchConf.get(
-        "extension.clustering.carrot2.process-resource");
-
-    if (processResource == null) {
-      logger.info("Using default clustering algorithm (Lingo).");
-      addDefaultProcess();
-    } else {
-      logger.info("Using custom clustering process: " + processResource);
-      controller.setComponentAutoload(true);
-      
-      final ControllerHelper helper = new ControllerHelper();
-      final InputStream is = Thread.currentThread()
-        .getContextClassLoader().getResourceAsStream(processResource);
-      if (is != null) {
-        try {
-          final LocalComponentFactory nutchInputFactory = new LocalComponentFactory() {
-            public LocalComponent getInstance() {
-              return new NutchInputComponent(defaultLanguage);
-            }
-          };
-          controller.addLocalComponentFactory("input-nutch", nutchInputFactory);
-          
-          final LocalProcess process = helper.loadProcess(
-              helper.getExtension(processResource), is).getProcess();
-          controller.addProcess(PROCESS_ID, process);
-          is.close();
-        } catch (IOException e) {
-          logger.error("Could not load process resource: " + processResource, e);
-        } catch (LoaderExtensionUnknownException e) {
-          logger.error("Unrecognized extension of process resource: " + processResource);
-        } catch (InstantiationException e) {
-          logger.error("Could not instantiate process: " + processResource, e);
-        } catch (InitializationException e) {
-          logger.error("Could not initialize process: " + processResource, e);
-        } catch (DuplicatedKeyException e) {
-          logger.error("Duplicated key (unreachable?): " + processResource, e);
-        } catch (MissingComponentException e) {
-          logger.error("Some components are missing, could not initialize process: " 
-              + processResource, e);
-        }
-      } else {
-        logger.error("Could not find process resource: " + processResource);
-      }
-    }
-  }
-
-  /**
-   * Adds a default clustering process using Lingo algorithm.
-   */
-  private void addDefaultProcess() {
-    try {
-      addComponentFactories();
-      addProcesses();
-    } catch (DuplicatedKeyException e) {
-      logger.fatal("Duplicated component or process identifier.", e);
-    }
-  }
-
-  /** Adds the required component factories to a local Carrot2 controller. */
-  private void addComponentFactories() throws DuplicatedKeyException {
-    //  *   <input  component-key="input-nutch" />
-    LocalComponentFactory nutchInputFactory = new LocalComponentFactory() {
-      public LocalComponent getInstance() {
-        return new NutchInputComponent(defaultLanguage);
-      }
-    };
-    controller.addLocalComponentFactory("input-nutch", nutchInputFactory);
-
-    // *   <filter component-key="filter-lingo" />
-    LocalComponentFactory lingoFactory = new LocalComponentFactory() {
-      public LocalComponent getInstance() {
-        final HashMap defaults = new HashMap();
-
-        // These are adjustments settings for the clustering algorithm.
-        // If you try the live WebStart demo of Carrot2 you can see how they affect
-        // the final clustering: http://www.carrot2.org 
-        defaults.put("lsi.threshold.clusterAssignment", "0.150");
-        defaults.put("lsi.threshold.candidateCluster",  "0.775");
-
-        // Initialize a new Lingo clustering component.
-        ArrayList languageList = new ArrayList(languages.length);
-        for (int i = 0; i < languages.length; i++) {
-          final String lcode = languages[i];
-          try {
-            final Language lang = AllKnownLanguages.getLanguageForIsoCode(lcode);
-            if (lang == null) {
-              logger.warn("Language not supported in Carrot2: " + lcode);
-            } else {
-              languageList.add(lang);
-              logger.debug("Language loaded: " + lcode);
-            }
-          } catch (Throwable t) {
-              logger.warn("Language could not be loaded: " + lcode, t);
-          }
-        }
-        return new LingoLocalFilterComponent(
-          (Language []) languageList.toArray(new Language [languageList.size()]), defaults);
-      }
-    };
-    controller.addLocalComponentFactory("filter-lingo", lingoFactory);
-
-    // *   <output component-key="output-clustersConsumer" />
-    LocalComponentFactory clusterConsumerOutputFactory = new LocalComponentFactory() {
-      public LocalComponent getInstance() {
-        return new ArrayOutputComponent();
-      }
-    };
-    controller.addLocalComponentFactory("output-array", 
-      clusterConsumerOutputFactory);
-  }
-
-  /** 
-   * Adds a hardcoded clustering process to the local controller.
-   */  
-  private void addProcesses() {
-    final LocalProcessBase process = new LocalProcessBase(
-        "input-nutch",
-        "output-array",
-        new String [] {"filter-lingo"},
-        "The Lingo clustering algorithm (www.carrot2.org).",
-        "");
-
-    try {
-      controller.addProcess(PROCESS_ID, process);
-    } catch (Exception e) {
-      throw new RuntimeException("Could not assemble clustering process.", e);
-    }
-  }  
-}
Index: src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/HitsClusterAdapter.java
===================================================================
--- src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/HitsClusterAdapter.java	(revision 959954)
+++ src/plugin/clustering-carrot2/src/java/org/apache/nutch/clustering/carrot2/HitsClusterAdapter.java	(working copy)
@@ -1,108 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.clustering.carrot2;
-
-import java.util.Iterator;
-import java.util.List;
-
-import org.apache.nutch.clustering.HitsCluster;
-import org.apache.nutch.searcher.HitDetails;
-import org.carrot2.core.clustering.RawCluster;
-import org.carrot2.core.clustering.RawDocument;
-
-/**
- * An adapter of Carrot2's {@link RawCluster} interface to
- * {@link HitsCluster} interface. 
- */
-public class HitsClusterAdapter implements HitsCluster {
-  private RawCluster rawCluster;
-  private HitDetails [] hits;
-
-  /**
-   * Lazily initialized subclusters array.
-   */
-  private HitsCluster [] subclusters;
-  
-  /**
-   * Lazily initialized documents array.
-   */
-  private HitDetails [] documents;
-  
-  /**
-   * Creates a new adapter.
-   */
-  public HitsClusterAdapter(RawCluster rawCluster, HitDetails [] hits) {
-    this.rawCluster = rawCluster;
-    this.hits = hits;
-  }
-
-  /*
-   * @see org.apache.nutch.clustering.HitsCluster#getSubclusters()
-   */
-  public HitsCluster[] getSubclusters() {
-    if (this.subclusters == null) {
-      final List rawSubclusters = rawCluster.getSubclusters();
-      if (rawSubclusters == null || rawSubclusters.size() == 0) {
-        subclusters = null;
-      } else {
-        subclusters = new HitsCluster[rawSubclusters.size()];
-        int j = 0;
-        for (Iterator i = rawSubclusters.iterator(); i.hasNext(); j++) {
-          RawCluster c = (RawCluster) i.next();
-          subclusters[j] = new HitsClusterAdapter(c, hits);
-        }
-      }
-    }
-
-    return subclusters;
-  }
-
-  /*
-   * @see org.apache.nutch.clustering.HitsCluster#getHits()
-   */
-  public HitDetails[] getHits() {
-    if (documents == null) {
-      List rawDocuments = this.rawCluster.getDocuments();
-      documents = new HitDetails[ rawDocuments.size() ];
-      
-      int j = 0;
-      for (Iterator i = rawDocuments.iterator(); i.hasNext(); j++) {
-        RawDocument doc = (RawDocument) i.next();
-        Integer offset = (Integer) doc.getId();
-        documents[j] = this.hits[offset.intValue()];
-      }
-    }
-
-    return documents;
-  }
-
-  /*
-   * @see org.apache.nutch.clustering.HitsCluster#getDescriptionLabels()
-   */
-  public String[] getDescriptionLabels() {
-    List phrases = this.rawCluster.getClusterDescription();
-    return (String []) phrases.toArray( new String [ phrases.size() ]);
-  }
-
-  /*
-   * @see org.apache.nutch.clustering.HitsCluster#isJunkCluster()
-   */
-  public boolean isJunkCluster() {
-    return rawCluster.getProperty(RawCluster.PROPERTY_JUNK_CLUSTER) != null;
-  }
-}
-
Index: src/plugin/clustering-carrot2/readme.txt
===================================================================
--- src/plugin/clustering-carrot2/readme.txt	(revision 959954)
+++ src/plugin/clustering-carrot2/readme.txt	(working copy)
@@ -1,7 +0,0 @@
-This plugin extension adds search results clustering capability to Nutch search 
-frontend.
-
-Carrot2 JARs come from codebase in version: 2.1
-
-See the WIKI for more information about configuration and installation
-of this plugin.
Index: src/plugin/clustering-carrot2/plugin.xml
===================================================================
--- src/plugin/clustering-carrot2/plugin.xml	(revision 959954)
+++ src/plugin/clustering-carrot2/plugin.xml	(working copy)
@@ -1,56 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<plugin
-   id="clustering-carrot2"
-   name="Online Search Results Clustering using Carrot2's components"
-   version="1.0.3"
-   provider-name="www.carrot2.org">
-
-   <runtime>
-      <library name="clustering-carrot2.jar">
-         <export name="*"/>
-      </library>
-
-      <!--
-	   The defaults for Lingo. If you plan to use another clustering
-	   algorithm from the Carrot2 project, you'll need all the JARs
-	   required for that algorithm.
-	-->
-      <library name="carrot2-filter-lingo.jar"/>
-      <library name="carrot2-local-core.jar"/>
-      <library name="carrot2-snowball-stemmers.jar"/>
-      <library name="carrot2-util-common.jar"/>
-      <library name="carrot2-util-tokenizer.jar"/>
-
-      <library name="commons-collections-3.2.jar"/>
-      <library name="commons-pool-1.3.jar"/>
-      <library name="Jama-1.0.2.jar"/>
-      <library name="violinstrings-1.0.2.jar"/>
-   </runtime>
-
-   <requires>
-      <import plugin="nutch-extensionpoints"/>
-   </requires>
-
-   <extension id="org.apache.nutch.clustering.carrot2"
-              name="Carrot2 Clusterer"
-              point="org.apache.nutch.clustering.OnlineClusterer">
-      <implementation id="Carrot2"
-                      class="org.apache.nutch.clustering.carrot2.Clusterer"/>
-   </extension>
-</plugin>
Index: src/plugin/clustering-carrot2/build.xml
===================================================================
--- src/plugin/clustering-carrot2/build.xml	(revision 959954)
+++ src/plugin/clustering-carrot2/build.xml	(working copy)
@@ -1,46 +0,0 @@
-<?xml version="1.0"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<project name="clustering-carrot2" default="jar-core">
-  <import file="../build-plugin.xml"/>
-
-  <!-- Build compilation dependencies -->
-  <target name="deps-jar">
-    <ant target="jar" inheritall="false" dir="../lib-nekohtml"/>
-  </target>
-
-  <!-- Add compilation dependencies to classpath -->
-  <path id="plugin.deps">
-    <fileset dir="${nutch.root}/build">
-      <include name="**/lib-nekohtml/*.jar" />
-    </fileset>
-    <fileset dir="${nutch.root}/lib">
-      <include name="commons-lang-*.jar" />
-    </fileset>
-  </path>
-
-  <!-- Deploy Unit test dependencies -->
-  <target name="deps-test">
-    <ant target="deploy" inheritall="false" dir="../lib-nekohtml"/>
-    <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/>
-
-	<copy toDir="${build.test}">
-		<fileset dir="${src.test}" excludes="**/*.java" />
-	</copy>
-  </target>
-
-</project>
Index: src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCDeleteUnlicensedTool.java
===================================================================
--- src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCDeleteUnlicensedTool.java	(revision 959954)
+++ src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCDeleteUnlicensedTool.java	(working copy)
@@ -1,105 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.creativecommons.nutch;
-
-import org.apache.nutch.indexer.Indexer;
-
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.store.FSDirectory;
-import org.apache.lucene.document.Document;
-
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-
-import java.io.*;
-import java.util.Vector;
-
-/** Deletes documents in a set of Lucene indexes that do not have a Creative
- * Commons license. */
-public class CCDeleteUnlicensedTool {
-  private static final Log LOG = LogFactory.getLog(CCDeleteUnlicensedTool.class);
-
-  private IndexReader[] readers;
-
-  /** Constructs a duplicate detector for the provided indexes. */
-  public CCDeleteUnlicensedTool(IndexReader[] readers) {
-    this.readers = readers;
-  }
-
-  /** Closes the indexes, saving changes. */
-  public void close() throws IOException {
-    for (int i = 0; i < readers.length; i++)
-      readers[i].close();
-  }
-
-  /** Delete pages without CC licenes. */
-  public int deleteUnlicensed() throws IOException {
-    int deleteCount = 0;
-    for (int index = 0; index < readers.length; index++) {
-      IndexReader reader = readers[index];
-      int readerMax = reader.maxDoc();
-      for (int doc = 0; doc < readerMax; doc++) {
-        if (!reader.isDeleted(doc)) {
-          Document document = reader.document(doc);
-          if (document.get(CCIndexingFilter.FIELD)==null){ // no CC fields
-            reader.deleteDocument(doc);                    // delete it
-            deleteCount++;
-          }
-        }
-      }
-    }
-    return deleteCount;
-  }
-
-  /** Delete duplicates in the indexes in the named directory. */
-  public static void main(String[] args) throws Exception {
-    String usage = "CCDeleteUnlicensedTool <segmentsDir>";
-
-    if (args.length != 1) {
-      System.err.println("Usage: " + usage);
-      return;
-    } 
-
-    String segmentsDir = args[0];
-
-    File[] directories = new File(segmentsDir).listFiles();
-    Vector vReaders=new Vector();
-    int maxDoc = 0;
-    for (int i = 0; i < directories.length; i++) {
-      File indexDone = new File(directories[i], Indexer.DONE_NAME);
-      if (indexDone.exists() && indexDone.isFile()){
-        File indexDir = new File(directories[i], "index");
-      	IndexReader reader = IndexReader.open(FSDirectory.open(indexDir));
-        maxDoc += reader.maxDoc();
-        vReaders.add(reader);
-      }
-    }
-
-    IndexReader[] readers=new IndexReader[vReaders.size()];
-    for(int i = 0; vReaders.size()>0; i++) {
-      readers[i]=(IndexReader)vReaders.remove(0);
-    }
-
-    CCDeleteUnlicensedTool dd = new CCDeleteUnlicensedTool(readers);
-    int count = dd.deleteUnlicensed();
-    if (LOG.isInfoEnabled()) {
-      LOG.info("CC: deleted "+count+" out of "+maxDoc);
-    }
-    dd.close();
-  }
-}
Index: src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java
===================================================================
--- src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java	(revision 959954)
+++ src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java	(working copy)
@@ -24,7 +24,6 @@
 import org.apache.nutch.indexer.IndexingFilter;
 import org.apache.nutch.indexer.IndexingException;
 import org.apache.nutch.indexer.NutchDocument;
-import org.apache.nutch.indexer.lucene.LuceneWriter;
 import org.apache.hadoop.io.Text;
 
 import org.apache.nutch.crawl.CrawlDatum;
@@ -111,10 +110,6 @@
     doc.add(FIELD, feature);
   }
 
-  public void addIndexBackendOptions(Configuration conf) {
-    LuceneWriter.addFieldOptions(FIELD, LuceneWriter.STORE.YES, LuceneWriter.INDEX.UNTOKENIZED, conf);
-  }
-
   public void setConf(Configuration conf) {
     this.conf = conf;
   }
Index: src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCQueryFilter.java
===================================================================
--- src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCQueryFilter.java	(revision 959954)
+++ src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCQueryFilter.java	(working copy)
@@ -1,42 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.creativecommons.nutch;
-
-import org.apache.nutch.searcher.RawFieldQueryFilter;
-import org.apache.hadoop.conf.Configuration;
-
-/**
- * Handles "cc:" query clauses, causing them to search the "cc" field indexed by
- * CCIndexingFilter.
- */
-public class CCQueryFilter extends RawFieldQueryFilter {
-  private Configuration conf;
-
-  public CCQueryFilter() {
-    super(CCIndexingFilter.FIELD);
-  }
-
-  public void setConf(Configuration conf) {
-    this.conf = conf;
-    setBoost(conf.getFloat("query.cc.boost", 0.0f));
-  }
-
-  public Configuration getConf() {
-    return this.conf;
-  }
-}
Index: src/plugin/creativecommons/plugin.xml
===================================================================
--- src/plugin/creativecommons/plugin.xml	(revision 959954)
+++ src/plugin/creativecommons/plugin.xml	(working copy)
@@ -45,13 +45,4 @@
                       class="org.creativecommons.nutch.CCIndexingFilter"/>
    </extension>
 
-   <extension id="org.creativecommons.nutch.CCQueryFilter"
-              name="Creative Commmons Query Filter"
-              point="org.apache.nutch.searcher.QueryFilter">
-      <implementation id="CCQueryFilter"
-                      class="org.creativecommons.nutch.CCQueryFilter">
-        <parameter name="fields" value="cc"/>
-      </implementation>
-   </extension>
-
 </plugin>
Index: src/plugin/field-basic/src/java/org/apache/nutch/indexer/field/basic/BasicFieldFilter.java
===================================================================
--- src/plugin/field-basic/src/java/org/apache/nutch/indexer/field/basic/BasicFieldFilter.java	(revision 959954)
+++ src/plugin/field-basic/src/java/org/apache/nutch/indexer/field/basic/BasicFieldFilter.java	(working copy)
@@ -1,107 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.indexer.field.basic;
-
-import java.util.List;
-
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
-import org.apache.nutch.indexer.IndexingException;
-import org.apache.nutch.indexer.field.FieldFilter;
-import org.apache.nutch.indexer.field.FieldType;
-import org.apache.nutch.indexer.field.FieldWritable;
-
-/**
- * Adds any field of type content to the index.
- */
-public class BasicFieldFilter
-  implements FieldFilter {
-
-  public static final Log LOG = LogFactory.getLog(BasicFieldFilter.class);
-  private Configuration conf;
-  private boolean supplemental = false;
-  private String[] suppFields = null;
-
-  private boolean isSupplementalField(String name) {
-    for (int i = 0; i < suppFields.length; i++) {
-      if (name != null && name.equals(suppFields[i])) {
-        return true;
-      }
-    }
-    return false;
-  }
-
-  public void setConf(Configuration conf) {
-    this.conf = conf;
-    this.supplemental = conf.getBoolean("index.supplemental", false);
-    String suppStr = conf.get("index.supplemental.fields", null);
-    if (suppStr != null) {
-      suppFields = suppStr.split(",");
-    }
-  }
-
-  public Configuration getConf() {
-    return this.conf;
-  }
-
-  public Document filter(String url, Document doc, List<FieldWritable> fields)
-    throws IndexingException {
-
-    // loop through all of the fields
-    for (FieldWritable field : fields) {
-
-      // only grab content fields
-      FieldType type = field.getType();
-      if (type == FieldType.CONTENT) {
-
-        String fieldName = field.getName();
-        
-        // supplemental index will only index certain key fields, allow creating
-        // both a regular and a supplemental index
-        if (!supplemental || (supplemental && isSupplementalField(fieldName))) {
-
-          // create lucene fields from the FieldWritable objects
-          Field.Store store = field.isStored() ? Field.Store.YES
-            : Field.Store.NO;
-          Field.Index indexed = 
-        	  field.isIndexed() 
-        	  	? field.isTokenized() ? Field.Index.ANALYZED : Field.Index.NOT_ANALYZED 
-        		: Field.Index.NO;
-          Field docField = new Field(fieldName, field.getValue(), store,
-            indexed);
-
-         