Index: conf/generate-hostnormalize.xml.template
===================================================================
--- conf/generate-hostnormalize.xml.template	(revision 0)
+++ conf/generate-hostnormalize.xml.template	(revision 0)
@@ -0,0 +1,10 @@
+<?xml version="1.0"?>
+<!-- This is the configuration file for the RegexUrlNormalize Class.
+     This is intended so that users can specify substitutions to be
+     done on URLs. The regex engine that is used is Perl5 compatible.
+     The rules are applied to URLs in the order they occur in this file.  -->
+
+<!-- WATCH OUT: an XML parser reads this file, so ampersands must be
+     expanded to &amp; -->
+<regex-normalize>
+</regex-normalize>
Index: conf/nutch-default.xml
===================================================================
--- conf/nutch-default.xml	(revision 396389)
+++ conf/nutch-default.xml	(working copy)
@@ -308,6 +308,13 @@
   fetchlist.  -1 if unlimited.</description>
 </property>
 
+<property>
+  <name>urlnormalizer.regex.partition.file</name>
+  <value>generate-hostnormalize.xml</value>
+  <description>Name of the file on CLASSPATH containing the regular expressions
+   that the urlnormalizer-regex plugin applies to URLs in the "partition" scope.</description>
+</property>
+
 <!-- fetcher properties -->
 
 <property>
@@ -564,7 +571,7 @@
 
 <property>
   <name>plugin.includes</name>
-  <value>protocol-http|urlfilter-regex|parse-(text|html|js)|index-basic|query-(basic|site|url)</value>
+  <value>protocol-http|urlfilter-regex|urlnormalizer-regex|parse-(text|html|js)|index-basic|query-(basic|site|url)</value>
   <description>Regular expression naming plugin directory names to
   include.  Any plugin not matching this expression is excluded.
   In any case you need at least include the nutch-extensionpoints plugin. By
Index: src/java/org/apache/nutch/net/UrlNormalizer.java
===================================================================
--- src/java/org/apache/nutch/net/UrlNormalizer.java	(revision 396389)
+++ src/java/org/apache/nutch/net/UrlNormalizer.java	(working copy)
@@ -22,6 +22,7 @@
 
 /** Interface used to convert URLs to normal form and optionally do regex substitutions */
 public interface UrlNormalizer extends Configurable {
+  public final static String X_POINT_ID = UrlNormalizer.class.getName();
   
   /* Interface for URL normalization */
   public String normalize(String urlString) throws MalformedURLException;
Index: src/java/org/apache/nutch/net/URLNormalizers.java
===================================================================
--- src/java/org/apache/nutch/net/URLNormalizers.java	(revision 0)
+++ src/java/org/apache/nutch/net/URLNormalizers.java	(revision 0)
@@ -0,0 +1,169 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.net;
+
+import java.net.MalformedURLException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Vector;
+import java.util.logging.Logger;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.util.LogFormatter;
+import org.apache.nutch.plugin.Extension;
+import org.apache.nutch.plugin.ExtensionPoint;
+import org.apache.nutch.plugin.PluginRepository;
+import org.apache.nutch.plugin.PluginRuntimeException;
+
+public final class URLNormalizers {
+
+  public static final Logger LOG =
+      LogFormatter.getLogger(URLNormalizers.class.getName());
+  
+  /** Empty extension list for caching purposes. */
+  private final List EMPTY_EXTENSION_LIST = Collections.EMPTY_LIST;
+  
+  private Configuration conf;
+  private ExtensionPoint extensionPoint;
+  private UrlNormalizer[] normalizers;
+
+  /** Wildcard for default plugins. */
+  public static final String DEFAULT_PLUGIN = "*";
+	
+  /** Resolves the UrlNormalizer extension point and loads the scope's normalizers. */
+  public URLNormalizers(Configuration conf, String scope) {
+    this.conf = conf;
+    ExtensionPoint point =
+        PluginRepository.get(conf).getExtensionPoint(UrlNormalizer.X_POINT_ID);
+    if (point == null) {
+      throw new RuntimeException("x point " + UrlNormalizer.X_POINT_ID + " not found.");
+    }
+    this.extensionPoint = point;
+    normalizers = getURLNormalizers(scope);
+  }
+
+  /**
+   * Returns the {@link UrlNormalizer} instances registered for a given scope.
+   * Instances are cached in the configuration under their extension id and
+   * reused on later calls.
+   *
+   * @param scope The scope to return the {@link UrlNormalizer}s for.
+   * @return An array of {@link UrlNormalizer}s for the given scope (possibly
+   *         empty); extensions that fail to instantiate are logged and skipped.
+   */
+  public UrlNormalizer[] getURLNormalizers(String scope) {
+    List extensions = getExtensions(scope);
+    List normalizers = new ArrayList(extensions.size());
+
+    Iterator it = extensions.iterator();
+    while (it.hasNext()) {
+      Extension ext = (Extension) it.next();
+      try {
+        // check to see if we've cached this normalizer instance yet
+        UrlNormalizer normalizer = (UrlNormalizer) this.conf.getObject(ext.getId());
+        if (normalizer == null) {
+          // go ahead and instantiate it and then cache it
+          normalizer = (UrlNormalizer) ext.getExtensionInstance();
+          this.conf.setObject(ext.getId(), normalizer);
+        }
+        normalizers.add(normalizer);
+      } catch (PluginRuntimeException e) {
+        // Log through LOG with the exception attached rather than printing
+        // the stack trace to stderr, so the failure ends up in the job logs.
+        LOG.log(java.util.logging.Level.WARNING,
+                "UrlNormalizers:PluginRuntimeException when "
+                + "initializing url normalizer plugin "
+                + ext.getDescriptor().getPluginId()
+                + " instance in getURLNormalizers "
+                + "function: attempting to continue instantiating normalizers", e);
+      }
+    }
+    return (UrlNormalizer[]) normalizers.toArray(new UrlNormalizer[normalizers.size()]);
+  }
+    
+  
+  /**
+   * Returns the normalizer extensions registered for a given scope, looking
+   * them up once and caching the result in the configuration.
+   *
+   * @param scope Scope for which we seek normalizer plugins.
+   * @return a list of extensions to be used for this scope.
+   *         If none, returns empty list (never <code>null</code>).
+   */
+  private List getExtensions(String scope) {
+    // Namespace the cache key with the extension point id: a bare scope such
+    // as "partition" could collide with unrelated objects stored in the conf.
+    String cacheKey = UrlNormalizer.X_POINT_ID + "_" + scope;
+    List extensions = (List) this.conf.getObject(cacheKey);
+
+    // Just compare the reference:
+    // if this is the empty list, we know we will find no extension.
+    if (extensions == EMPTY_EXTENSION_LIST) {
+      return EMPTY_EXTENSION_LIST;
+    }
+
+    if (extensions == null) {
+      extensions = findExtensions(scope);
+      if (extensions == null) {
+        // No related extension: cache and return the empty list, never null.
+        extensions = EMPTY_EXTENSION_LIST;
+      }
+      this.conf.setObject(cacheKey, extensions);
+    }
+    return extensions;
+  }
+  
+  /**
+   * Collects the url normalizer extensions that apply to the given scope.
+   *
+   * @param scope Scope for which we seek url normalizer plugins; passing
+   *              {@link #DEFAULT_PLUGIN} selects every registered extension.
+   * @return List - List of extensions to be used for this scope.
+   *                If none, returns an empty list (never null).
+   */
+  private List findExtensions(String scope) {
+    boolean matchAll = DEFAULT_PLUGIN.equals(scope);  // loop-invariant wildcard test
+    Extension[] extensions = this.extensionPoint.getExtensions();
+    List normalizerExtensions = new ArrayList();
+    for (int i = 0; i < extensions.length; i++) {
+      Extension extension = extensions[i];
+      if (matchAll || scope.equals(extension.getAttribute("scope"))) {
+        normalizerExtensions.add(extension);
+      }
+    }
+    return normalizerExtensions;
+  }
+
+	public UrlNormalizer[] getNormalizers() {
+		return normalizers;
+	}
+	
+	public void setNormalizers(UrlNormalizer[] normalizers) {
+		this.normalizers = normalizers;
+	}
+	
+	/** Applies each normalizer in turn; a null input or result short-circuits to null. */
+	public String normalize(String urlString) throws MalformedURLException {
+		String current = urlString;
+		for (int i = 0; i < this.normalizers.length && current != null; i++) {
+			current = this.normalizers[i].normalize(current);
+		}
+		return current;
+	}
+}
Index: src/java/org/apache/nutch/crawl/PartitionUrlByHost.java
===================================================================
--- src/java/org/apache/nutch/crawl/PartitionUrlByHost.java	(revision 396389)
+++ src/java/org/apache/nutch/crawl/PartitionUrlByHost.java	(working copy)
@@ -16,18 +16,26 @@
 
 package org.apache.nutch.crawl;
 
+import java.net.MalformedURLException;
 import java.net.URL;
-import java.net.MalformedURLException;
-
+import java.util.logging.Logger;
+ 
 import org.apache.hadoop.io.*;
 import org.apache.hadoop.mapred.*;
+import org.apache.hadoop.util.LogFormatter;
+import org.apache.nutch.net.URLNormalizers;
 
 /** Partition urls by hostname. */
 public class PartitionUrlByHost implements Partitioner {
   private int seed;
+  private URLNormalizers normalizers;
+  public static final String PARTITION_SCOPE = "partition";
+  public static final Logger LOG =
+      LogFormatter.getLogger("org.apache.nutch.crawl.PartitionUrlByHost");
 
   public void configure(JobConf job) {
     seed = job.getInt("partition.url.by.host.seed", 0);
+    normalizers = new URLNormalizers(job, PARTITION_SCOPE);
   }
   
   public void close() {}
@@ -36,10 +44,19 @@
   public int getPartition(WritableComparable key, Writable value,
                           int numReduceTasks) {
     String urlString = ((UTF8)key).toString();
+    if (normalizers.getNormalizers() != null) {
+      try {
+        String normalized = normalizers.normalize(urlString);
+        if (normalized != null) { urlString = normalized; }  // null would NPE below
+      } catch (MalformedURLException e) {
+        LOG.warning("Malformed URL Exception during PartitionURLByHost: " + e.getMessage());
+      }
+    }
     URL url = null;
     try {
       url = new URL(urlString);
     } catch (MalformedURLException e) {
+      LOG.warning("Malformed URL Exception during PartitionURLByHost: " + e.getMessage());
     }
     int hashCode = (url==null ? urlString : url.getHost()).hashCode();
 
@@ -48,7 +65,4 @@
 
     return (hashCode & Integer.MAX_VALUE) % numReduceTasks;
   }
-  
 }
-
-
Index: src/java/org/apache/nutch/crawl/Generator.java
===================================================================
--- src/java/org/apache/nutch/crawl/Generator.java	(revision 396389)
+++ src/java/org/apache/nutch/crawl/Generator.java	(working copy)
@@ -30,6 +30,7 @@
 
 import org.apache.nutch.net.URLFilterException;
 import org.apache.nutch.net.URLFilters;
+import org.apache.nutch.net.URLNormalizers;
 import org.apache.nutch.util.NutchConfiguration;
 import org.apache.nutch.util.NutchJob;
 
@@ -68,6 +69,7 @@
     private int maxPerHost;
     private Partitioner hostPartitioner = new PartitionUrlByHost();
     private URLFilters filters;
+    private URLNormalizers normalizers;
     private SelectorEntry entry = new SelectorEntry();
     private FloatWritable sortValue = new FloatWritable();
 
@@ -76,6 +78,8 @@
       limit = job.getLong("crawl.topN",Long.MAX_VALUE)/job.getNumReduceTasks();
       maxPerHost = job.getInt("generate.max.per.host", -1);
       filters = new URLFilters(job);
+      hostPartitioner.configure(job);
+      normalizers = new URLNormalizers(job, PartitionUrlByHost.PARTITION_SCOPE);
     }
 
     public void close() {}
@@ -125,9 +129,16 @@
         UTF8 url = entry.url;
 
         if (maxPerHost > 0) {                     // are we counting hosts?
-          String host = new URL(url.toString()).getHost();
+          String host = url.toString();
+          try {
+            // normalize() may return null; fall back to the raw URL then
+            String normalized = normalizers.normalize(host);
+            if (normalized != null) { host = normalized; }
+          } catch (MalformedURLException e) {
+            LOG.warning("Malformed URL Exception during Generator host counting: " + e.getMessage());
+          }
+          host = new URL(host).getHost();
           Integer hostCount = (Integer)hostCounts.get(host);
-
           // increment hostCount
           hostCount = new Integer(hostCount==null ? 1 : hostCount.intValue()+1);
           hostCounts.put(host, hostCount);
@@ -148,9 +159,7 @@
         // maxPerHost may cause us to skip it.
         count++;
       }
-
     }
-
   }
 
   public static class SelectorInverseMapper extends MapReduceBase implements Mapper {
Index: src/plugin/build.xml
===================================================================
--- src/plugin/build.xml	(revision 396389)
+++ src/plugin/build.xml	(working copy)
@@ -24,6 +24,7 @@
      <ant dir="lib-nekohtml" target="deploy"/>
      <ant dir="lib-parsems" target="deploy"/>
      <ant dir="lib-regex-filter" target="deploy"/>
+     <ant dir="lib-regex-normalizer" target="deploy"/>
      <ant dir="lib-xml" target="deploy"/>
      <ant dir="microformats-reltag" target="deploy"/>
      <ant dir="nutch-extensionpoints" target="deploy"/>
@@ -52,6 +53,7 @@
      <ant dir="urlfilter-automaton" target="deploy"/>
      <ant dir="urlfilter-prefix" target="deploy"/>
      <ant dir="urlfilter-regex" target="deploy"/>
+     <ant dir="urlnormalizer-regex" target="deploy"/>  	
   </target>
 
   <!-- ====================================================== -->
@@ -76,6 +78,7 @@
      <ant dir="parse-zip" target="test"/>
      <ant dir="urlfilter-automaton" target="test"/>
      <ant dir="urlfilter-regex" target="test"/>
+     <ant dir="urlnormalizer-regex" target="test"/>    	
     </parallel>
   </target>
 
@@ -98,6 +101,7 @@
     <ant dir="lib-nekohtml" target="clean"/>
     <ant dir="lib-parsems" target="clean"/>
     <ant dir="lib-regex-filter" target="clean"/>
+    <ant dir="lib-regex-normalizer" target="clean"/>
     <ant dir="lib-xml" target="clean"/>
     <ant dir="microformats-reltag" target="clean"/>
     <ant dir="nutch-extensionpoints" target="clean"/>
@@ -126,6 +130,7 @@
     <ant dir="urlfilter-automaton" target="clean"/>
     <ant dir="urlfilter-prefix" target="clean"/>
     <ant dir="urlfilter-regex" target="clean"/>
+    <ant dir="urlnormalizer-regex" target="clean"/>  	
   </target>
 
 </project>
Index: src/plugin/urlnormalizer-regex/sample/regex-url-normalize.xml
===================================================================
--- src/plugin/urlnormalizer-regex/sample/regex-url-normalize.xml	(revision 0)
+++ src/plugin/urlnormalizer-regex/sample/regex-url-normalize.xml	(revision 0)
@@ -0,0 +1,22 @@
+<?xml version="1.0"?>
+<!-- This is the configuration file for the RegexUrlNormalize Class.
+     This is intended so that users can specify substitutions to be
+     done on URLs. The regex engine that is used is Perl5 compatible.
+     The rules are applied to URLs in the order they occur in this file.  -->
+
+<!-- WATCH OUT: an XML parser reads this file, so ampersands must be
+     expanded to &amp; -->
+
+<!-- The following rules show how to strip out session IDs 
+     that are 32 characters long and have the parameter 
+     name of PHPSESSID. Order does matter!  -->
+<regex-normalize>
+<regex>
+  <pattern>(\?|\&amp;|\&amp;amp;)PHPSESSID=[a-zA-Z0-9]{32}$</pattern>
+  <substitution></substitution>
+</regex>
+<regex>
+  <pattern>(\?|\&amp;|\&amp;amp;)PHPSESSID=[a-zA-Z0-9]{32}(\&amp;|\&amp;amp;)(.*)</pattern>
+  <substitution>$1$3</substitution>
+</regex>
+</regex-normalize>
Index: src/plugin/urlnormalizer-regex/sample/regex-url-normalize.urls
===================================================================
--- src/plugin/urlnormalizer-regex/sample/regex-url-normalize.urls	(revision 0)
+++ src/plugin/urlnormalizer-regex/sample/regex-url-normalize.urls	(revision 0)
@@ -0,0 +1 @@
+http://www.hostip.info/content?PHPSESSID=10001232100012321000123210001232 http://www.hostip.info/content
\ No newline at end of file
Index: src/plugin/urlnormalizer-regex/src/test/org/apache/nutch/urlnormalizer/regex/partition/TestRegexPartitionURLNormalizer.java
===================================================================
--- src/plugin/urlnormalizer-regex/src/test/org/apache/nutch/urlnormalizer/regex/partition/TestRegexPartitionURLNormalizer.java	(revision 0)
+++ src/plugin/urlnormalizer-regex/src/test/org/apache/nutch/urlnormalizer/regex/partition/TestRegexPartitionURLNormalizer.java	(revision 0)
@@ -0,0 +1,60 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.urlnormalizer.regex.partition;
+
+// JDK imports
+import java.io.IOException;
+
+import junit.framework.Test;
+import junit.framework.TestSuite;
+import junit.textui.TestRunner;
+
+import org.apache.nutch.net.UrlNormalizer;
+import org.apache.nutch.urlnormalizer.api.RegexURLNormalizerBaseTest;
+import org.apache.oro.text.regex.MalformedPatternException;
+
+
+/**
+ * JUnit based test of class <code>RegexPartitionURLNormalizer</code>, run
+ * against the "regex-url-normalize" sample rules and URLs.
+ */
+public class TestRegexPartitionURLNormalizer extends RegexURLNormalizerBaseTest{
+  
+  public TestRegexPartitionURLNormalizer(String testName) {
+    super(testName);
+  }
+  /** Builds a suite from this test class. */
+  public static Test suite() {
+    return new TestSuite(TestRegexPartitionURLNormalizer.class);
+  }
+  /** Runs the suite with the JUnit text runner. */
+  public static void main(String[] args) {
+    TestRunner.run(suite());
+  }
+  /** Creates the normalizer under test from a rules file; an IOException fails the test. */
+  protected UrlNormalizer getURLNormalizer(String rulesFileName) throws MalformedPatternException {
+    try {
+      return new RegexPartitionURLNormalizer(rulesFileName);
+    } catch (IOException e) {
+      fail(e.toString());
+      return null;
+    }
+  }
+  /** Runs the inherited file-based test on the "regex-url-normalize" sample. */
+  public void test() {
+    test("regex-url-normalize");
+  }
+}
Index: src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/urlnormalizer/regex/partition/RegexPartitionURLNormalizer.java
===================================================================
--- src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/urlnormalizer/regex/partition/RegexPartitionURLNormalizer.java	(revision 0)
+++ src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/urlnormalizer/regex/partition/RegexPartitionURLNormalizer.java	(revision 0)
@@ -0,0 +1,48 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.urlnormalizer.regex.partition;
+
+import java.io.IOException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.urlnormalizer.api.RegexURLNormalizerBase;
+import org.apache.oro.text.regex.MalformedPatternException;
+
+/** Allows users to do regex substitutions on all/any URLs that are encountered during partitioning.
+ *
+ * <p>The <tt>urlnormalizer.regex.partition.file</tt>
+ * property should also be set to the file name of an XML file which should contain the patterns
+ * and substitutions to be done on encountered URLs.</p>
+ *
+ */
+public class RegexPartitionURLNormalizer extends RegexURLNormalizerBase {
+
+    /** Default constructor; presumably the one used when the plugin system instantiates this extension (URLNormalizers calls getExtensionInstance()). */
+    public RegexPartitionURLNormalizer()  {
+    	super();
+    }
+    
+    /** Constructor which can be passed the file name, so it doesn't look in the configuration files for it. */
+    public RegexPartitionURLNormalizer(String filename)throws IOException, MalformedPatternException {
+    	super(filename);
+    }
+    
+    /** Returns the rules file name configured under <tt>urlnormalizer.regex.partition.file</tt>. */
+    protected String getRulesFileName(Configuration conf) {
+    	return conf.get("urlnormalizer.regex.partition.file");
+    }
+}
\ No newline at end of file
Index: src/plugin/urlnormalizer-regex/plugin.xml
===================================================================
--- src/plugin/urlnormalizer-regex/plugin.xml	(revision 0)
+++ src/plugin/urlnormalizer-regex/plugin.xml	(revision 0)
@@ -0,0 +1,28 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<plugin
+   id="urlnormalizer-regex"
+   name="Regex URL Normalizer"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+   <runtime>
+      <library name="urlnormalizer-regex.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+      <import plugin="lib-regex-normalizer"/>
+   </requires>
+
+   <extension id="org.apache.nutch.net.urlnormalizer"
+              name="Nutch Regex URL Normalizer"
+              point="org.apache.nutch.net.UrlNormalizer">
+      <implementation id="RegexPartitionURLNormalizer"
+                      class="org.apache.nutch.urlnormalizer.regex.partition.RegexPartitionURLNormalizer">
+        <parameter name="scope" value="partition"/>
+      </implementation>
+   </extension>
+
+</plugin>
Index: src/plugin/urlnormalizer-regex/build.xml
===================================================================
--- src/plugin/urlnormalizer-regex/build.xml	(revision 0)
+++ src/plugin/urlnormalizer-regex/build.xml	(revision 0)
@@ -0,0 +1,32 @@
+<?xml version="1.0"?>
+
+<project name="urlnormalizer-regex" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+  <!-- Build compilation dependencies -->
+  <target name="deps-jar">
+    <ant target="jar" inheritall="false" dir="../lib-regex-normalizer"/>
+    <ant target="compile-test" inheritall="false" dir="../lib-regex-normalizer"/>
+  </target>
+
+  <!-- Add compilation dependencies to classpath -->
+  <path id="plugin.deps">
+    <fileset dir="${nutch.root}/build">
+      <include name="**/lib-regex-normalizer/*.jar" />
+    </fileset>
+    <pathelement location="${nutch.root}/build/lib-regex-normalizer/test"/>
+  </path>
+
+  <!-- Deploy Unit test dependencies -->
+  <target name="deps-test">
+    <ant target="deploy" inheritall="false" dir="../lib-regex-normalizer"/>
+  </target>
+
+  <!-- for junit test -->
+  <mkdir dir="${build.test}/data"/>
+  <copy todir="${build.test}/data">
+    <fileset dir="sample" includes="**/*.xml, **/*.urls"/>
+  </copy>
+
+</project>
Index: src/plugin/nutch-extensionpoints/plugin.xml
===================================================================
--- src/plugin/nutch-extensionpoints/plugin.xml	(revision 396389)
+++ src/plugin/nutch-extensionpoints/plugin.xml	(working copy)
@@ -45,4 +45,8 @@
       id="org.apache.nutch.analysis.NutchAnalyzer"
       name="Nutch Analysis"/>
 
+<extension-point
+      id="org.apache.nutch.net.UrlNormalizer"
+      name="Nutch URL Normalizer"/>
+
 </plugin>
Index: src/plugin/lib-regex-normalizer/src/test/org/apache/nutch/urlnormalizer/api/RegexURLNormalizerBaseTest.java
===================================================================
--- src/plugin/lib-regex-normalizer/src/test/org/apache/nutch/urlnormalizer/api/RegexURLNormalizerBaseTest.java	(revision 0)
+++ src/plugin/lib-regex-normalizer/src/test/org/apache/nutch/urlnormalizer/api/RegexURLNormalizerBaseTest.java	(revision 0)
@@ -0,0 +1,133 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.urlnormalizer.api;
+
+// JDK imports
+import java.io.BufferedReader;
+import java.io.FileReader;
+import java.io.IOException;
+import java.io.Reader;
+import java.net.MalformedURLException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.logging.Logger;
+
+// JUnit imports
+import junit.framework.TestCase;
+
+// Hadoop imports
+import org.apache.hadoop.util.LogFormatter;
+
+// Nutch imports
+import org.apache.nutch.net.UrlNormalizer;
+import org.apache.oro.text.regex.MalformedPatternException;
+
+
+/**
+ * JUnit based test of class <code>RegexURLNormalizerBase</code>.
+ *
+ */
+public abstract class RegexURLNormalizerBaseTest extends TestCase {
+  
+  /** My logger */
+  protected static final Logger LOG =
+    LogFormatter.getLogger(RegexURLNormalizerBaseTest.class.getName());  
+
+  private final static String SEPARATOR = System.getProperty("file.separator");  
+  private final static String SAMPLES = System.getProperty("test.data", ".");
+  
+  public RegexURLNormalizerBaseTest(String testName) {
+    super(testName);
+  }
+  
+  protected abstract UrlNormalizer getURLNormalizer(String filename) throws MalformedPatternException;
+
+  protected void bench(int loops, String file) {
+    try {
+      bench(loops,
+            SAMPLES + SEPARATOR + file + ".xml",
+            new FileReader(SAMPLES + SEPARATOR + file + ".urls"));
+    } catch (Exception e) {
+      fail(e.toString());
+    }
+  }
+  
+  protected void bench(int loops, String rulesFileName, Reader urls) {
+    long start = System.currentTimeMillis();
+    try {
+      UrlNormalizer normalizer = getURLNormalizer(rulesFileName);
+      NormalizedURL[] expected = readURLFile(urls);
+      for (int i=0; i<loops; i++) {
+        test(normalizer, expected);
+      }
+    } catch (Exception e) {
+      fail(e.toString());
+    }
+    LOG.info("bench time (" + loops + ") " +
+             (System.currentTimeMillis()-start) + "ms");
+  }
+  
+  protected void test(String file) {
+    try {
+      test(SAMPLES + SEPARATOR + file + ".xml",
+           new FileReader(SAMPLES + SEPARATOR + file + ".urls"));
+    } catch (Exception e) {
+      fail(e.toString());
+    }
+  }
+  
+  protected void test(String rulesFileName, Reader urls) {
+    try {
+      test(getURLNormalizer(rulesFileName), readURLFile(urls));
+    } catch (Exception e) {
+      fail(e.toString());
+    }
+  }
+  
+  protected void test(UrlNormalizer normalizer, NormalizedURL[] urls) throws MalformedURLException {
+    for (int i=0; i<urls.length; i++) {
+      String result = normalizer.normalize(urls[i].url);
+      assertEquals(urls[i].expectedURL, result);
+    }
+  }
+  
+  private static NormalizedURL[] readURLFile(Reader reader) throws IOException {
+    BufferedReader in = new BufferedReader(reader);
+    List list = new ArrayList();
+    String line;
+    while((line=in.readLine()) != null) {
+      if (line.length() != 0) {
+        list.add(new NormalizedURL(line));
+      }
+    }
+    return (NormalizedURL[]) list.toArray(new NormalizedURL[list.size()]);
+  }  
+  
+  /** Holds one line of a sample <code>.urls</code> file: the URL to be
+   *  normalized and the expected result, separated by a space. */
+  private static class NormalizedURL {
+    String url;
+    String expectedURL;
+    NormalizedURL(String line) {
+      int sep = line.indexOf(' ');
+      if (sep < 0) {  // fail with a readable message instead of an index error
+        throw new IllegalArgumentException("No expected URL on line: " + line);
+      }
+      url = line.substring(0, sep).trim();
+      expectedURL = line.substring(sep).trim();
+    }
+  }
+}
Index: src/plugin/lib-regex-normalizer/src/java/org/apache/nutch/urlnormalizer/api/RegexURLNormalizerRule.java
===================================================================
--- src/plugin/lib-regex-normalizer/src/java/org/apache/nutch/urlnormalizer/api/RegexURLNormalizerRule.java	(revision 0)
+++ src/plugin/lib-regex-normalizer/src/java/org/apache/nutch/urlnormalizer/api/RegexURLNormalizerRule.java	(revision 0)
@@ -0,0 +1,10 @@
+package org.apache.nutch.urlnormalizer.api;
+
+import org.apache.oro.text.regex.Perl5Pattern;
+import org.apache.oro.text.regex.Perl5Substitution;
+
+public class RegexURLNormalizerRule {
+    public Perl5Pattern pattern;
+    public Perl5Substitution substitution;
+
+}
Index: src/plugin/lib-regex-normalizer/src/java/org/apache/nutch/urlnormalizer/api/RegexURLNormalizerBase.java
===================================================================
--- src/plugin/lib-regex-normalizer/src/java/org/apache/nutch/urlnormalizer/api/RegexURLNormalizerBase.java	(revision 0)
+++ src/plugin/lib-regex-normalizer/src/java/org/apache/nutch/urlnormalizer/api/RegexURLNormalizerBase.java	(revision 0)
@@ -0,0 +1,277 @@
+
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.urlnormalizer.api;
+
+import java.io.IOException;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+import java.util.logging.Logger;
+
+import javax.xml.parsers.DocumentBuilderFactory;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.util.LogFormatter;
+import org.apache.nutch.net.UrlNormalizer;
+import org.apache.oro.text.regex.MalformedPatternException;
+import org.apache.oro.text.regex.PatternMatcher;
+import org.apache.oro.text.regex.Perl5Compiler;
+import org.apache.oro.text.regex.Perl5Matcher;
+import org.apache.oro.text.regex.Perl5Pattern;
+import org.apache.oro.text.regex.Perl5Substitution;
+import org.apache.oro.text.regex.Util;
+import org.w3c.dom.Document;
+import org.w3c.dom.Element;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+import org.w3c.dom.Text;
+
+/** Converts URLs to a normal form. */
+public abstract class RegexURLNormalizerBase implements UrlNormalizer {
+    public static final Logger LOG =
+            LogFormatter.getLogger("org.apache.nutch.urlnormalizer.RegexURLNormalizerBase");
+
+    private Perl5Compiler compiler = new Perl5Compiler();  // compiles the built-in rules in the no-arg constructor
+    private List rules;  // RegexURLNormalizerRule entries; null until the filename constructor or setConf() loads them
+    private PatternMatcher matcher = new Perl5Matcher();  // shared matcher used by regexNormalize()
+    
+    private ThreadLocal matchers = new ThreadLocal() {  // one Perl5Matcher per thread, used by substituteUnnecessaryRelativePaths()
+        protected synchronized Object initialValue() {
+          return new Perl5Matcher();
+        }
+      };
+    private RegexURLNormalizerRule relativePathRule = null;  // built-in "/xx/../" rule, set in the no-arg constructor
+    private RegexURLNormalizerRule leadingRelativePathRule = null;  // built-in leading "/../" rule, set in the no-arg constructor
+
+    private Configuration conf;  // stored by setConf(), returned by getConf()
+
+    /** Compiles the two built-in rules used by substituteUnnecessaryRelativePaths(). */
+    public RegexURLNormalizerBase() {
+      try {
+        // this pattern tries to find spots like "/xx/../" in the url, which
+        // could be replaced by "/"; xx consists of chars different than "/"
+        // (slash) and needs to have at least one char different from "."
+        relativePathRule = new RegexURLNormalizerRule();
+        relativePathRule.pattern = (Perl5Pattern)
+          compiler.compile("(/[^/]*[^/.]{1}[^/]*/\\.\\./)", Perl5Compiler.READ_ONLY_MASK);
+        relativePathRule.substitution = new Perl5Substitution("/");
+
+        // this pattern tries to find spots like leading "/../" in the url,
+        // which could be replaced by "/"
+        leadingRelativePathRule = new RegexURLNormalizerRule();
+        leadingRelativePathRule.pattern = (Perl5Pattern)
+          compiler.compile("^(/\\.\\./)+", Perl5Compiler.READ_ONLY_MASK);
+        leadingRelativePathRule.substitution = new Perl5Substitution("/");
+      } catch (MalformedPatternException e) {
+        // the patterns are constants: rethrow with the cause, no extra stderr trace
+        throw new RuntimeException(e);
+      }
+    }
+
+    public RegexURLNormalizerBase(String filename) throws IOException, MalformedPatternException {
+    	this();  // compile the built-in relative-path rules first
+	    rules = readConfigurationFile(filename);  // non-null rules make setConf() skip its own loading
+    }
+
+    /** Trims the input and, for http and ftp URLs, lowercases the host, drops the
+     *  default port and the ref, uses "/" for an empty file part and collapses
+     *  unnecessary "/../" segments. Returns the trimmed input if nothing changed.
+     *  @throws MalformedURLException if the trimmed input is non-empty and unparsable */
+    private String basicNormalize(String urlString) throws MalformedURLException {
+        urlString = urlString.trim();                 // remove extra spaces
+        if ("".equals(urlString))                     // permit empty, also after trimming
+            return urlString;
+        URL url = new URL(urlString);
+        String protocol = url.getProtocol();
+        String host = url.getHost();
+        int port = url.getPort();
+        String file = url.getFile();
+        boolean changed = !urlString.startsWith(protocol);   // protocol was lowercased
+        if ("http".equals(protocol) || "ftp".equals(protocol)) {
+            if (host != null) {
+                // fixed locale: the default locale may lowercase 'I' to a dotless i
+                String newHost = host.toLowerCase(java.util.Locale.ENGLISH);
+                if (!host.equals(newHost)) {
+                    host = newHost;
+                    changed = true;
+                }
+            }
+            if (port == url.getDefaultPort()) {       // uses default port
+                port = -1;                              // so don't specify it
+                changed = true;
+            }
+            if (file == null || "".equals(file)) {    // add a slash
+                file = "/";
+                changed = true;
+            }
+            if (url.getRef() != null)                   // remove the ref
+                changed = true;
+            // check for unnecessary use of "/../"
+            String file2 = substituteUnnecessaryRelativePaths(file);
+            if (!file.equals(file2)) {
+                changed = true;
+                file = file2;
+            }
+        }
+        if (changed) {
+            urlString = new URL(protocol, host, port, file).toString();
+        }
+        return urlString;
+    }
+
+    /**
+     * Removes unnecessary relative path segments from the file part of a URL.
+     *
+     * Substitutions are applied one step at a time so that nested
+     * constellations collapse as well. For example
+     *   "/aa/bb/../../cc/../foo.html"
+     * is rewritten to
+     *   "/aa/../cc/../foo.html", then
+     *   "/cc/../foo.html", and finally
+     *   "/foo.html".
+     *
+     * A leading "/../" is replaced by "/" too, because it is rather a sign of
+     * bad webserver configuration than of a wanted link; a url such as
+     * "http://www.foo.com/../" should return a http 404 error instead of
+     * redirecting to "http://www.foo.com".
+     *
+     * @param file the file part of a URL
+     * @return the file part once a pass no longer changes its length
+     */
+    private String substituteUnnecessaryRelativePaths(String file) {
+        Perl5Matcher threadMatcher = (Perl5Matcher) matchers.get();
+        String path = file;
+        int lengthBefore;
+
+        // repeat until a full pass leaves the length unchanged
+        do {
+            lengthBefore = path.length();
+            // collapse the first "/xx/../" into "/"
+            path = Util.substitute(threadMatcher, relativePathRule.pattern,
+                                   relativePathRule.substitution, path, 1);
+            // strip leading "/../"
+            path = Util.substitute(threadMatcher, leadingRelativePathRule.pattern,
+                                   leadingRelativePathRule.substitution, path, 1);
+        } while (path.length() != lengthBefore);
+        return path;
+    }
+
+    /** Returns the name of the rules file; setConf() looks it up with getConf().getResource(). */
+    protected abstract String getRulesFileName(Configuration conf);
+
+	public Configuration getConf() {  // returns the Configuration stored by setConf(); null before that
+		return this.conf;
+	}
+
+	/** Stores conf; if no rules are loaded yet, reads the file named by getRulesFileName(). */
+	public void setConf(Configuration conf) {
+		this.conf = conf;
+		if (this.rules == null) {           // the default constructor was called
+			String filename = getRulesFileName(conf);
+			URL url = getConf().getResource(filename);
+			if (url == null)                // fail clearly instead of an NPE on url.toString()
+				throw new RuntimeException("rules file not found: " + filename);
+			try {
+				this.rules = readConfigurationFile(url.toString());
+			} catch (IOException e) {
+				throw new RuntimeException(e.getMessage(), e);
+			} catch (MalformedPatternException e) {
+				throw new RuntimeException(e.getMessage(), e);
+			}
+		}
+	}
+
+	/** Runs the URL through every configured rule in list order, replacing all
+	 * matches of each rule's pattern with its substitution.
+	 * Accepts a string url and returns the rewritten string. */
+	private synchronized String regexNormalize(String urlString) {
+		String rewritten = urlString;
+		for (Iterator it = rules.iterator(); it.hasNext();) {
+			RegexURLNormalizerRule rule = (RegexURLNormalizerRule) it.next();
+			rewritten = Util.substitute(matcher, rule.pattern, rule.substitution,
+					rewritten, Util.SUBSTITUTE_ALL);
+		}
+		return rewritten;
+	}
+  
+	/** Normalizes a URL in three steps: basicNormalize(), then the configured
+	 * regex rules, then basicNormalize() again in case a rule produced a URL
+	 * that is no longer in basic normal form. This is the function that gets
+	 * called elsewhere in Nutch. */
+	public synchronized String normalize(String urlString) throws MalformedURLException {
+		String prepared = basicNormalize(urlString);
+		String rewritten = regexNormalize(prepared);
+		return basicNormalize(rewritten);
+	}
+
+	/** Reads the XML rules file and returns its rules as a List of RegexURLNormalizerRule.
+	 * Every child element of the root contributes one rule, taken from its pattern and
+	 * substitution child elements; an empty substitution element means "replace with
+	 * nothing". Errors while parsing or compiling are logged, not thrown, and the rules
+	 * read up to that point are returned. */
+	private List readConfigurationFile(String filename)
+	    throws IOException, MalformedPatternException {
+	    Perl5Compiler ruleCompiler = new Perl5Compiler();   // no longer shadows the field
+	    List parsedRules = new ArrayList();                  // likewise for "rules"
+	    try {
+	      LOG.info("loading " + filename);
+	      // borrowed heavily from code in Configuration.java
+	      Document doc = DocumentBuilderFactory.newInstance().newDocumentBuilder()
+	        .parse(filename);
+	      Element root = doc.getDocumentElement();
+	      if (!"regex-normalize".equals(root.getTagName()))
+	        LOG.severe("bad conf file: top-level element not <regex-normalize>");
+	      NodeList regexes = root.getChildNodes();
+	      for (int i = 0; i < regexes.getLength(); i++) {
+	        Node regexNode = regexes.item(i);
+	        if (!(regexNode instanceof Element))
+	          continue;                         // e.g. whitespace text or comments
+	        Element regex = (Element)regexNode;
+	        if (!"regex".equals(regex.getTagName()))
+	          LOG.warning("bad conf file: element not <regex>");
+	        NodeList fields = regex.getChildNodes();
+	        String patternValue = null;
+	        String subValue = null;
+	        for (int j = 0; j < fields.getLength(); j++) {
+	          Node fieldNode = fields.item(j);
+	          if (!(fieldNode instanceof Element))
+	            continue;
+	          Element field = (Element)fieldNode;
+	          String tag = field.getTagName();
+	          if ("pattern".equals(tag) && field.hasChildNodes()) {
+	            patternValue = ((Text)field.getFirstChild()).getData();
+	          } else if ("substitution".equals(tag)) {
+	            // only an empty <substitution> yields ""; other empty elements leave it alone
+	            subValue = field.hasChildNodes() ? ((Text)field.getFirstChild()).getData() : "";
+	          }
+	        }
+	        if (patternValue != null && subValue != null) {
+	          RegexURLNormalizerRule rule = new RegexURLNormalizerRule();
+	          rule.pattern = (Perl5Pattern) ruleCompiler.compile(patternValue);
+	          rule.substitution = new Perl5Substitution(subValue);
+	          parsedRules.add(rule);
+	        }
+	      }
+	    } catch (Exception e) {
+	      LOG.severe("error parsing " + filename +" conf file: " + e);
+	    }
+	    return parsedRules;
+	  }
+}
\ No newline at end of file
Index: src/plugin/lib-regex-normalizer/plugin.xml
===================================================================
--- src/plugin/lib-regex-normalizer/plugin.xml	(revision 0)
+++ src/plugin/lib-regex-normalizer/plugin.xml	(revision 0)
@@ -0,0 +1,17 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ ! A common framework for RegExp based URL normalizers
+ !-->
+<plugin
+   id="lib-regex-normalizer"
+   name="Regex URL Normalizer Framework"
+   version="1.0"
+   provider-name="org.apache.nutch">
+
+   <runtime>
+     <library name="lib-regex-normalizer.jar">
+        <export name="*"/>
+     </library>
+   </runtime>
+
+</plugin>
Index: src/plugin/lib-regex-normalizer/build.xml
===================================================================
--- src/plugin/lib-regex-normalizer/build.xml	(revision 0)
+++ src/plugin/lib-regex-normalizer/build.xml	(revision 0)
@@ -0,0 +1,7 @@
+<?xml version="1.0"?>
+
+<project name="lib-regex-normalizer" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+</project>