--- nutch.bak/src/java/org/apache/nutch/net/RegexUrlNormalizer.java	2006-10-12 06:48:42.000000000 -0700
+++ nutch/src/java/org/apache/nutch/net/RegexUrlNormalizer.java	2006-11-29 10:57:24.000000000 -0800
@@ -18,14 +18,19 @@
 
 import java.net.URL;
 import java.net.MalformedURLException;
+import java.net.URLDecoder;
+import java.io.BufferedReader;
 import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.UnsupportedEncodingException;
 
 import java.util.List;
 import java.util.ArrayList;
 import java.util.Iterator;
+import java.util.HashSet;
 
 import org.apache.hadoop.conf.Configuration;
-
+import org.apache.hadoop.io.UTF8;
 import org.apache.nutch.util.NutchConfiguration;
 
 import javax.xml.parsers.*;
@@ -49,11 +54,15 @@
     /** Class which holds a compiled pattern and its corresponding substition string. */
     private static class Rule {
       public Perl5Pattern pattern;
-      public String substitution;	
+      public String substitution;
+      public HashSet hosts;           // host names this Rule is limited to; null means "apply to all"
+      public boolean unescape;        // true implies "if the pattern changed the URL, URL-decode the result"
     }
     
     private List rules;
     private PatternMatcher matcher = new Perl5Matcher();
+    private URLDecoder decoder = new URLDecoder();  // NOTE(review): URLDecoder.decode() is static, so this instance is not strictly needed
+    private int maxIterations = 1;    // maximum number of passes over the rule list; overwritten in setConf()
     
     /** The default constructor which is called from UrlNormalizerFactory (normalizerClass.newInstance()) in method: getNormalizer()**/
     public RegexUrlNormalizer()  {}
@@ -67,25 +76,61 @@
     
     
     /** This function does the replacements by iterating through all the regex patterns.
-      * It accepts a string url as input and returns the altered string. */
-    public synchronized String regexNormalize(String urlString) {
-      Iterator i=rules.iterator();
-      while(i.hasNext()) {
-        Rule r=(Rule) i.next();
-        urlString = Util.substitute(matcher, r.pattern, 
-          new Perl5Substitution(r.substitution), urlString, Util.SUBSTITUTE_ALL); // actual substitution
-      }
-      return urlString;
-    }
+     * It accepts a string url as input and returns the altered string. The rule list is
+     * applied up to maxIterations times, stopping early once a pass leaves the url unchanged. */
+   public synchronized String regexNormalize(String urlString) {
+     int loop = 0;
+     String origStringPreChain;
+     do {
+       origStringPreChain = urlString;
+       // new iterator per pass: a single shared iterator is exhausted after the first pass
+       Iterator i = rules.iterator();
+       while(i.hasNext()) {
+         String origStringPreRule = urlString;
+         Rule r=(Rule) i.next();
+         if (r.hosts != null) {
+           URL url=null;
+           try {
+             url = new URL(urlString);
+           } catch (MalformedURLException e) {
+             // host cannot be determined; url stays null and the rule is still applied
+           }
+           if (url != null && !r.hosts.contains(url.getHost())) {
+             continue;
+           }
+         }
+         urlString = Util.substitute(matcher, r.pattern,
+             new Perl5Substitution(r.substitution), urlString, Util.SUBSTITUTE_ALL); // actual substitution
+         if (r.unescape && !urlString.equals(origStringPreRule)) { // compare content, not references
+           try {
+             urlString = URLDecoder.decode(urlString, "UTF-8");
+           } catch (UnsupportedEncodingException e) {
+             if (LOG.isErrorEnabled()) {
+               LOG.error("URL Decoding failed - UTF-8 unsupported");
+             }
+           } catch (IllegalArgumentException e) {
+             // malformed %-escape in the url: keep the un-decoded string rather than abort
+           }
+         }
+       }
+     } while (++loop < maxIterations && !origStringPreChain.equals(urlString));
+     return urlString;
+   }
    
     /** Normalizes any URLs by calling super.basicNormalize()
       * and regexSub(). This is the function that gets called
       * elsewhere in Nutch. */
     public synchronized String normalize(String urlString)
       throws MalformedURLException {
+        // Run the superclass normalization first; it is repeated after the regex
+        // rules only when they actually changed the URL.
         urlString = super.normalize(urlString); // run basicNormalize first to ready for regexNormalize
+        // Strings are immutable, so the reference itself serves as the "before" snapshot.
+        String oldUrlString = urlString;
         urlString = regexNormalize(urlString);
-        urlString = super.normalize(urlString); // make sure regexNormalize didn't screw up the URL
+        if (!urlString.equals(oldUrlString)) {
+          urlString = super.normalize(urlString); // make sure regexNormalize didn't screw up the URL
+        }
         return urlString;
   }
   
@@ -121,6 +166,9 @@
         NodeList fields = regex.getChildNodes();
         String patternValue = null;
         String subValue = null;
+        HashSet hosts = null;
+        boolean unescape = false;
+        
         for (int j = 0; j < fields.getLength(); j++) {
           Node fieldNode = fields.item(j);
           if (!(fieldNode instanceof Element))
@@ -130,6 +178,19 @@
             patternValue = ((Text)field.getFirstChild()).getData();
           if ("substitution".equals(field.getTagName()) && field.hasChildNodes())
             subValue = ((Text)field.getFirstChild()).getData();
+          if ("host".equals(field.getTagName()) && field.hasChildNodes()) {
+            if (hosts == null) {
+              hosts = new HashSet();
+            }
+            hosts.add(((Text)field.getFirstChild()).getData());
+          }
+          if ("options".equals(field.getTagName()) && field.hasChildNodes()) {
+            String option = (((Text)field.getFirstChild()).getData()).trim();
+            
+            if ("unescape".equals(option)) {
+              unescape = true;
+            }
+          }
           if (!field.hasChildNodes())
             subValue = "";
         }
@@ -137,6 +198,8 @@
           Rule rule=new Rule();
           rule.pattern=(Perl5Pattern) compiler.compile(patternValue);
           rule.substitution=subValue;
+          rule.hosts = hosts;
+          rule.unescape = unescape;
           rules.add(rule);
         }
       }
@@ -152,6 +215,7 @@
   public void setConf(Configuration conf) {
     super.setConf(conf);
     // the default constructor was called
+    maxIterations = getConf().getInt("urlnormalizer.regex.maxiterations",1);
     if (this.rules == null) {
       String filename = getConf().get("urlnormalizer.regex.file");
       URL url = getConf().getResource(filename);
@@ -168,16 +232,16 @@
 
   }
     
-  /** Spits out patterns and substitutions that are in the configuration file. */
+  /** Normalize the URLs from stdin; useful for testing */
   public static void main(String args[])
     throws MalformedPatternException, IOException {
       RegexUrlNormalizer normalizer = new RegexUrlNormalizer();
       normalizer.setConf(NutchConfiguration.create());
-      Iterator i=normalizer.rules.iterator();
-      while(i.hasNext()) {
-        Rule r=(Rule) i.next();
-        System.out.print(r.pattern.getPattern() + "  ");
-        System.out.println(r.substitution);
+      // Read URLs from stdin, one per line, until end of input and print
+      // the normalized form of each.
+      BufferedReader stdin = new BufferedReader(new InputStreamReader(System.in));
+      for (String url = stdin.readLine(); url != null; url = stdin.readLine()) {
+        System.out.println(normalizer.normalize(url));
       }
     }
   
--- nutch.bak/conf/nutch-default.xml	2006-11-25 15:51:22.000000000 -0800
+++ nutch/conf/nutch-default.xml	2006-11-29 11:27:28.000000000 -0800
@@ -620,6 +620,13 @@
   <value>regex-normalize.xml</value>
   <description>Name of the config file used by the RegexUrlNormalizer class.</description></property>
 
+<property>
+  <name>urlnormalizer.regex.maxiterations</name>
+  <value>1</value>
+  <description>Maximum number of passes the RegexUrlNormalizer makes over its rule list.
+  Another pass is made only if the previous pass changed the input URL.</description>
+</property>
+
 <!-- mime properties -->
 
 <property>
