Index: src/test/org/apache/nutch/analysis/TestUrlTokenizer.java
===================================================================
--- src/test/org/apache/nutch/analysis/TestUrlTokenizer.java	(revision 0)
+++ src/test/org/apache/nutch/analysis/TestUrlTokenizer.java	(revision 0)
@@ -0,0 +1,57 @@
+package org.apache.nutch.analysis;
+
+import java.io.IOException;
+import java.io.StringReader;
+
+import junit.framework.TestCase;
+
+/**
+ * Unit test for {@link UrlTokenizer}: checks that urls are split into
+ * their alphanumeric components and that delimiters, empty path
+ * segments and punctuation are discarded.
+ *
+ * @author Enis Soztutar <enis.soz.nutch@gmail.com>
+ */
+public class TestUrlTokenizer extends TestCase {
+
+  /** Sample urls covering fragments, query strings and repeated slashes. */
+  private static final String[] TEST_URLS = {
+    "http://lucene.apache.org/nutch/",
+    "http://www.sigir2007.org/venue.html#test",
+    "http://java.sun.com/-j2se//////1.5.0/docs/api/",
+    "http://www.google.com.tr/firefox?client=firefox-a&rls=org.mozilla:en-US:official"
+  };
+
+  /** Asserts that tokenizing <code>url</code> yields exactly <code>expected</code>. */
+  private static void assertTokens(String url, String[] expected)
+      throws IOException {
+    UrlTokenizer tokenizer = new UrlTokenizer(new StringReader(url));
+    for (int i = 0; i < expected.length; i++) {
+      org.apache.lucene.analysis.Token t = tokenizer.next();
+      assertNotNull("ran out of tokens at '" + expected[i] + "' for " + url, t);
+      assertEquals(expected[i], t.termText());
+    }
+    assertNull("unexpected extra token for " + url, tokenizer.next());
+  }
+
+  public void testNutchUrl() throws IOException {
+    assertTokens(TEST_URLS[0],
+        new String[] { "http", "lucene", "apache", "org", "nutch" });
+  }
+
+  public void testFragmentUrl() throws IOException {
+    assertTokens(TEST_URLS[1], new String[] {
+        "http", "www", "sigir2007", "org", "venue", "html", "test" });
+  }
+
+  public void testRepeatedSlashUrl() throws IOException {
+    assertTokens(TEST_URLS[2], new String[] { "http", "java", "sun", "com",
+        "j2se", "1", "5", "0", "docs", "api" });
+  }
+
+  public void testQueryStringUrl() throws IOException {
+    assertTokens(TEST_URLS[3], new String[] { "http", "www", "google",
+        "com", "tr", "firefox", "client", "firefox", "a", "rls", "org",
+        "mozilla", "en", "US", "official" });
+  }
+}
Index: src/java/org/apache/nutch/analysis/NutchDocumentAnalyzer.java
===================================================================
--- src/java/org/apache/nutch/analysis/NutchDocumentAnalyzer.java	(revision 464931)
+++ src/java/org/apache/nutch/analysis/NutchDocumentAnalyzer.java	(working copy)
@@ -15,15 +15,17 @@
  */
 package org.apache.nutch.analysis;
 
 // JDK imports
 import java.io.Reader;
 import java.io.IOException;
 
 // Lucene imports
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Token;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.conf.Configuration;
 
 /**
@@ -33,76 +35,102 @@
  */
 public class NutchDocumentAnalyzer extends NutchAnalyzer {
 
   /** Analyzer used to index textual content. */
   private static Analyzer CONTENT_ANALYZER;
   // Anchor Analysis
   // Like content analysis, but leave gap between anchors to inhibit
   // cross-anchor phrase matching.
   /**
    * The number of unused term positions between anchors in the anchor field.
    */
   public static final int INTER_ANCHOR_GAP = 4;
   /** Analyzer used to analyze anchors. */
   private static Analyzer ANCHOR_ANALYZER;
+  /** Analyzer used to analyze the url, host and site fields. */
+  private static Analyzer URL_ANALYZER;
   private Configuration conf;
 
+  private static final Log LOG =
+      LogFactory.getLog(NutchDocumentAnalyzer.class);
+
   /**
    * @param conf
    */
   public NutchDocumentAnalyzer(Configuration conf) {
     this.conf = conf;
     CONTENT_ANALYZER = new ContentAnalyzer(conf);
     ANCHOR_ANALYZER = new AnchorAnalyzer();
+    URL_ANALYZER = new UrlAnalyzer();
   }
 
   /** Analyzer used to index textual content. */
   private static class ContentAnalyzer extends Analyzer {
     private CommonGrams commonGrams;
 
     public ContentAnalyzer(Configuration conf) {
       this.commonGrams = new CommonGrams(conf);
     }
 
     /** Constructs a {@link NutchDocumentTokenizer}. */
     public TokenStream tokenStream(String field, Reader reader) {
       return this.commonGrams.getFilter(new NutchDocumentTokenizer(reader),
           field);
     }
   }
 
   private static class AnchorFilter extends TokenFilter {
     private boolean first = true;
 
     public AnchorFilter(TokenStream input) {
       super(input);
     }
 
     public final Token next() throws IOException {
       Token result = input.next();
       if (result == null)
         return result;
       if (first) {
         result.setPositionIncrement(INTER_ANCHOR_GAP);
         first = false;
       }
       return result;
     }
   }
 
   private static class AnchorAnalyzer extends Analyzer {
     public final TokenStream tokenStream(String fieldName, Reader reader) {
       return new AnchorFilter(CONTENT_ANALYZER.tokenStream(fieldName, reader));
     }
   }
 
+  /**
+   * Analyzer for the url, host and site fields.  Public so that other
+   * analyzers can reuse it.
+   */
+  public static class UrlAnalyzer extends Analyzer {
+    public TokenStream tokenStream(String fieldName, Reader reader) {
+      try {
+        return new UrlTokenizer(reader);
+      } catch (IOException e) {
+        // Analyzer.tokenStream cannot declare a checked exception; wrap it
+        // instead of returning null, which would NPE in the indexer.
+        LOG.warn("Cannot tokenize field " + fieldName, e);
+        throw new RuntimeException(e);
+      }
+    }
+  }
+
   /** Returns a new token stream for text from the named field. */
   public TokenStream tokenStream(String fieldName, Reader reader) {
     Analyzer analyzer;
     if ("anchor".equals(fieldName))
       analyzer = ANCHOR_ANALYZER;
+    else if ("url".equals(fieldName) || "site".equals(fieldName)
+        || "host".equals(fieldName))
+      analyzer = URL_ANALYZER;
     else
       analyzer = CONTENT_ANALYZER;
 
     return analyzer.tokenStream(fieldName, reader);
   }
 }
Index: src/java/org/apache/nutch/analysis/UrlTokenizer.java
===================================================================
--- src/java/org/apache/nutch/analysis/UrlTokenizer.java	(revision 0)
+++ src/java/org/apache/nutch/analysis/UrlTokenizer.java	(revision 0)
@@ -0,0 +1,47 @@
+package org.apache.nutch.analysis;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.Reader;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.Tokenizer;
+
+/**
+ * Tokenizes a url into its alphanumeric components: every maximal run of
+ * characters matching [a-zA-Z0-9]+ becomes one {@link Token} whose
+ * offsets are the match positions within the url.  All punctuation
+ * (scheme separators, slashes, query and fragment delimiters, dots,
+ * dashes, ...) is discarded.
+ *
+ * @author Enis Soztutar <enis.soz.nutch@gmail.com>
+ * @see <a href="http://www.gbiv.com/protocols/uri/rfc/rfc3986.html">RFC 3986</a>
+ */
+public class UrlTokenizer extends Tokenizer {
+
+  /** Matches one maximal run of ascii letters and digits. */
+  private static final Pattern TERM = Pattern.compile("[a-zA-Z0-9]+");
+
+  private final Matcher matcher;
+
+  /**
+   * @param input reader positioned at the url; only the first line is read
+   * @throws IOException if the underlying reader fails
+   */
+  UrlTokenizer(Reader input) throws IOException {
+    super(input);
+    String url = new BufferedReader(input).readLine();
+    // An empty reader yields null; tokenize the empty string instead of
+    // failing later with a NullPointerException in Pattern.matcher().
+    matcher = TERM.matcher(url == null ? "" : url);
+  }
+
+  /** Returns the next token of the url, or null when exhausted. */
+  public Token next() throws IOException {
+    if (!matcher.find())
+      return null;
+    return new Token(matcher.group(), matcher.start(), matcher.end());
+  }
+}
