Index: src/java/org/apache/nutch/parse/ParseOutputFormat.java
===================================================================
--- src/java/org/apache/nutch/parse/ParseOutputFormat.java	(revision 1206118)
+++ src/java/org/apache/nutch/parse/ParseOutputFormat.java	(working copy)
@@ -21,12 +21,14 @@
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import org.apache.hadoop.conf.*;
 import org.apache.hadoop.io.*;
 import org.apache.hadoop.io.SequenceFile.CompressionType;
+import org.apache.hadoop.fs.*;
+import org.apache.hadoop.mapred.*;
+import org.apache.hadoop.util.StringUtils;
 import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.fetcher.Fetcher;
-import org.apache.hadoop.fs.*;
-import org.apache.hadoop.mapred.*;
 import org.apache.nutch.scoring.ScoringFilterException;
 import org.apache.nutch.scoring.ScoringFilters;
 import org.apache.nutch.util.StringUtil;
@@ -50,7 +52,7 @@
   private URLFilters filters;
   private URLNormalizers normalizers;
   private ScoringFilters scfilters;
-  
+
   private static class SimpleEntry implements Entry<Text, CrawlDatum> {
     private Text key;
     private CrawlDatum value;
@@ -90,12 +92,14 @@
   public RecordWriter<Text, Parse> getRecordWriter(FileSystem fs, JobConf job,
                                       String name, Progressable progress) throws IOException {
 
-    this.filters = new URLFilters(job);
-    this.normalizers = new URLNormalizers(job, URLNormalizers.SCOPE_OUTLINK);
+    filters = new URLFilters(job);
+    normalizers = new URLNormalizers(job, URLNormalizers.SCOPE_OUTLINK);
+
     this.scfilters = new ScoringFilters(job);
     final int interval = job.getInt("db.fetch.interval.default", 2592000);
     final boolean ignoreExternalLinks = job.getBoolean("db.ignore.external.links", false);
     int maxOutlinksPerPage = job.getInt("db.max.outlinks.per.page", 100);
+    final boolean isParsing = job.getBoolean("fetcher.parse", true);
     final int maxOutlinks = (maxOutlinksPerPage < 0) ? Integer.MAX_VALUE
                                                      : maxOutlinksPerPage;
     final CompressionType compType = SequenceFileOutputFormat.getOutputCompressionType(job);
@@ -127,7 +131,6 @@
           
           String fromUrl = key.toString();
           String fromHost = null; 
-          String toHost = null;          
           textOut.append(key, new ParseText(parse.getText()));
           
           ParseData parseData = parse.getData();
@@ -191,15 +194,15 @@
           // collect outlinks for subsequent db update
           Outlink[] links = parseData.getOutlinks();
           int outlinksToStore = Math.min(maxOutlinks, links.length);
-          if (ignoreExternalLinks) {
-            try {
-              fromHost = new URL(fromUrl).getHost().toLowerCase();
-            } catch (MalformedURLException e) {
-              fromHost = null;
-            }
-          } else {
-            fromHost = null;
-          }
+//           if (ignoreExternalLinks) {
+//             try {
+//               fromHost = new URL(fromUrl).getHost().toLowerCase();
+//             } catch (MalformedURLException e) {
+//               fromHost = null;
+//             }
+//           } else {
+//             fromHost = null;
+//           }
 
           int validCount = 0;
           CrawlDatum adjust = null;
@@ -207,40 +210,25 @@
           List<Outlink> outlinkList = new ArrayList<Outlink>(outlinksToStore);
           for (int i = 0; i < links.length && validCount < outlinksToStore; i++) {
             String toUrl = links[i].getToUrl();
-            // ignore links to self (or anchors within the page)
-            if (fromUrl.equals(toUrl)) {
-              continue;
-            }
-            if (ignoreExternalLinks) {
-              try {
-                toHost = new URL(toUrl).getHost().toLowerCase();
-              } catch (MalformedURLException e) {
-                toHost = null;
-              }
-              if (toHost == null || !toHost.equals(fromHost)) { // external links
-                continue; // skip it
-              }
-            }
-            try {
-              toUrl = normalizers.normalize(toUrl,
-                          URLNormalizers.SCOPE_OUTLINK); // normalize the url
-              toUrl = filters.filter(toUrl);   // filter the url
+
+            // Only normalize and filter if fetcher.parse = false
+            if (!isParsing) {
+              toUrl = ParseOutputFormat.filterNormalize(fromUrl, toUrl, fromHost, ignoreExternalLinks, filters, normalizers);
               if (toUrl == null) {
                 continue;
               }
-            } catch (Exception e) {
-              continue;
             }
+
             CrawlDatum target = new CrawlDatum(CrawlDatum.STATUS_LINKED, interval);
             Text targetUrl = new Text(toUrl);
             try {
               scfilters.initialScore(targetUrl, target);
             } catch (ScoringFilterException e) {
               LOG.warn("Cannot filter init score for url " + key +
-                       ", using default: " + e.getMessage());
+                      ", using default: " + e.getMessage());
               target.setScore(0.0f);
             }
-            
+
             targets.add(new SimpleEntry(targetUrl, target));
 
             // OVerwrite URL in Outlink object with normalized URL (NUTCH-1174)
@@ -248,6 +236,7 @@
             outlinkList.add(links[i]);
             validCount++;
           }
+
           try {
             // compute score contributions and adjustment to the original score
             adjust = scfilters.distributeScoreToOutlinks((Text)key, parseData, 
@@ -289,4 +278,34 @@
     
   }
 
+  /**
+   * Returns the normalized and filtered form of an outlink, or null when the link is
+   * dropped (self link, external link while ignoreExternalLinks is set, rejected URL).
+   * A null fromHost is derived from fromUrl so same-host links are still recognized.
+   */
+  public static String filterNormalize(String fromUrl, String toUrl, String fromHost, boolean ignoreExternalLinks, URLFilters filters, URLNormalizers normalizers) {
+    // ignore links to self (or anchors within the page)
+    if (fromUrl.equals(toUrl)) {
+      return null;
+    }
+    if (ignoreExternalLinks) {
+      try {
+        if (fromHost == null) { // caller did not resolve the source host
+          fromHost = new URL(fromUrl).getHost().toLowerCase();
+        }
+        if (!fromHost.equals(new URL(toUrl).getHost().toLowerCase())) {
+          return null; // external link: skip it
+        }
+      } catch (MalformedURLException e) {
+        return null; // host of either end is unknown: treat the link as external
+      }
+    }
+    try {
+      // normalize first, then filter; a null result from the filter means "rejected"
+      return filters.filter(normalizers.normalize(toUrl, URLNormalizers.SCOPE_OUTLINK));
+    } catch (Exception e) {
+      return null; // best effort: a failing normalizer or filter drops the link
+    }
+  }
+
 }
