Index: conf/nutch-default.xml
===================================================================
--- conf/nutch-default.xml	(revision 1693239)
+++ conf/nutch-default.xml	(working copy)
@@ -550,6 +550,16 @@
   <value>false</value>
   <description>If true, outlinks leading from a page to external hosts
   will be ignored. This is an effective way to limit the crawl to include
-  only initially injected hosts, without creating complex URLFilters.
+  only initially injected hosts, without creating complex URLFilters.
+  This parameter takes precedence over db.ignore.external.links.domain.
   </description>
 </property>
+
+<property>
+  <name>db.ignore.external.links.domain</name>
+  <value>false</value>
+  <description>If true, outlinks leading from a page to external domains
+  will be ignored. This is an effective way to limit the crawl to include
+  only initially injected domains, without creating complex URLFilters.
+  </description>
+</property>
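
The two settings differ in what they compare: db.ignore.external.links compares
hosts, while the new db.ignore.external.links.domain compares registered domains
via the existing URLUtil helper. A minimal sketch of the distinction (the printed
domain value assumes the default domain-suffixes data shipped with Nutch):

    import java.net.URL;
    import org.apache.nutch.util.URLUtil;

    public class HostVsDomain {
      public static void main(String[] args) throws Exception {
        String url = "http://news.example.com/story.html";
        // Comparison key under db.ignore.external.links:
        System.out.println(new URL(url).getHost().toLowerCase());     // news.example.com
        // Comparison key under db.ignore.external.links.domain:
        System.out.println(URLUtil.getDomainName(url).toLowerCase()); // example.com
      }
    }

So an outlink from news.example.com to www.example.com is dropped by the host
check but kept by the domain check.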
Index: src/java/org/apache/nutch/fetcher/FetcherThread.java
===================================================================
--- src/java/org/apache/nutch/fetcher/FetcherThread.java	(revision 1693239)
+++ src/java/org/apache/nutch/fetcher/FetcherThread.java	(working copy)
@@ -69,9 +69,10 @@
  * This class picks items from queues and fetches the pages.
  */
 public class FetcherThread extends Thread {
-  
-  private static final Logger LOG = LoggerFactory.getLogger(FetcherThread.class);
 
+  private static final Logger LOG = LoggerFactory
+      .getLogger(FetcherThread.class);
+
   private Configuration conf;
   private URLFilters urlFilters;
   private ScoringFilters scfilters;
@@ -93,6 +94,7 @@
   private int maxOutlinkDepth;
   private int maxOutlinkDepthNumLinks;
   private boolean outlinksIgnoreExternal;
+  private boolean ignoreLinksOutsideDomain;
 
   private int outlinksDepthDivisor;
   private boolean skipTruncated;
@@ -124,15 +126,17 @@
   private AtomicInteger pages;
 
   private AtomicLong bytes;
-  
-  //Used by the REST service
+
+  // Used by the REST service
   private FetchNode fetchNode;
   private boolean reportToNutchServer;
 
-  public FetcherThread(Configuration conf, AtomicInteger activeThreads, FetchItemQueues fetchQueues, 
-      QueueFeeder feeder, AtomicInteger spinWaiting, AtomicLong lastRequestStart, Reporter reporter,
-      AtomicInteger errors, String segmentName, boolean parsing, OutputCollector<Text, NutchWritable> output,
-      boolean storingContent, AtomicInteger pages, AtomicLong bytes) {
+  public FetcherThread(Configuration conf, AtomicInteger activeThreads,
+      FetchItemQueues fetchQueues, QueueFeeder feeder,
+      AtomicInteger spinWaiting, AtomicLong lastRequestStart, Reporter reporter,
+      AtomicInteger errors, String segmentName, boolean parsing,
+      OutputCollector<Text, NutchWritable> output, boolean storingContent,
+      AtomicInteger pages, AtomicLong bytes) {
     this.setDaemon(true); // don't hang JVM on exit
     this.setName("FetcherThread"); // use an informative name
     this.conf = conf;
@@ -156,14 +160,13 @@
     this.storingContent = storingContent;
     this.pages = pages;
     this.bytes = bytes;
-    queueMode = conf.get("fetcher.queue.mode",
-        FetchItemQueues.QUEUE_MODE_HOST);
+    queueMode = conf.get("fetcher.queue.mode", FetchItemQueues.QUEUE_MODE_HOST);
     // check that the mode is known
     if (!queueMode.equals(FetchItemQueues.QUEUE_MODE_IP)
         && !queueMode.equals(FetchItemQueues.QUEUE_MODE_DOMAIN)
         && !queueMode.equals(FetchItemQueues.QUEUE_MODE_HOST)) {
-      LOG.error("Unknown partition mode : " + queueMode
-          + " - forcing to byHost");
+      LOG.error(
+          "Unknown partition mode : " + queueMode + " - forcing to byHost");
       queueMode = FetchItemQueues.QUEUE_MODE_HOST;
     }
     LOG.info("Using queue mode : " + queueMode);
@@ -170,7 +173,8 @@
     this.maxRedirect = conf.getInt("http.redirect.max", 3);
     this.ignoreExternalLinks = conf.getBoolean("db.ignore.external.links",
         false);
-
+    this.ignoreLinksOutsideDomain = conf
+        .getBoolean("db.ignore.external.links.domain", false);
     maxOutlinksPerPage = conf.getInt("db.max.outlinks.per.page", 100);
     maxOutlinks = (maxOutlinksPerPage < 0) ? Integer.MAX_VALUE
         : maxOutlinksPerPage;
@@ -177,12 +181,12 @@
     interval = conf.getInt("db.fetch.interval.default", 2592000);
     ignoreExternalLinks = conf.getBoolean("db.ignore.external.links", false);
     maxOutlinkDepth = conf.getInt("fetcher.follow.outlinks.depth", -1);
-    outlinksIgnoreExternal = conf.getBoolean(
-        "fetcher.follow.outlinks.ignore.external", false);
-    maxOutlinkDepthNumLinks = conf.getInt(
-        "fetcher.follow.outlinks.num.links", 4);
-    outlinksDepthDivisor = conf.getInt(
-        "fetcher.follow.outlinks.depth.divisor", 2);
+    outlinksIgnoreExternal = conf
+        .getBoolean("fetcher.follow.outlinks.ignore.external", false);
+    maxOutlinkDepthNumLinks = conf.getInt("fetcher.follow.outlinks.num.links",
+        4);
+    outlinksDepthDivisor = conf.getInt("fetcher.follow.outlinks.depth.divisor",
+        2);
   }
 
   @SuppressWarnings("fallthrough")
@@ -194,7 +198,7 @@
       // checking for the server to be running and fetcher.parse to be true
       if (parsing && NutchServer.getInstance().isRunning())
         reportToNutchServer = true;
-      
+
       while (true) {
         // creating FetchNode for storing in FetchNodeDb
         if (reportToNutchServer)
@@ -211,7 +215,8 @@
 
         fit = ((FetchItemQueues) fetchQueues).getFetchItem();
         if (fit == null) {
-          if (feeder.isAlive() || ((FetchItemQueues) fetchQueues).getTotalSize() > 0) {
+          if (feeder.isAlive()
+              || ((FetchItemQueues) fetchQueues).getTotalSize() > 0) {
             LOG.debug(getName() + " spin-waiting ...");
             // spin-wait.
             ((AtomicInteger) spinWaiting).incrementAndGet();
@@ -228,8 +233,8 @@
           }
         }
         lastRequestStart.set(System.currentTimeMillis());
-        Text reprUrlWritable = (Text) fit.datum.getMetaData().get(
-            Nutch.WRITABLE_REPR_URL_KEY);
+        Text reprUrlWritable = (Text) fit.datum.getMetaData()
+            .get(Nutch.WRITABLE_REPR_URL_KEY);
         if (reprUrlWritable == null) {
           setReprUrl(fit.url.toString());
         } else {
@@ -242,7 +247,8 @@
           do {
             if (LOG.isInfoEnabled()) {
               LOG.info("fetching " + fit.url + " (queue crawl delay="
-                  + ((FetchItemQueues) fetchQueues).getFetchItemQueue(fit.queueID).crawlDelay
+                  + ((FetchItemQueues) fetchQueues)
+                      .getFetchItemQueue(fit.queueID).crawlDelay
                   + "ms)");
             }
             if (LOG.isDebugEnabled()) {
@@ -249,8 +255,8 @@
               LOG.debug("redirectCount=" + redirectCount);
             }
             redirecting = false;
-            Protocol protocol = this.protocolFactory.getProtocol(fit.url
-                .toString());
+            Protocol protocol = this.protocolFactory
+                .getProtocol(fit.url.toString());
             BaseRobotRules rules = protocol.getRobotRules(fit.url, fit.datum);
             if (!rules.isAllowed(fit.u.toString())) {
               // unblock
@@ -296,7 +302,7 @@
             ((FetchItemQueues) fetchQueues).finishFetchItem(fit);
 
             String urlString = fit.url.toString();
-            
+
             // used for FetchNode
             if (fetchNode != null) {
               fetchNode.setStatus(status.getCode());
@@ -355,8 +361,8 @@
 
             case ProtocolStatus.EXCEPTION:
               logError(fit.url, status.getMessage());
-              int killedURLs = ((FetchItemQueues) fetchQueues).checkExceptionThreshold(fit
-                  .getQueueID());
+              int killedURLs = ((FetchItemQueues) fetchQueues)
+                  .checkExceptionThreshold(fit.getQueueID());
               if (killedURLs != 0)
                 reporter.incrCounter("FetcherStatus",
                     "AboveExceptionThresholdInQueue", killedURLs);
@@ -424,7 +430,7 @@
 
   private Text handleRedirect(Text url, CrawlDatum datum, String urlString,
       String newUrl, boolean temp, String redirType)
-      throws MalformedURLException, URLFilterException {
+          throws MalformedURLException, URLFilterException {
     newUrl = normalizers.normalize(newUrl, URLNormalizers.SCOPE_FETCHER);
     newUrl = urlFilters.filter(newUrl);
 
@@ -434,9 +440,8 @@
         String newHost = new URL(newUrl).getHost().toLowerCase();
         if (!origHost.equals(newHost)) {
           if (LOG.isDebugEnabled()) {
-            LOG.debug(" - ignoring redirect " + redirType + " from "
-                + urlString + " to " + newUrl
-                + " because external links are ignored");
+            LOG.debug(" - ignoring redirect " + redirType + " from " + urlString
+                + " to " + newUrl + " because external links are ignored");
           }
           return null;
         }
@@ -451,8 +456,8 @@
         redirecting = true;
         redirectCount++;
         if (LOG.isDebugEnabled()) {
-          LOG.debug(" - " + redirType + " redirect to " + url
-              + " (fetching now)");
+          LOG.debug(
+              " - " + redirType + " redirect to " + url + " (fetching now)");
         }
         return url;
       } else {
@@ -471,8 +476,8 @@
         }
         output(url, newDatum, null, null, CrawlDatum.STATUS_LINKED);
         if (LOG.isDebugEnabled()) {
-          LOG.debug(" - " + redirType + " redirect to " + url
-              + " (fetching later)");
+          LOG.debug(
+              " - " + redirType + " redirect to " + url + " (fetching later)");
         }
         return null;
       }
@@ -498,13 +503,13 @@
     }
     fit = FetchItem.create(redirUrl, newDatum, queueMode);
     if (fit != null) {
-      FetchItemQueue fiq = ((FetchItemQueues) fetchQueues).getFetchItemQueue(fit.queueID);
+      FetchItemQueue fiq = ((FetchItemQueues) fetchQueues)
+          .getFetchItemQueue(fit.queueID);
       fiq.addInProgressFetchItem(fit);
     } else {
       // stop redirecting
       redirecting = false;
-      reporter.incrCounter("FetcherStatus", "FetchItem.notCreated.redirect",
-          1);
+      reporter.incrCounter("FetcherStatus", "FetchItem.notCreated.redirect", 1);
     }
   }
 
@@ -549,8 +554,8 @@
         }
       }
       /*
-       * Note: Fetcher will only follow meta-redirects coming from the
-       * original URL.
+       * Note: Fetcher will only follow meta-redirects coming from the original
+       * URL.
        */
       if (parsing && status == CrawlDatum.STATUS_FETCH_SUCCESS) {
         if (!skipTruncated
@@ -616,6 +621,7 @@
           }
 
           String fromHost;
+          String fromDomain;
 
           // collect outlinks for subsequent db update
           Outlink[] links = parseData.getOutlinks();
@@ -629,12 +635,23 @@
           } else {
             fromHost = null;
           }
-          
-          //used by fetchNode         
-          if(fetchNode!=null){
+
+          if (ignoreLinksOutsideDomain) {
+            try {
+              fromDomain = URLUtil.getDomainName(url.toString()).toLowerCase();
+            } catch (MalformedURLException e) {
+              fromDomain = null;
+            }
+          } else {
+            fromDomain = null;
+          }
+
+          // used by fetchNode
+          if (fetchNode != null) {
             fetchNode.setOutlinks(links);
             fetchNode.setTitle(parseData.getTitle());
-            FetchNodeDb.getInstance().put(fetchNode.getUrl().toString(), fetchNode);
+            FetchNodeDb.getInstance().put(fetchNode.getUrl().toString(),
+                fetchNode);
           }
           int validCount = 0;
 
@@ -641,11 +658,13 @@
           // Process all outlinks, normalize, filter and deduplicate
           List<Outlink> outlinkList = new ArrayList<Outlink>(outlinksToStore);
           HashSet<String> outlinks = new HashSet<String>(outlinksToStore);
-          for (int i = 0; i < links.length && validCount < outlinksToStore; i++) {
+          for (int i = 0; i < links.length
+              && validCount < outlinksToStore; i++) {
             String toUrl = links[i].getToUrl();
 
             toUrl = ParseOutputFormat.filterNormalize(url.toString(), toUrl,
-                fromHost, ignoreExternalLinks, urlFilters, normalizers);
+                fromHost, fromDomain, ignoreExternalLinks,
+                ignoreLinksOutsideDomain, urlFilters, normalizers);
             if (toUrl == null) {
               continue;
             }
@@ -678,19 +697,18 @@
 
               // Check whether we'll follow external outlinks
               if (outlinksIgnoreExternal) {
-                if (!URLUtil.getHost(url.toString()).equals(
-                    URLUtil.getHost(followUrl))) {
+                if (!URLUtil.getHost(url.toString())
+                    .equals(URLUtil.getHost(followUrl))) {
                   continue;
                 }
               }
 
-              reporter
-                  .incrCounter("FetcherOutlinks", "outlinks_following", 1);
+              reporter.incrCounter("FetcherOutlinks", "outlinks_following", 1);
 
               // Create new FetchItem with depth incremented
               FetchItem fit = FetchItem.create(new Text(followUrl),
-                  new CrawlDatum(CrawlDatum.STATUS_LINKED, interval),
-                  queueMode, outlinkDepth + 1);
+                  new CrawlDatum(CrawlDatum.STATUS_LINKED, interval), queueMode,
+                  outlinkDepth + 1);
               ((FetchItemQueues) fetchQueues).addFetchItem(fit);
 
               outlinkCounter++;
@@ -699,11 +717,12 @@
 
           // Overwrite the outlinks in ParseData with the normalized and
           // filtered set
-          parseData.setOutlinks(outlinkList.toArray(new Outlink[outlinkList
-              .size()]));
+          parseData.setOutlinks(
+              outlinkList.toArray(new Outlink[outlinkList.size()]));
 
-          output.collect(url, new NutchWritable(new ParseImpl(new ParseText(
-              parse.getText()), parseData, parse.isCanonical())));
+          output.collect(url,
+              new NutchWritable(new ParseImpl(new ParseText(parse.getText()),
+                  parseData, parse.isCanonical())));
         }
       }
     } catch (IOException e) {
@@ -716,14 +735,14 @@
     if (parseResult != null && !parseResult.isEmpty()) {
       Parse p = parseResult.get(content.getUrl());
       if (p != null) {
-        reporter.incrCounter("ParserStatus", ParseStatus.majorCodes[p
-            .getData().getStatus().getMajorCode()], 1);
+        reporter.incrCounter("ParserStatus",
+            ParseStatus.majorCodes[p.getData().getStatus().getMajorCode()], 1);
         return p.getData().getStatus();
       }
     }
     return null;
   }
-  
+
   private void updateStatus(int bytesInPage) throws IOException {
     pages.incrementAndGet();
     bytes.addAndGet(bytesInPage);
@@ -740,10 +759,10 @@
   public String getReprUrl() {
     return reprUrl;
   }
-  
+
   private void setReprUrl(String urlString) {
     this.reprUrl = urlString;
-    
+
   }
 
 }
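
FetcherThread now precomputes fromDomain next to fromHost and hands both, plus
the two flags, to ParseOutputFormat.filterNormalize. A hedged sketch of the
extended call; passing null for filters and normalizers assumes the method
null-checks both, as it visibly does for normalizers in the tail of the method:

    import java.net.URL;
    import org.apache.nutch.parse.ParseOutputFormat;
    import org.apache.nutch.util.URLUtil;

    public class DomainFilterSketch {
      public static void main(String[] args) throws Exception {
        String fromUrl = "http://blog.example.com/post.html";
        String fromHost = new URL(fromUrl).getHost().toLowerCase();       // blog.example.com
        String fromDomain = URLUtil.getDomainName(fromUrl).toLowerCase(); // example.com

        // Same registered domain, different host: kept when only the
        // domain check is enabled.
        System.out.println(ParseOutputFormat.filterNormalize(fromUrl,
            "http://www.example.com/", fromHost, fromDomain,
            false /* ignoreExternalHost */, true /* ignoreExternalDomain */,
            null, null));   // http://www.example.com/

        // Different registered domain: dropped.
        System.out.println(ParseOutputFormat.filterNormalize(fromUrl,
            "http://other.org/", fromHost, fromDomain,
            false, true, null, null));   // null
      }
    }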
Index: src/java/org/apache/nutch/parse/ParseOutputFormat.java
===================================================================
--- src/java/org/apache/nutch/parse/ParseOutputFormat.java	(revision 1693239)
+++ src/java/org/apache/nutch/parse/ParseOutputFormat.java	(working copy)
@@ -18,33 +18,40 @@
 package org.apache.nutch.parse;
 
 // Commons Logging imports
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
+import java.io.IOException;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map.Entry;
 
-import org.apache.hadoop.conf.*;
-import org.apache.hadoop.io.*;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.MapFile;
+import org.apache.hadoop.io.MapWritable;
+import org.apache.hadoop.io.SequenceFile;
 import org.apache.hadoop.io.SequenceFile.CompressionType;
-import org.apache.hadoop.fs.*;
-import org.apache.hadoop.mapred.*;
-import org.apache.hadoop.util.StringUtils;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.FileOutputFormat;
+import org.apache.hadoop.mapred.InvalidJobConfException;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.OutputFormat;
+import org.apache.hadoop.mapred.RecordWriter;
+import org.apache.hadoop.mapred.Reporter;
+import org.apache.hadoop.mapred.SequenceFileOutputFormat;
+import org.apache.hadoop.util.Progressable;
 import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.fetcher.Fetcher;
+import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.net.URLFilters;
+import org.apache.nutch.net.URLNormalizers;
 import org.apache.nutch.scoring.ScoringFilterException;
 import org.apache.nutch.scoring.ScoringFilters;
 import org.apache.nutch.util.StringUtil;
 import org.apache.nutch.util.URLUtil;
-import org.apache.nutch.metadata.Nutch;
-import org.apache.nutch.net.*;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
-import java.io.*;
-import java.net.MalformedURLException;
-import java.net.URL;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.Map.Entry;
-
-import org.apache.hadoop.util.Progressable;
-
 /* Parse content in a segment. */
 public class ParseOutputFormat implements OutputFormat<Text, Parse> {
   private static final Logger LOG = LoggerFactory
@@ -102,8 +109,10 @@
 
     this.scfilters = new ScoringFilters(job);
     final int interval = job.getInt("db.fetch.interval.default", 2592000);
-    final boolean ignoreExternalLinks = job.getBoolean(
-        "db.ignore.external.links", false);
+    final boolean ignoreExternalLinks = job
+        .getBoolean("db.ignore.external.links", false);
+    final boolean ignoreLinksOutsideDomain = job
+        .getBoolean("db.ignore.external.links.domain", false);
     int maxOutlinksPerPage = job.getInt("db.max.outlinks.per.page", 100);
     final boolean isParsing = job.getBoolean("fetcher.parse", true);
     final int maxOutlinks = (maxOutlinksPerPage < 0) ? Integer.MAX_VALUE
@@ -134,6 +143,7 @@
 
         String fromUrl = key.toString();
         String fromHost = null;
+        String fromDomain = null;
         textOut.append(key, new ParseText(parse.getText()));
 
         ParseData parseData = parse.getData();
@@ -176,14 +186,24 @@
           fromHost = null;
         }
 
+        if (ignoreLinksOutsideDomain) {
+          try {
+            fromDomain = URLUtil.getDomainName(fromUrl).toLowerCase();
+          } catch (MalformedURLException e) {
+            fromDomain = null;
+          }
+        } else {
+          fromDomain = null;
+        }
+
         ParseStatus pstatus = parseData.getStatus();
         if (pstatus != null && pstatus.isSuccess()
             && pstatus.getMinorCode() == ParseStatus.SUCCESS_REDIRECT) {
           String newUrl = pstatus.getMessage();
           int refreshTime = Integer.valueOf(pstatus.getArgs()[1]);
-          newUrl = filterNormalize(fromUrl, newUrl, fromHost,
-              ignoreExternalLinks, filters, normalizers,
-              URLNormalizers.SCOPE_FETCHER);
+          newUrl = filterNormalize(fromUrl, newUrl, fromHost, fromDomain,
+              ignoreExternalLinks, ignoreLinksOutsideDomain, filters,
+              normalizers, URLNormalizers.SCOPE_FETCHER);
 
           if (newUrl != null) {
             String reprUrl = URLUtil.chooseRepr(fromUrl, newUrl,
@@ -213,13 +233,15 @@
           // Only normalize and filter if fetcher.parse = false
           if (!isParsing) {
             toUrl = ParseOutputFormat.filterNormalize(fromUrl, toUrl, fromHost,
-                ignoreExternalLinks, filters, normalizers);
+                fromDomain, ignoreExternalLinks, ignoreLinksOutsideDomain,
+                filters, normalizers);
             if (toUrl == null) {
               continue;
             }
           }
 
-          CrawlDatum target = new CrawlDatum(CrawlDatum.STATUS_LINKED, interval);
+          CrawlDatum target = new CrawlDatum(CrawlDatum.STATUS_LINKED,
+              interval);
           Text targetUrl = new Text(toUrl);
 
           // see if the outlink has any metadata attached
@@ -251,8 +273,8 @@
           adjust = scfilters.distributeScoreToOutlinks(key, parseData, targets,
               null, links.length);
         } catch (ScoringFilterException e) {
-          LOG.warn("Cannot distribute score from " + key + ": "
-              + e.getMessage());
+          LOG.warn(
+              "Cannot distribute score from " + key + ": " + e.getMessage());
         }
         for (Entry<Text, CrawlDatum> target : targets) {
           crawlOut.append(target.getKey(), target.getValue());
@@ -260,10 +282,11 @@
         if (adjust != null)
           crawlOut.append(key, adjust);
 
-        Outlink[] filteredLinks = outlinkList.toArray(new Outlink[outlinkList
-            .size()]);
+        Outlink[] filteredLinks = outlinkList
+            .toArray(new Outlink[outlinkList.size()]);
         parseData = new ParseData(parseData.getStatus(), parseData.getTitle(),
-            filteredLinks, parseData.getContentMeta(), parseData.getParseMeta());
+            filteredLinks, parseData.getContentMeta(),
+            parseData.getParseMeta());
         dataOut.append(key, parseData);
         if (!parse.isCanonical()) {
           CrawlDatum datum = new CrawlDatum();
@@ -291,20 +314,23 @@
   }
 
   public static String filterNormalize(String fromUrl, String toUrl,
-      String fromHost, boolean ignoreExternalLinks, URLFilters filters,
+      String fromHost, String fromDomain, boolean ignoreExternalHost,
+      boolean ignoreExternalDomain, URLFilters filters,
       URLNormalizers normalizers) {
-    return filterNormalize(fromUrl, toUrl, fromHost, ignoreExternalLinks,
-        filters, normalizers, URLNormalizers.SCOPE_OUTLINK);
+    return filterNormalize(fromUrl, toUrl, fromHost, fromDomain,
+        ignoreExternalHost, ignoreExternalDomain, filters, normalizers,
+        URLNormalizers.SCOPE_OUTLINK);
   }
 
   public static String filterNormalize(String fromUrl, String toUrl,
-      String fromHost, boolean ignoreExternalLinks, URLFilters filters,
+      String fromHost, String fromDomain, boolean ignoreExternalHost,
+      boolean ignoreExternalDomain, URLFilters filters,
       URLNormalizers normalizers, String urlNormalizerScope) {
     // ignore links to self (or anchors within the page)
     if (fromUrl.equals(toUrl)) {
       return null;
     }
-    if (ignoreExternalLinks) {
+    if (ignoreExternalHost) {
       String toHost;
       try {
         toHost = new URL(toUrl).getHost().toLowerCase();
@@ -315,6 +341,17 @@
         return null; // skip it
       }
     }
+    if (ignoreExternalDomain) {
+      String toDomain;
+      try {
+        toDomain = URLUtil.getDomainName(toUrl).toLowerCase();
+      } catch (MalformedURLException e) {
+        toDomain = null;
+      }
+      if (toDomain == null || !toDomain.equals(fromDomain)) { // external domain
+        return null; // skip it
+      }
+    }
     try {
       if (normalizers != null) {
         toUrl = normalizers.normalize(toUrl, urlNormalizerScope); // normalize
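To restrict a crawl to the registered domains of the injected seeds, the new
property can be enabled in conf/nutch-site.xml (a usage sketch; leave
db.ignore.external.links at false, since the host check takes precedence and
would otherwise narrow the scope back to single hosts):

    <property>
      <name>db.ignore.external.links.domain</name>
      <value>true</value>
    </property>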
