Index: conf/nutch-default.xml
===================================================================
--- conf/nutch-default.xml (revision 1730536)
+++ conf/nutch-default.xml (working copy)
@@ -539,11 +539,11 @@
* If more than one LinkDb contains information about the same URL, all inlinks
- * are accumulated, but only at most db.max.inlinks inlinks will
+ * are accumulated, but only at most linkdb.max.inlinks inlinks will
* ever be added.
*
@@ -104,7 +104,7 @@ } public void configure(JobConf job) { - maxInlinks = job.getInt("db.max.inlinks", 10000); + maxInlinks = job.getInt("linkdb.max.inlinks", 10000); } public void close() throws IOException { Index: src/java/org/apache/nutch/fetcher/FetcherThread.java =================================================================== --- src/java/org/apache/nutch/fetcher/FetcherThread.java (revision 1730536) +++ src/java/org/apache/nutch/fetcher/FetcherThread.java (working copy) @@ -84,8 +84,10 @@ private String reprUrl; private boolean redirecting; private int redirectCount; + private boolean ignoreInternalLinks; private boolean ignoreExternalLinks; private String ignoreExternalLinksMode; + private boolean ignoreTreatRedirectsAsLinks; // Used by fetcher.follow.outlinks.depth in parse private int maxOutlinksPerPage; @@ -174,8 +176,10 @@ maxOutlinks = (maxOutlinksPerPage < 0) ? Integer.MAX_VALUE : maxOutlinksPerPage; interval = conf.getInt("db.fetch.interval.default", 2592000); + ignoreInternalLinks = conf.getBoolean("db.ignore.internal.links", false); ignoreExternalLinks = conf.getBoolean("db.ignore.external.links", false); ignoreExternalLinksMode = conf.get("db.ignore.external.links.mode", "byHost"); + ignoreTreatRedirectsAsLinks = conf.getBoolean("db.ignore.treat.redirects.as.links", true); maxOutlinkDepth = conf.getInt("fetcher.follow.outlinks.depth", -1); outlinksIgnoreExternal = conf.getBoolean( "fetcher.follow.outlinks.ignore.external", false); @@ -428,22 +432,34 @@ newUrl = normalizers.normalize(newUrl, URLNormalizers.SCOPE_FETCHER); newUrl = urlFilters.filter(newUrl); - if (ignoreExternalLinks) { + if (ignoreTreatRedirectsAsLinks) { try { String origHost = new URL(urlString).getHost().toLowerCase(); String newHost = new URL(newUrl).getHost().toLowerCase(); - if (!origHost.equals(newHost)) { - if (LOG.isDebugEnabled()) { - LOG.debug(" - ignoring redirect " + redirType + " from " - + urlString + " to " + newUrl - + " because external links are ignored"); + 
if (ignoreExternalLinks) { + if (!origHost.equals(newHost)) { + if (LOG.isDebugEnabled()) { + LOG.debug(" - ignoring redirect " + redirType + " from " + + urlString + " to " + newUrl + + " because external links are ignored"); + } + return null; } - return null; } - } catch (MalformedURLException e) { - } + + if (ignoreInternalLinks) { + if (origHost.equals(newHost)) { + if (LOG.isDebugEnabled()) { + LOG.debug(" - ignoring redirect " + redirType + " from " + + urlString + " to " + newUrl + + " because internal links are ignored"); + } + return null; + } + } + } catch (MalformedURLException e) { } } - + if (newUrl != null && !newUrl.equals(urlString)) { reprUrl = URLUtil.chooseRepr(reprUrl, newUrl, temp); url = new Text(newUrl); @@ -621,7 +637,7 @@ // collect outlinks for subsequent db update Outlink[] links = parseData.getOutlinks(); int outlinksToStore = Math.min(maxOutlinks, links.length); - if (ignoreExternalLinks) { + if (ignoreExternalLinks || ignoreInternalLinks) { URL originURL = new URL(url.toString()); // based on domain? 
if ("bydomain".equalsIgnoreCase(ignoreExternalLinksMode)) { @@ -648,7 +664,7 @@ String toUrl = links[i].getToUrl(); toUrl = ParseOutputFormat.filterNormalize(url.toString(), toUrl, - origin, ignoreExternalLinks, ignoreExternalLinksMode, urlFilters, normalizers); + origin, ignoreInternalLinks, ignoreExternalLinks, ignoreExternalLinksMode, urlFilters, normalizers); if (toUrl == null) { continue; } Index: src/java/org/apache/nutch/parse/ParseOutputFormat.java =================================================================== --- src/java/org/apache/nutch/parse/ParseOutputFormat.java (revision 1730536) +++ src/java/org/apache/nutch/parse/ParseOutputFormat.java (working copy) @@ -102,6 +102,8 @@ this.scfilters = new ScoringFilters(job); final int interval = job.getInt("db.fetch.interval.default", 2592000); + final boolean ignoreInternalLinks = job.getBoolean( + "db.ignore.internal.links", false); final boolean ignoreExternalLinks = job.getBoolean( "db.ignore.external.links", false); final String ignoreExternalLinksMode = job.get( @@ -189,7 +191,7 @@ crawlOut.append(key, parseMDCrawlDatum); // need to determine origin (once for all outlinks) - if (ignoreExternalLinks) { + if (ignoreExternalLinks || ignoreInternalLinks) { URL originURL = new URL(fromUrl.toString()); // based on domain? 
if ("bydomain".equalsIgnoreCase(ignoreExternalLinksMode)) { @@ -207,7 +209,7 @@ String newUrl = pstatus.getMessage(); int refreshTime = Integer.valueOf(pstatus.getArgs()[1]); newUrl = filterNormalize(fromUrl, newUrl, origin, - ignoreExternalLinks, ignoreExternalLinksMode, filters, normalizers, + ignoreInternalLinks, ignoreExternalLinks, ignoreExternalLinksMode, filters, normalizers, URLNormalizers.SCOPE_FETCHER); if (newUrl != null) { @@ -238,7 +240,7 @@ // Only normalize and filter if fetcher.parse = false if (!isParsing) { toUrl = ParseOutputFormat.filterNormalize(fromUrl, toUrl, origin, - ignoreExternalLinks, ignoreExternalLinksMode, filters, normalizers); + ignoreInternalLinks, ignoreExternalLinks, ignoreExternalLinksMode, filters, normalizers); if (toUrl == null) { continue; } @@ -316,22 +318,22 @@ } public static String filterNormalize(String fromUrl, String toUrl, - String fromHost, boolean ignoreExternalLinks, + String fromHost, boolean ignoreInternalLinks, boolean ignoreExternalLinks, String ignoreExternalLinksMode, URLFilters filters, URLNormalizers normalizers) { - return filterNormalize(fromUrl, toUrl, fromHost, ignoreExternalLinks, + return filterNormalize(fromUrl, toUrl, fromHost, ignoreInternalLinks, ignoreExternalLinks, ignoreExternalLinksMode, filters, normalizers, URLNormalizers.SCOPE_OUTLINK); } public static String filterNormalize(String fromUrl, String toUrl, - String origin, boolean ignoreExternalLinks, String ignoreExternalLinksMode, URLFilters filters, + String origin, boolean ignoreInternalLinks, boolean ignoreExternalLinks, String ignoreExternalLinksMode, URLFilters filters, URLNormalizers normalizers, String urlNormalizerScope) { // ignore links to self (or anchors within the page) if (fromUrl.equals(toUrl)) { return null; } - if (ignoreExternalLinks) { + if (ignoreExternalLinks || ignoreInternalLinks) { URL targetURL = null; try { targetURL = new URL(toUrl); @@ -338,15 +340,30 @@ } catch (MalformedURLException e1) { return null; // skip 
it } - if ("bydomain".equalsIgnoreCase(ignoreExternalLinksMode)) { - String toDomain = URLUtil.getDomainName(targetURL).toLowerCase(); - if (toDomain == null || !toDomain.equals(origin)) { - return null; // skip it + if (ignoreExternalLinks) { + if ("bydomain".equalsIgnoreCase(ignoreExternalLinksMode)) { + String toDomain = URLUtil.getDomainName(targetURL).toLowerCase(); + if (toDomain == null || !toDomain.equals(origin)) { + return null; // skip it + } + } else { + String toHost = targetURL.getHost().toLowerCase(); + if (toHost == null || !toHost.equals(origin)) { + return null; // skip it + } } - } else { - String toHost = targetURL.getHost().toLowerCase(); - if (toHost == null || !toHost.equals(origin)) { - return null; // skip it + } + if (ignoreInternalLinks) { + if ("bydomain".equalsIgnoreCase(ignoreExternalLinksMode)) { + String toDomain = URLUtil.getDomainName(targetURL).toLowerCase(); + if (toDomain == null || toDomain.equals(origin)) { + return null; // skip it + } + } else { + String toHost = targetURL.getHost().toLowerCase(); + if (toHost == null || toHost.equals(origin)) { + return null; // skip it + } } } }