Index: src/java/org/apache/nutch/fetcher/FetcherThread.java
===================================================================
--- src/java/org/apache/nutch/fetcher/FetcherThread.java (revision 1730536)
+++ src/java/org/apache/nutch/fetcher/FetcherThread.java (working copy)
@@ -84,6 +84,7 @@
private String reprUrl;
private boolean redirecting;
private int redirectCount;
+ private boolean ignoreInternalLinks;
private boolean ignoreExternalLinks;
private String ignoreExternalLinksMode;
@@ -174,6 +175,7 @@
maxOutlinks = (maxOutlinksPerPage < 0) ? Integer.MAX_VALUE
: maxOutlinksPerPage;
interval = conf.getInt("db.fetch.interval.default", 2592000);
+ ignoreInternalLinks = conf.getBoolean("db.ignore.internal.links", false);
ignoreExternalLinks = conf.getBoolean("db.ignore.external.links", false);
ignoreExternalLinksMode = conf.get("db.ignore.external.links.mode", "byHost");
maxOutlinkDepth = conf.getInt("fetcher.follow.outlinks.depth", -1);
@@ -428,10 +430,10 @@
newUrl = normalizers.normalize(newUrl, URLNormalizers.SCOPE_FETCHER);
newUrl = urlFilters.filter(newUrl);
- if (ignoreExternalLinks) {
- try {
- String origHost = new URL(urlString).getHost().toLowerCase();
- String newHost = new URL(newUrl).getHost().toLowerCase();
+ try {
+ String origHost = new URL(urlString).getHost().toLowerCase();
+ String newHost = new URL(newUrl).getHost().toLowerCase();
+ if (ignoreExternalLinks) {
if (!origHost.equals(newHost)) {
if (LOG.isDebugEnabled()) {
LOG.debug(" - ignoring redirect " + redirType + " from "
@@ -440,10 +442,20 @@
}
return null;
}
- } catch (MalformedURLException e) {
}
- }
-
+
+ if (ignoreInternalLinks) {
+ if (origHost.equals(newHost)) {
+ if (LOG.isDebugEnabled()) {
+ LOG.debug(" - ignoring redirect " + redirType + " from "
+ + urlString + " to " + newUrl
+ + " because internal links are ignored");
+ }
+ return null;
+ }
+ }
+ } catch (MalformedURLException e) { }
+
if (newUrl != null && !newUrl.equals(urlString)) {
reprUrl = URLUtil.chooseRepr(reprUrl, newUrl, temp);
url = new Text(newUrl);
@@ -621,7 +633,7 @@
// collect outlinks for subsequent db update
Outlink[] links = parseData.getOutlinks();
int outlinksToStore = Math.min(maxOutlinks, links.length);
- if (ignoreExternalLinks) {
+ if (ignoreExternalLinks || ignoreInternalLinks) {
URL originURL = new URL(url.toString());
// based on domain?
if ("bydomain".equalsIgnoreCase(ignoreExternalLinksMode)) {
@@ -648,7 +660,7 @@
String toUrl = links[i].getToUrl();
toUrl = ParseOutputFormat.filterNormalize(url.toString(), toUrl,
- origin, ignoreExternalLinks, ignoreExternalLinksMode, urlFilters, normalizers);
+ origin, ignoreInternalLinks, ignoreExternalLinks, ignoreExternalLinksMode, urlFilters, normalizers);
if (toUrl == null) {
continue;
}
Index: src/java/org/apache/nutch/parse/ParseOutputFormat.java
===================================================================
--- src/java/org/apache/nutch/parse/ParseOutputFormat.java (revision 1730536)
+++ src/java/org/apache/nutch/parse/ParseOutputFormat.java (working copy)
@@ -102,6 +102,8 @@
this.scfilters = new ScoringFilters(job);
final int interval = job.getInt("db.fetch.interval.default", 2592000);
+ final boolean ignoreInternalLinks = job.getBoolean(
+ "db.ignore.internal.links", false);
final boolean ignoreExternalLinks = job.getBoolean(
"db.ignore.external.links", false);
final String ignoreExternalLinksMode = job.get(
@@ -189,7 +191,7 @@
crawlOut.append(key, parseMDCrawlDatum);
// need to determine origin (once for all outlinks)
- if (ignoreExternalLinks) {
+ if (ignoreExternalLinks || ignoreInternalLinks) {
URL originURL = new URL(fromUrl.toString());
// based on domain?
if ("bydomain".equalsIgnoreCase(ignoreExternalLinksMode)) {
@@ -207,7 +209,7 @@
String newUrl = pstatus.getMessage();
int refreshTime = Integer.valueOf(pstatus.getArgs()[1]);
newUrl = filterNormalize(fromUrl, newUrl, origin,
- ignoreExternalLinks, ignoreExternalLinksMode, filters, normalizers,
+ ignoreInternalLinks, ignoreExternalLinks, ignoreExternalLinksMode, filters, normalizers,
URLNormalizers.SCOPE_FETCHER);
if (newUrl != null) {
@@ -238,7 +240,7 @@
// Only normalize and filter if fetcher.parse = false
if (!isParsing) {
toUrl = ParseOutputFormat.filterNormalize(fromUrl, toUrl, origin,
- ignoreExternalLinks, ignoreExternalLinksMode, filters, normalizers);
+ ignoreInternalLinks, ignoreExternalLinks, ignoreExternalLinksMode, filters, normalizers);
if (toUrl == null) {
continue;
}
@@ -316,22 +318,22 @@
}
public static String filterNormalize(String fromUrl, String toUrl,
- String fromHost, boolean ignoreExternalLinks,
+ String fromHost, boolean ignoreInternalLinks, boolean ignoreExternalLinks,
String ignoreExternalLinksMode, URLFilters filters,
URLNormalizers normalizers) {
- return filterNormalize(fromUrl, toUrl, fromHost, ignoreExternalLinks,
+ return filterNormalize(fromUrl, toUrl, fromHost, ignoreInternalLinks, ignoreExternalLinks,
ignoreExternalLinksMode, filters, normalizers,
URLNormalizers.SCOPE_OUTLINK);
}
public static String filterNormalize(String fromUrl, String toUrl,
- String origin, boolean ignoreExternalLinks, String ignoreExternalLinksMode, URLFilters filters,
+ String origin, boolean ignoreInternalLinks, boolean ignoreExternalLinks, String ignoreExternalLinksMode, URLFilters filters,
URLNormalizers normalizers, String urlNormalizerScope) {
// ignore links to self (or anchors within the page)
if (fromUrl.equals(toUrl)) {
return null;
}
- if (ignoreExternalLinks) {
+ if (ignoreExternalLinks || ignoreInternalLinks) {
URL targetURL = null;
try {
targetURL = new URL(toUrl);
@@ -338,15 +340,30 @@
} catch (MalformedURLException e1) {
return null; // skip it
}
- if ("bydomain".equalsIgnoreCase(ignoreExternalLinksMode)) {
- String toDomain = URLUtil.getDomainName(targetURL).toLowerCase();
- if (toDomain == null || !toDomain.equals(origin)) {
- return null; // skip it
+ if (ignoreExternalLinks) {
+ if ("bydomain".equalsIgnoreCase(ignoreExternalLinksMode)) {
+ String toDomain = URLUtil.getDomainName(targetURL).toLowerCase();
+ if (toDomain == null || !toDomain.equals(origin)) {
+ return null; // skip it
+ }
+ } else {
+ String toHost = targetURL.getHost().toLowerCase();
+ if (toHost == null || !toHost.equals(origin)) {
+ return null; // skip it
+ }
}
- } else {
- String toHost = targetURL.getHost().toLowerCase();
- if (toHost == null || !toHost.equals(origin)) {
- return null; // skip it
+ }
+ if (ignoreInternalLinks) {
+ if ("bydomain".equalsIgnoreCase(ignoreExternalLinksMode)) {
+ String toDomain = URLUtil.getDomainName(targetURL).toLowerCase();
+ if (toDomain == null || toDomain.equals(origin)) {
+ return null; // skip it
+ }
+ } else {
+ String toHost = targetURL.getHost().toLowerCase();
+ if (toHost == null || toHost.equals(origin)) {
+ return null; // skip it
+ }
}
}
}
Index: conf/nutch-default.xml
===================================================================
--- conf/nutch-default.xml (revision 1730536)
+++ conf/nutch-default.xml (working copy)
@@ -539,11 +539,11 @@
db.ignore.internal.links
- true
- If true, when adding new links to a page, links from
- the same host are ignored. This is an effective way to limit the
- size of the link database, keeping only the highest quality
- links.
+ false
+ If true, outlinks leading from a page to internal hosts or domains
+ will be ignored. This is an effective way to limit the crawl to include
+ only initially injected hosts, without creating complex URLFilters.
+ See 'db.ignore.external.links.mode'.
@@ -616,15 +616,6 @@
- db.max.inlinks
- 10000
- Maximum number of Inlinks per URL to be kept in LinkDb.
- If "invertlinks" finds more inlinks than this number, only the first
- N inlinks will be stored, and the rest will be discarded.
-
-
-
-
db.max.outlinks.per.page
100
The maximum number of outlinks that we'll process for a page.
@@ -681,6 +672,35 @@
+
+
+
+ linkdb.max.inlinks
+ 10000
+ Maximum number of Inlinks per URL to be kept in LinkDb.
+ If "invertlinks" finds more inlinks than this number, only the first
+ N inlinks will be stored, and the rest will be discarded.
+
+
+
+
+ linkdb.ignore.internal.links
+ true
+ If true, when adding new links to a page, links from
+ the same host are ignored. This is an effective way to limit the
+ size of the link database, keeping only the highest quality
+ links.
+
+
+
+
+ linkdb.ignore.external.links
+ false
+ If true, when adding new links to a page, links from
+ a different host are ignored.
+
+
+