package dk.binaryconstructors.nutchindexer.nutch.plugins;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.Reader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Locale;
import java.util.Set;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.net.URLFilter;
/** A replacement for the buggy org.apache.nutch.urlfilter.domain.DomainURLFilter.
*
* Loads a list of domains from a file on the classpath (the file path is specified
* in the Nutch configuration by the property "urlfilter.domain.file"). If an input
* URL doesn't match any of the domains, or if it is malformed, then it is rejected.
*
* Specifying "org" in the domain list will match all hostnames with the TLD ".org";
* specifying "apache.org" will match "apache.org", "www.apache.org", "lucene.apache.org",
* etc.
*
* The manifest for this plugin is in
* src/main/resources/default/plugins/binaryconstructors-core/plugin.xml
*
* @author Mike Baranczak
*/
public class DomainURLFilter implements URLFilter {

    /** Name of the Nutch configuration property that holds the domain file path. */
    public static final String NUTCH_CONF_DOMAIN_FILE = "urlfilter.domain.file";

    private static final Log log = LogFactory.getLog(DomainURLFilter.class);

    private Configuration conf;

    /** Path of the domain file; read from the configuration on first use if not set. */
    private String domainFile;

    /** Allowed domains, lower-cased. Stays null until the first call to {@link #filter}. */
    private Set<String> domainSet;

    /**
     * Reads the domain file line by line and stores the resulting set of
     * allowed domains. Blank lines and lines starting with "#" are skipped;
     * every other line is stripped and lower-cased.
     *
     * Failures are logged rather than thrown. Whatever was read before the
     * failure is kept, so after this method returns the domain set is never
     * null (but it may be empty).
     */
    private void initDomainSet() {
        // Fill a local set and assign the field only once loading is done,
        // so the field never refers to a set that is still being populated.
        Set<String> loaded = new HashSet<String>();
        BufferedReader reader = null;
        try {
            if (domainFile == null) {
                setDomainFile(conf.get(NUTCH_CONF_DOMAIN_FILE));
            }
            log.info("reading domain file at "+domainFile);
            // The configuration may hand back null when the resource cannot
            // be located; report that explicitly instead of via an NPE.
            Reader resource = conf.getConfResourceAsReader(domainFile);
            if (resource == null) {
                log.error("error initializing filter! domain file not found: " + domainFile);
            } else {
                reader = new BufferedReader(resource);
                String line;
                while ((line = reader.readLine()) != null) {
                    line = StringUtils.strip(line);
                    if (StringUtils.isNotBlank(line) && !line.startsWith("#")) {
                        // Host names are ASCII-based: lower-case them
                        // independently of the JVM's default locale.
                        line = line.toLowerCase(Locale.ROOT);
                        log.debug(line);
                        loaded.add(line);
                    }
                }
            }
        } catch (Exception ex) {
            log.error("error initializing filter!", ex);
        } finally {
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException ex) {
                    log.warn("could not close domain file " + domainFile, ex);
                }
            }
        }
        if (loaded.isEmpty()) {
            log.warn("the list of allowed domains is empty - crawling will probably fail!");
        }
        domainSet = loaded;
    }

    /** Only exposed for the purpose of unit testing. */
    protected void setDomainFile(String domainFile) {
        this.domainFile = domainFile;
    }

    /**
     * Return null if the URL is to be rejected, otherwise return the URL unchanged.
     *
     * A URL is rejected when it is blank, when it cannot be parsed, or when
     * its host matches none of the allowed domains. The domain list is loaded
     * on the first call.
     *
     * @param urlString the URL to check
     * @return urlString itself if it is accepted, otherwise null
     */
    @Override
    public String filter(String urlString) {
        if (domainSet == null) {
            initDomainSet();
        }
        if (StringUtils.isBlank(urlString)) {
            log.debug("rejecting blank URL");
            return null;
        }
        URL url;
        try {
            url = new URL(urlString);
        } catch (MalformedURLException ex) {
            if (log.isDebugEnabled()) {
                log.debug("rejecting malformed URL: "+urlString);
            }
            return null;
        }
        // check URL's host against all allowed domains
        for (String domain : domainSet) {
            if (matches(url, domain)) {
                return urlString;
            }
        }
        return null;
    }

    /**
     * Tells whether the URL's host is the given domain or a subdomain of it.
     *
     * @param url the parsed URL whose host is examined
     * @param host an allowed domain, already lower-cased
     * @return true if the lower-cased URL host equals host or ends with "." + host
     */
    private boolean matches(URL url, String host) {
        String urlHost = url.getHost();
        if (urlHost == null) {
            return false;
        }
        urlHost = urlHost.toLowerCase(Locale.ROOT);
        // The leading dot keeps "notapache.org" from matching "apache.org".
        return urlHost.equals(host) || urlHost.endsWith("." + host);
    }

    @Override
    public Configuration getConf() {
        return conf;
    }

    @Override
    public void setConf(Configuration conf) {
        this.conf = conf;
    }
}