Index: src/plugin/urlfilter-db/src/java/org/apache/nutch/net/DbURLFilter.java
===================================================================
--- src/plugin/urlfilter-db/src/java/org/apache/nutch/net/DbURLFilter.java	(revision 0)
+++ src/plugin/urlfilter-db/src/java/org/apache/nutch/net/DbURLFilter.java	(revision 0)
@@ -0,0 +1,337 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.net;
+
+import org.apache.nutch.util.NutchConf;
+import org.apache.nutch.util.LogFormatter;
+
+import org.apache.nutch.plugin.Extension;
+import org.apache.nutch.plugin.PluginRepository;
+import org.apache.nutch.io.MD5Hash;
+
+import java.io.Reader;
+import java.io.FileReader;
+import java.io.BufferedReader;
+import java.io.InputStreamReader;
+import java.io.IOException;
+
+import java.util.List;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.logging.Logger;
+
+import net.sf.swarmcache.*;
+
+import java.sql.*;
+import java.net.URL;
+
+/**
+ * Filters URLs based on their existence in a database table.
+ * The purpose of this plugin is to be used by SE who might want to use
+ * the whole web concept but limit the fetch to certain domains only.
+ * If a domain url such as http://www.domain.tld exists in the db, the filter
+ * returns the url (so it is fetched). The protocol and host are extracted with
+ * the java.net.URL object, so every url must be parseable by that class.
+ * Make sure the urls in the database do not have a trailing /, i.e. use
+ * http://www.domain.com and not http://www.domain.com/
+ *
+ * This plugin uses a caching system to keep db access as limited as possible.
+ * When a request arrives at the filter, the filter makes a hash of the protocol + domain
+ * and checks the cache for the existence of that hash.
+ * If the hash is not in the cache, the filter tries to find the domain in the database.
+ * If it exists in the database, it is hashed and put in the cache, and true (exists) is returned.
+ * If it does not exist, false is returned.
+ *
+ * The cache default is "urlfilter.db.elements" = 100000 elements in cache (about 600KB)
+ * Set it to your needs.
+ *
+ * "CREATE TABLE urls (URL varchar(80));"
+ *
+ * The table name (default to "urls") should be specified at
+ * (1) property "urlfilter.db.table" in ./conf/nutch-default.xml, and
+ * (2) attribute "table" in plugin.xml of this plugin
+ * Attribute "table" has higher precedence if defined.
+ * 
+ * The column name in the table should be specified at
+ * (3) property "urlfilter.db.column" in ./conf/nutch-default.xml, and
+ * (4) attribute "column" in plugin.xml of this plugin
+ * Attribute "column" has higher precedence if defined.
+ *
+ * The database JDBC driver to be used (jar must be in the path) i.e.:
+ * com.mysql.jdbc.Driver
+ * (5) property "urlfilter.db.driver" in ./conf/nutch-default.xml, and
+ * (6) attribute "driver" in plugin.xml of this plugin
+ *
+ * The database JDBC connection definition i.e.:
+ * jdbc:mysql://localhost/nutch?user=username&password=password
+ * (7) property "urlfilter.db.connection" in ./conf/nutch-default.xml, and
+ * (8) attribute "connection" in plugin.xml of this plugin
+ *
+ * The number of objects to cache
+ * (9) property "urlfilter.db.elements" in ./conf/nutch-default.xml, and
+ * (10) attribute "elements" in plugin.xml of this plugin
+ *
+ */
+
+public class DbURLFilter implements URLFilter {
+  private HybridCache domainCache;        // cache of known domains, created per instance
+  private String SQL = null;              // query prefix; set by the constructor
+  private static Connection conn = null;  // static, yet assigned in the constructor
+  private static Statement stmt = null;   // static, yet assigned in the constructor
+
+  private static final Logger LOG =
+    LogFormatter.getLogger(DbURLFilter.class.getName());
+
+  // plugin.xml attributes of this plugin; null (0 for elements) means "not defined".
+  private static String driver = null;
+  private static String column = null;
+  private static String table = null;
+  private static String connection = null;
+  private static int elements = 0;
+  // Reads the optional attributes of the "urlfilter-db" extension from plugin.xml.
+  // Blank values count as undefined. The connection value itself is never logged,
+  // because a JDBC connection string can carry the database user and password.
+  static {
+    String pluginName = "urlfilter-db";
+    Extension[] extensions = PluginRepository.getInstance()
+      .getExtensionPoint(URLFilter.class.getName()).getExtensions();
+    for (int i=0; i < extensions.length; i++) {
+      Extension extension = extensions[i];
+      if (extension.getDescriptor().getPluginId().equals(pluginName)) {
+        driver = extension.getAttribute("driver");
+        column = extension.getAttribute("column");
+        table = extension.getAttribute("table");
+        connection = extension.getAttribute("connection");
+        try{
+          elements = Integer.parseInt(extension.getAttribute("elements"));
+        } catch (NumberFormatException nfe) {
+          // absent or non-numeric: elements stays 0, i.e. "not defined"
+        }
+        break;
+      }
+    }
+
+    // Normalise each attribute, then report the ones that are defined.
+    if (driver != null && driver.trim().equals("")) {
+      driver = null;
+    }
+    if (driver != null) {
+      LOG.info("Attribute \"driver\" is defined for plugin "+pluginName+" as "+driver);
+    }
+
+    if (column != null && column.trim().equals("")) {
+      column = null;
+    }
+    if (column != null) {
+      LOG.info("Attribute \"column\" is defined for plugin "+pluginName+" as "+column);
+    }
+
+    if (table != null && table.trim().equals("")) {
+      table = null;
+    }
+    if (table != null) {
+      LOG.info("Attribute \"table\" is defined for plugin "+pluginName+" as "+table);
+    }
+
+    if (connection != null && connection.trim().equals("")) {
+      connection = null;
+    }
+    if (connection != null) {
+      // value deliberately omitted: it may contain credentials
+      LOG.info("Attribute \"connection\" is defined for plugin "+pluginName);
+    }
+
+    if (elements != 0) {
+      LOG.info("Attribute \"elements\" is defined for plugin "+pluginName+" as "+elements);
+    }
+  }
+
+  /**
+   * Resolves the settings (plugin.xml attributes override the NutchConf
+   * "urlfilter.db.*" properties), builds the query prefix, creates the cache
+   * and opens the JDBC connection.
+   * @throws IllegalStateException if driver, column, table or connection is unset
+   */
+  public DbURLFilter() throws IOException, ClassNotFoundException,
+      InstantiationException, IllegalAccessException, SQLException {
+    NutchConf conf = NutchConf.get();
+    // attributes take precedence over the configuration properties if defined
+    String adriver = driver != null ? driver : conf.get("urlfilter.db.driver");
+    String acolumn = column != null ? column : conf.get("urlfilter.db.column");
+    String atable = table != null ? table : conf.get("urlfilter.db.table");
+    String aconnection =
+      connection != null ? connection : conf.get("urlfilter.db.connection");
+    int aelements =
+      elements != 0 ? elements : conf.getInt("urlfilter.db.elements", 100000);
+
+    // The connection string may carry user/password, so it is not logged.
+    LOG.info("urlfilter-db: new instance, driver=" + adriver + " column="
+             + acolumn + " table=" + atable + " elements=" + aelements);
+    if (adriver == null || acolumn == null || atable == null
+        || aconnection == null) {
+      // fail with a clear message instead of an NPE from Class.forName(null)
+      throw new IllegalStateException("urlfilter-db: driver, column, table and"
+        + " connection must be set in plugin.xml or as urlfilter.db.* properties");
+    }
+
+    SQL = "SELECT " + acolumn + " FROM " + atable + " WHERE " + acolumn + " LIKE ";
+    domainCache = new HybridCache();
+    domainCache.setSize(aelements);          // number of elements the cache holds
+    Class.forName(adriver).newInstance();    // load the JDBC driver class
+    conn = DriverManager.getConnection(aconnection);
+    stmt = conn.createStatement();
+  }
+
+  /** Returns the url unchanged when exists(url) is true, otherwise null. */
+  public synchronized String filter(String url) {
+    final boolean allowed = exists(url);
+    if (!allowed) {
+      LOG.info("urlfilter-db: not allowed " + url);
+      return null;   // assume no go
+    }
+    LOG.info("urlfilter-db: allowed " + url);
+    return url;
+  }
+
+  /**
+   * Reduces a url to "protocol://host" using java.net.URL.
+   * Returns null when the url cannot be parsed.
+   */
+  protected String getDomain(String url) {
+    try {
+      final URL parsed = new URL(url);
+      return parsed.getProtocol() + "://" + parsed.getHost();
+    } catch (java.net.MalformedURLException ignored) {
+      return null;   // unparseable input
+    }
+  }
+
+  /**
+   * Computes the digest of a domain string by delegating to
+   * MD5Hash.digest.
+   */
+  protected MD5Hash getHash(String domain) {
+    final MD5Hash digest = MD5Hash.digest(domain);
+    return digest;
+  }
+
+    /**
+     * Tells whether the domain of a url is known, consulting the cache first
+     * and the database (through loadvObj) second.
+     * The url must look like protocol://domain.tld, with or without trailing
+     * directories, file names or query info.
+     *
+     * NOTE(review): the cache key is built from the int hashCode() of the
+     * digest rather than from the digest itself - confirm that key
+     * collisions between different domains are acceptable.
+     *
+     * @param url the url to check
+     * @return true if the domain is in the cache or loadvObj reports it,
+     *         false otherwise (including when the url cannot be parsed)
+     */
+    public boolean exists(String url)
+    {
+        // reduce the url to its protocol + host part
+        final String domain = getDomain(url);
+        if (domain == null) {
+            return false;   // not a legal url
+        }
+
+        // digest of the domain; its hash code forms the cache key
+        final MD5Hash id = getHash(domain);
+        if (id == null) {
+            return false;   // something bad happened
+        }
+
+        // first, attempt to load from cache
+        final String cacheKey = "T"+id.hashCode();
+        LOG.info("Looking in cache for: "+cacheKey);
+        final String cached = (String) domainCache.get(cacheKey);
+        if (cached != null) {
+            return true;    // cache hit
+        }
+
+        // cache miss: ask the database
+        LOG.info("Looking in database for: "+domain);
+        final boolean inDatabase = loadvObj(domain);
+        // report the outcome of the database lookup
+        if (inDatabase) {
+            LOG.info("Found in database: "+domain);
+        } else {
+            LOG.info("Not in database: "+domain);
+        }
+        return inDatabase;
+    }
+
+    /**
+     * Looks a domain up in the database and, when a row matches, records it in
+     * the cache under the key that exists() reads ("T" + hash code of the
+     * digest of the domain).
+     * The domain is bound as a statement parameter instead of being pasted
+     * into the SQL text, because it is derived from the url being filtered and
+     * may contain quotes. A match is detected with next(), as first() is not
+     * permitted on a forward-only result set; no column value is read.
+     *
+     * @param domain the protocol://host string to look for
+     * @return true if the query matched a row, false if it did not or failed
+     */
+    protected boolean loadvObj(String domain)
+    {
+        boolean found = false;
+        PreparedStatement query = null;
+        ResultSet rs = null;
+
+        try {
+            // SQL ends with "LIKE ", so "?" completes it as a parameter marker
+            query = conn.prepareStatement(SQL + "?");
+            query.setString(1, domain);
+            rs = query.executeQuery();
+            found = rs.next();
+        } catch (java.sql.SQLException se) {
+            LOG.warning("error executing statement: "+SQL+domain+": "+se);
+        } finally {
+            if (rs != null) {
+                try { rs.close(); } catch (SQLException sqlEx) { /* ignore */ }
+            }
+            if (query != null) {
+                try { query.close(); } catch (SQLException sqlEx) { /* ignore */ }
+            }
+        }
+
+        if (!found) {
+            return false;
+        }
+
+        try {
+            // put it in the cache
+            String key = "T"+getHash(domain).hashCode();
+            domainCache.put(key, "T");
+            LOG.info("Success rntering value "+key+" to Cache for: "+domain);
+        } catch (java.lang.Exception e) {
+            // a cache failure must not hide that the domain is in the database
+            LOG.info("Can not put url in Cache: "+domain);
+        }
+        return true;
+    }
+
+  /** Smoke test: constructs one filter, which runs the constructor's setup. */
+  public static void main(String args[])
+    throws IOException, ClassNotFoundException,
+      InstantiationException, IllegalAccessException, SQLException {
+    new DbURLFilter();
+  }
+}
Index: src/plugin/urlfilter-db/src/java/org/apache/nutch/net/package.html
===================================================================
--- src/plugin/urlfilter-db/src/java/org/apache/nutch/net/package.html	(revision 0)
+++ src/plugin/urlfilter-db/src/java/org/apache/nutch/net/package.html	(revision 0)
@@ -0,0 +1,5 @@
+<html>
+<body>
+<p>A database url filter plugin.</p><p></p>
+</body>
+</html>
Index: src/plugin/urlfilter-db/plugin.xml
===================================================================
--- src/plugin/urlfilter-db/plugin.xml	(revision 0)
+++ src/plugin/urlfilter-db/plugin.xml	(revision 0)
@@ -0,0 +1,25 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<plugin
+   id="urlfilter-db"
+   name="DB URL Filter"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+   <runtime>
+      <library name="urlfilter-db.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+   </requires>
+
+   <extension id="org.apache.nutch.net.urlfiler"
+              name="Nutch DB URL Filter"
+              point="org.apache.nutch.net.URLFilter">
+      <implementation id="DbURLFilter"
+                      class="org.apache.nutch.net.DbURLFilter"/>
+   </extension>
+
+</plugin>
Index: src/plugin/urlfilter-db/build.xml
===================================================================
--- src/plugin/urlfilter-db/build.xml	(revision 0)
+++ src/plugin/urlfilter-db/build.xml	(revision 0)
@@ -0,0 +1,7 @@
+<?xml version="1.0"?>
+
+<project name="urlfilter-db" default="jar">
+
+  <import file="../build-plugin.xml"/>
+
+</project>
Index: src/plugin/build.xml
===================================================================
--- src/plugin/build.xml	(revision 307374)
+++ src/plugin/build.xml	(working copy)
@@ -30,6 +30,7 @@
      <ant dir="query-site" target="deploy"/>
      <ant dir="query-url" target="deploy"/>
      <ant dir="urlfilter-regex" target="deploy"/>
+     <ant dir="urlfilter-db" target="deploy"/>
      <ant dir="urlfilter-prefix" target="deploy"/>
      <ant dir="creativecommons" target="deploy"/>
      <ant dir="languageidentifier" target="deploy"/>
@@ -84,6 +85,7 @@
     <ant dir="query-site" target="clean"/>
     <ant dir="query-url" target="clean"/>
     <ant dir="urlfilter-regex" target="clean"/>
+    <ant dir="urlfilter-db" target="clean"/>
     <ant dir="urlfilter-prefix" target="clean"/>
     <ant dir="creativecommons" target="clean"/>
     <ant dir="languageidentifier" target="clean"/>
