diff --git build.xml build.xml
index 269d460..0aba78b 100644
--- build.xml
+++ build.xml
@@ -178,6 +178,7 @@
+
@@ -584,6 +585,7 @@
+
@@ -969,6 +971,8 @@
+
+
diff --git conf/mimetype-filter.txt conf/mimetype-filter.txt
new file mode 100644
index 0000000..daf4593
--- /dev/null
+++ conf/mimetype-filter.txt
@@ -0,0 +1,20 @@
+# config file for mimetype-filter plugin
+
+# This plugin can be configured to work in one of two modes (similar to
+# suffix-urlfilter)
+
+# default to reject ('-'): in this mode, all documents will be rejected except
+# for those specified in this configuration file.
+
+# default to accept ('+'): in this mode, all documents will be accepted except
+# for those specified in this configuration file.
+
+# The format of this config file is one mimetype (or mimetype prefix, such as
+# "image") per line, with no preceding whitespace. The order in which they are
+# specified doesn't matter. Blank lines and comments (#) are allowed.
+
+# block everything
+-
+
+# allow only documents with a text/html mimetype
+text/html
diff --git conf/nutch-default.xml conf/nutch-default.xml
index a9a1956..77c20ff 100644
--- conf/nutch-default.xml
+++ conf/nutch-default.xml
@@ -1603,4 +1603,15 @@
Whether to support multivalued headings.
+
+
+
+ mimetype.filter.file
+ mimetype-filter.txt
+
+ The configuration file for the mimetype-filter plugin. This file contains
+ the rules used to allow or deny the indexing of certain documents.
+
+
+
diff --git default.properties default.properties
index f67e954..b17893c 100644
--- default.properties
+++ default.properties
@@ -148,6 +148,7 @@ plugins.index=\
org.apache.nutch.indexer.basic*:\
org.apache.nutch.indexer.feed*:\
org.apache.nutch.indexer.geoip*:\
+ org.apache.nutch.indexer.filter*:\
org.apache.nutch.indexer.metadata*:\
org.apache.nutch.indexer.more*:\
org.apache.nutch.indexer.static*:\
diff --git src/plugin/build.xml src/plugin/build.xml
index 7c6a5bc..d6ff755 100644
--- src/plugin/build.xml
+++ src/plugin/build.xml
@@ -36,6 +36,7 @@
+
@@ -89,6 +90,7 @@
+
@@ -127,10 +129,11 @@
-
+
+
diff --git src/plugin/mimetype-filter/build.xml src/plugin/mimetype-filter/build.xml
new file mode 100644
index 0000000..522a009
--- /dev/null
+++ src/plugin/mimetype-filter/build.xml
@@ -0,0 +1,28 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git src/plugin/mimetype-filter/ivy.xml src/plugin/mimetype-filter/ivy.xml
new file mode 100644
index 0000000..0a363f7
--- /dev/null
+++ src/plugin/mimetype-filter/ivy.xml
@@ -0,0 +1,41 @@
+
+
+
+
+
+
+
+
+
+ Apache Nutch
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git src/plugin/mimetype-filter/plugin.xml src/plugin/mimetype-filter/plugin.xml
new file mode 100644
index 0000000..d038447
--- /dev/null
+++ src/plugin/mimetype-filter/plugin.xml
@@ -0,0 +1,37 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git src/plugin/mimetype-filter/sample/allow-images.txt src/plugin/mimetype-filter/sample/allow-images.txt
new file mode 100644
index 0000000..0f5f136
--- /dev/null
+++ src/plugin/mimetype-filter/sample/allow-images.txt
@@ -0,0 +1,34 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This filter can be configured to work in one of two modes (similar to
+# suffix-url-filter)
+
+# default to reject ('-'): in this mode, only documents with a mimetype that
+# match the ones specified in the config file will be accepted, all other
+# mimetypes will be rejected.
+
+# default to accept ('+'): in this mode, only documents with a mimetype
+# that match the ones specified in the config file will be rejected,
+# all other mimetypes will be accepted.
+
+# The format of this config file is one mimetype (or mimetype prefix, such as
+# "image") per line, with no preceding whitespace. The order in which they are
+# specified doesn't matter. Blank lines and comments (#) are allowed.
+#
+
+-
+
+image
diff --git src/plugin/mimetype-filter/sample/block-html.txt src/plugin/mimetype-filter/sample/block-html.txt
new file mode 100644
index 0000000..69600ec
--- /dev/null
+++ src/plugin/mimetype-filter/sample/block-html.txt
@@ -0,0 +1,34 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This filter can be configured to work in one of two modes (similar to
+# suffix-url-filter)
+
+# default to reject ('-'): in this mode, only documents with a mimetype that
+# match the ones specified in the config file will be accepted, all other
+# mimetypes will be rejected.
+
+# default to accept ('+'): in this mode, only documents with a mimetype
+# that match the ones specified in the config file will be rejected,
+# all other mimetypes will be accepted.
+
+# The format of this config file is one mimetype (or mimetype prefix, such as
+# "image") per line, with no preceding whitespace. The order in which they are
+# specified doesn't matter. Blank lines and comments (#) are allowed.
+#
+
++
+
+text/html
\ No newline at end of file
diff --git src/plugin/mimetype-filter/src/java/org/apache/nutch/indexer/filter/MimeTypeIndexingFilter.java src/plugin/mimetype-filter/src/java/org/apache/nutch/indexer/filter/MimeTypeIndexingFilter.java
new file mode 100644
index 0000000..63d43d5
--- /dev/null
+++ src/plugin/mimetype-filter/src/java/org/apache/nutch/indexer/filter/MimeTypeIndexingFilter.java
@@ -0,0 +1,191 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.indexer.filter;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+// Nutch imports
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.Writable;
+
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+
+import org.apache.nutch.indexer.IndexingException;
+import org.apache.nutch.indexer.IndexingFilter;
+import org.apache.nutch.indexer.NutchDocument;
+
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.net.protocols.Response;
+
+import org.apache.nutch.util.MimeUtil;
+import org.apache.nutch.util.PrefixStringMatcher;
+import org.apache.nutch.util.TrieStringMatcher;
+import org.apache.tika.Tika;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.Reader;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * An {@link org.apache.nutch.indexer.IndexingFilter} that keeps or discards
+ * documents based on their MIME type, taken from the Content-Type metadata
+ * or, when that is absent, guessed by Tika from the URL.
+ */
+public class MimeTypeIndexingFilter implements IndexingFilter {
+ // Name of the configuration property that points to the rule file.
+ public static final String MIMEFILTER_REGEX_FILE = "mimetype.filter.file";
+
+ private static final Logger LOG = LoggerFactory
+ .getLogger(MimeTypeIndexingFilter.class);
+ // MIME resolves a Content-Type value to a MIME type; tika guesses from a URL.
+ private MimeUtil MIME;
+ private Tika tika = new Tika();
+ // Matcher over the configured rules; null until a rule file has been read.
+ private TrieStringMatcher trie;
+
+ private Configuration conf;
+ // true: listed types are blocked ('+'); false: only listed types pass ('-').
+ private boolean acceptMode = true;
+
+ // Inherited JavaDoc
+ @Override
+ public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
+ CrawlDatum datum, Inlinks inlinks) throws IndexingException {
+
+ String mimeType;
+ String contentType;
+
+ Writable tcontentType = datum.getMetaData()
+ .get(new Text(Response.CONTENT_TYPE));
+
+ if (tcontentType != null) {
+ contentType = tcontentType.toString();
+ } else {
+ contentType = parse.getData().getMeta(Response.CONTENT_TYPE);
+ }
+
+ if (contentType == null) {
+ mimeType = tika.detect(url.toString());
+ } else {
+ mimeType = MIME.forName(MimeUtil.cleanMimeType(contentType));
+ }
+
+ contentType = mimeType;
+
+ if (LOG.isInfoEnabled()) {
+ LOG.info(String.format("[%s] %s", contentType, url));
+ }
+
+ if (null != trie) {
+ if (trie.shortestMatch(contentType) == null) {
+ // no match, but
+ if (acceptMode) {
+ return doc;
+ }
+ return null;
+ } else {
+ // matched, but we are blocking
+ if (acceptMode) {
+ return null;
+ }
+ }
+ }
+
+ return doc;
+ }
+
+ /*
+ * -----------------------------
+ * *
+ * -----------------------------
+ */
+ @Override
+ public void setConf(Configuration conf) {
+ this.conf = conf;
+ MIME = new MimeUtil(conf);
+
+ // load the file of the values
+ String file = conf.get(MIMEFILTER_REGEX_FILE, "");
+
+ if (file != null) {
+ if (file.isEmpty()) {
+ LOG.warn(String
+ .format("Missing %s property, ALL mimetypes will be allowed",
+ MIMEFILTER_REGEX_FILE));
+ } else {
+ Reader reader = conf.getConfResourceAsReader(file);
+
+ try {
+ readConfiguration(reader);
+ } catch (IOException e) {
+ if (LOG.isErrorEnabled()) {
+ LOG.error(e.getMessage());
+ }
+
+ throw new RuntimeException(e.getMessage(), e);
+ }
+ }
+ }
+ }
+
+  /**
+   * Parses the rule file: a line starting with '+' or '-' selects accept or
+   * reject mode, lines starting with '#' or whitespace are skipped, and every
+   * other non-empty line becomes a MIME type prefix rule.
+   *
+   * @param reader source of the rules; it is not closed by this method
+   * @throws IOException if reading from {@code reader} fails
+   */
+  private void readConfiguration(Reader reader) throws IOException {
+    BufferedReader in = new BufferedReader(reader);
+    List<String> rules = new ArrayList<String>();
+
+    String line;
+    while (null != (line = in.readLine())) {
+      // blank, indented and comment lines carry no rule
+      if (line.isEmpty() || line.charAt(0) == '#'
+          || Character.isWhitespace(line.charAt(0))) {
+        continue;
+      }
+      char first = line.charAt(0);
+      if (first == '+' || first == '-') {
+        acceptMode = (first == '+');
+      } else {
+        // trailing blanks would otherwise make the rule impossible to match
+        rules.add(line.trim());
+      }
+    }
+
+    trie = new PrefixStringMatcher(rules);
+  }
+
+  /**
+   * @return the configuration last passed to
+   *         {@link #setConf(Configuration)}, or {@code null} if none was set
+   */
+  @Override
+  public Configuration getConf() {
+    return conf;
+  }
+}
+
diff --git src/plugin/mimetype-filter/src/test/org/apache/nutch/indexer/filter/MimeTypeIndexingFilterTest.java src/plugin/mimetype-filter/src/test/org/apache/nutch/indexer/filter/MimeTypeIndexingFilterTest.java
new file mode 100644
index 0000000..bca230f
--- /dev/null
+++ src/plugin/mimetype-filter/src/test/org/apache/nutch/indexer/filter/MimeTypeIndexingFilterTest.java
@@ -0,0 +1,114 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.indexer.filter;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.parse.ParseImpl;
+import org.apache.nutch.parse.ParseStatus;
+import org.apache.nutch.util.NutchConfiguration;
+
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+/**
+ * JUnit based tests of class
+ * {@link org.apache.nutch.indexer.filter.MimeTypeIndexingFilter}
+ *
+ */
+public class MimeTypeIndexingFilterTest {
+
+ private Configuration conf = NutchConfiguration.create();
+ private MimeTypeIndexingFilter filter = new MimeTypeIndexingFilter();
+ private String[] MIME_TYPES = { "text/html", "image/png", "application/pdf" };
+ private ParseImpl[] parses = new ParseImpl[MIME_TYPES.length];
+ private String sampleDir = System.getProperty("test.data", ".");
+
+ @Before
+ public void setUp() throws Exception {
+ for (int i = 0; i < MIME_TYPES.length; i++) {
+ Metadata metadata = new Metadata();
+ metadata.add(Response.CONTENT_TYPE, MIME_TYPES[i]);
+
+ ParseImpl parse = new ParseImpl("text",
+ new ParseData(new ParseStatus(), "title", new Outlink[0], metadata));
+
+ parses[i] = parse;
+ }
+ }
+
+  /** Without a configured rule file the filter must let every document pass. */
+  @Test
+  public void testMissingConfigFile() throws Exception {
+    String file = conf.get(MimeTypeIndexingFilter.MIMEFILTER_REGEX_FILE, "");
+    Assert.assertEquals(String
+        .format("Property %s must not be present in the configuration file",
+            MimeTypeIndexingFilter.MIMEFILTER_REGEX_FILE), "", file);
+
+    filter.setConf(conf);
+
+    // no rules were loaded, so each of the sample parses must be kept
+    for (ParseImpl parse : parses) {
+      NutchDocument doc = filter.filter(new NutchDocument(), parse,
+          new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());
+      Assert.assertNotNull("All documents must be allowed by default", doc);
+    }
+  }
+
+ @Test
+ public void testAllowOnlyImages() throws Exception {
+ conf.set(MimeTypeIndexingFilter.MIMEFILTER_REGEX_FILE, "allow-images.txt");
+ filter.setConf(conf);
+
+ for (int i = 0; i < parses.length; i++) {
+ NutchDocument doc = filter.filter(new NutchDocument(), parses[i],
+ new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());
+
+ if (MIME_TYPES[i].contains("image")) {
+ Assert.assertNotNull("Allow only images", doc);
+ } else {
+ Assert.assertNull("Block everything else", doc);
+ }
+ }
+ }
+
+ @Test
+ public void testBlockHTML() throws Exception {
+ conf.set(MimeTypeIndexingFilter.MIMEFILTER_REGEX_FILE, "block-html.txt");
+ filter.setConf(conf);
+
+ for (int i = 0; i < parses.length; i++) {
+ NutchDocument doc = filter.filter(new NutchDocument(), parses[i],
+ new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());
+
+ if (MIME_TYPES[i].contains("html")) {
+ Assert.assertNull("Block only HTML documents", doc);
+ } else {
+ Assert.assertNotNull("Allow everything else", doc);
+ }
+ }
+ }
+}