Index: conf/nutch-default.xml
===================================================================
--- conf/nutch-default.xml (revision 1235719)
+++ conf/nutch-default.xml (working copy)
@@ -794,7 +794,7 @@
-
+
moreIndexingFilter.indexMimeTypeParts
@@ -805,6 +805,15 @@
+
+ moreIndexingFilter.mimeTypeSource
+
+ Determines the MetaData source to read the MIME-type from. Valid values are
+ parsemeta and contentmeta. If no value is set, the filter first attempts to read from
+ parsemeta and then falls back to contentmeta. This is the default behavior.
+
+
+
Index: src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
===================================================================
--- src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java (revision 1235719)
+++ src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java (working copy)
@@ -186,7 +186,20 @@
*/
private NutchDocument addType(NutchDocument doc, ParseData data, String url) {
String mimeType = null;
- String contentType = data.getMeta(Response.CONTENT_TYPE);
+ String contentType = null;
+
+ // Select the source of the content-type
+ if (conf.get("moreIndexingFilter.mimeTypeSource", "").equals("parsemeta")) {
+ // Read from ParseMeta only
+ contentType = data.getParseMeta().get(Response.CONTENT_TYPE);
+ } else if (conf.get("moreIndexingFilter.mimeTypeSource", "").equals("contentmeta")) {
+ // Read from ContentMeta only
+ contentType = data.getContentMeta().get(Response.CONTENT_TYPE);
+ } else {
+ // Read from ParseMeta first and fall back to ContentMeta
+ contentType = data.getMeta(Response.CONTENT_TYPE);
+ }
+
if (contentType == null) {
// Note by Jerome Charron on 20050415:
// Content Type not solved by a previous plugin