Index: ivy/ivy.xml
===================================================================
--- ivy/ivy.xml (revision 1350539)
+++ ivy/ivy.xml (working copy)
@@ -55,7 +55,7 @@
-
+
Index: CHANGES.txt
===================================================================
--- CHANGES.txt (revision 1350539)
+++ CHANGES.txt (working copy)
@@ -3,6 +3,8 @@
Release 2.0 (08/06/2012) ddmmyyy
Full Jira report - https://issues.apache.org/jira/secure/ReleaseNote.jspa?projectId=10680&version=12314893
+* NUTCH-1396 Upgrade to Tika 1.1 (jnioche)
+
* NUTCH-1392 -force and -resume arguments being ignored in ParserJob (ferdy via lewismc)
* NUTCH-1379 NPE when reprUrl is null in ParseUtil (ferdy)
Index: src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
===================================================================
--- src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java (revision 1350539)
+++ src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java (working copy)
@@ -242,13 +242,13 @@
page.setBaseUrl(new Utf8(url));
page.setContent(ByteBuffer.wrap(bytes));
MimeUtil mimeutil = new MimeUtil(conf);
- MimeType mtype = mimeutil.getMimeType(file);
- page.setContentType(new Utf8(mtype.getName()));
+ String mtype = mimeutil.getMimeType(file);
+ page.setContentType(new Utf8(mtype));
// Parse parse = parser.getParse(url, page);
Parse parse = new ParseUtil(conf).parse(url, page);
- System.out.println("content type: " + mtype.getName());
+ System.out.println("content type: " + mtype);
System.out.println("title: " + parse.getTitle());
System.out.println("text: " + parse.getText());
System.out.println("outlinks: " + Arrays.toString(parse.getOutlinks()));
Index: src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestOOParser.java
===================================================================
--- src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestOOParser.java (revision 1350539)
+++ src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestOOParser.java (working copy)
@@ -106,8 +106,8 @@
WebPage page = new WebPage();
page.setBaseUrl(new Utf8(urlString));
page.setContent(ByteBuffer.wrap(bytes));
- MimeType mtype = mimeutil.getMimeType(file);
- page.setContentType(new Utf8(mtype.getName()));
+ String mtype = mimeutil.getMimeType(file);
+ page.setContentType(new Utf8(mtype));
parse = new ParseUtil(conf).parse(urlString, page);
Index: src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRSSParser.java
===================================================================
--- src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRSSParser.java (revision 1350539)
+++ src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRSSParser.java (working copy)
@@ -102,8 +102,8 @@
WebPage page = new WebPage();
page.setBaseUrl(new Utf8(urlString));
page.setContent(ByteBuffer.wrap(bytes));
- MimeType mtype = mimeutil.getMimeType(file);
- page.setContentType(new Utf8(mtype.getName()));
+ String mtype = mimeutil.getMimeType(file);
+ page.setContentType(new Utf8(mtype));
parse = new ParseUtil(conf).parse(urlString, page);
Index: src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestPdfParser.java
===================================================================
--- src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestPdfParser.java (revision 1350539)
+++ src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestPdfParser.java (working copy)
@@ -81,8 +81,8 @@
WebPage page = new WebPage();
page.setBaseUrl(new Utf8(urlString));
page.setContent(ByteBuffer.wrap(bytes));
- MimeType mtype = mimeutil.getMimeType(file);
- page.setContentType(new Utf8(mtype.getName()));
+ String mtype = mimeutil.getMimeType(file);
+ page.setContentType(new Utf8(mtype));
parse = new ParseUtil(conf).parse(urlString, page);
Index: src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestMSWordParser.java
===================================================================
--- src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestMSWordParser.java (revision 1350539)
+++ src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestMSWordParser.java (working copy)
@@ -82,8 +82,8 @@
page.setContent(ByteBuffer.wrap(bytes));
// set the content type?
MimeUtil mimeutil = new MimeUtil(conf);
- MimeType mtype = mimeutil.getMimeType(file);
- page.setContentType(new Utf8(mtype.getName()));
+ String mtype = mimeutil.getMimeType(file);
+ page.setContentType(new Utf8(mtype));
parse = new ParseUtil(conf).parse("file:"+urlString, page);
return parse.getText();
Index: src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
===================================================================
--- src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java (revision 1350539)
+++ src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java (working copy)
@@ -17,16 +17,12 @@
package org.apache.nutch.indexer.more;
import java.text.ParseException;
-import java.text.SimpleDateFormat;
import java.util.Collection;
import java.util.Date;
import java.util.HashSet;
-import java.util.TimeZone;
import org.apache.avro.util.Utf8;
import org.apache.commons.lang.time.DateUtils;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.indexer.IndexingException;
import org.apache.nutch.indexer.IndexingFilter;
@@ -42,8 +38,9 @@
import org.apache.oro.text.regex.Perl5Compiler;
import org.apache.oro.text.regex.Perl5Matcher;
import org.apache.oro.text.regex.Perl5Pattern;
-import org.apache.tika.mime.MimeType;
import org.apache.solr.common.util.DateUtil;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
/**
* Add (or reset) a few metaData properties as respective fields (if they are
@@ -170,7 +167,7 @@
* @return
*/
private NutchDocument addType(NutchDocument doc, WebPage page, String url) {
- MimeType mimeType = null;
+ String mimeType = null;
Utf8 contentType = page.getFromHeaders(new Utf8(HttpHeaders.CONTENT_TYPE));
if (contentType == null) {
// Note by Jerome Charron on 20050415:
@@ -194,9 +191,9 @@
return doc;
}
- String scontentType = mimeType.getName();
+ //String scontentType = mimeType.getName();
- doc.add("type", scontentType);
+ doc.add("type", mimeType);
// Check if we need to split the content type in sub parts
if (conf.getBoolean("moreIndexingFilter.indexMimeTypeParts", true)) {
Index: src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java
===================================================================
--- src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java (revision 1350539)
+++ src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java (working copy)
@@ -209,8 +209,8 @@
headers.set(Response.LAST_MODIFIED, HttpDateFormat.toString(f
.lastModified()));
- MimeType mimeType = MIME.getMimeType(f);
- String mimeTypeString = mimeType != null ? mimeType.getName() : "";
+ String mimeType = MIME.getMimeType(f);
+ String mimeTypeString = mimeType != null ? mimeType.toString() : "";
headers.set(Response.CONTENT_TYPE, mimeTypeString);
// response code
Index: src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java
===================================================================
--- src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java (revision 1350539)
+++ src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java (working copy)
@@ -73,8 +73,8 @@
page.setBaseUrl(new Utf8(url));
page.setContent(ByteBuffer.wrap(bytes));
MimeUtil mimeutil = new MimeUtil(conf);
- MimeType mtype = mimeutil.getMimeType(file);
- page.setContentType(new Utf8(mtype.getName()));
+ String mtype = mimeutil.getMimeType(file);
+ page.setContentType(new Utf8(mtype));
new ParseUtil(conf).parse(url, page);
Index: src/java/org/apache/nutch/util/MimeUtil.java
===================================================================
--- src/java/org/apache/nutch/util/MimeUtil.java (revision 1350539)
+++ src/java/org/apache/nutch/util/MimeUtil.java (working copy)
@@ -24,13 +24,19 @@
import org.apache.hadoop.conf.Configuration;
// Tika imports
+import org.apache.tika.Tika;
import org.apache.tika.mime.MimeType;
import org.apache.tika.mime.MimeTypeException;
import org.apache.tika.mime.MimeTypes;
import org.apache.tika.mime.MimeTypesFactory;
+
+// Slf4j logging imports
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+// imported for Javadoc
+import org.apache.nutch.protocol.ProtocolOutput;
+
/**
* @author mattmann
* @since NUTCH-608
@@ -49,6 +55,9 @@
/* our Tika mime type registry */
private MimeTypes mimeTypes;
+ /* the Tika instance used for mime type detection */
+ private Tika tika;
+
/* whether or not magic should be employed or not */
private boolean mimeMagic;
@@ -56,6 +65,7 @@
private static final Logger LOG = LoggerFactory.getLogger(MimeUtil.class.getName());
public MimeUtil(Configuration conf) {
+ tika = new Tika();
ObjectCache objectCache = ObjectCache.get(conf);
MimeTypes mimeTypez = (MimeTypes) objectCache.getObject(MimeTypes.class
.getName());
@@ -118,7 +128,7 @@
* typeName
is cleaned, with {@link #cleanMimeType(String)}.
* Then the cleaned mime type is looked up in the underlying Tika
* {@link MimeTypes} registry, by its cleaned name. If the {@link MimeType} is
- * found, then that mime type is used, otherwise {@link URL} resolution is
+ * found, then that mime type is used, otherwise URL resolution is
* used to try and determine the mime type. If that means is unsuccessful, and
* if mime.type.magic
is enabled in {@link NutchConfiguration},
* then mime type magic resolution is used to try and obtain a
@@ -127,12 +137,14 @@
* @param typeName
* The original mime type, returned from a {@link ProtocolOutput}.
* @param url
- * The given {@link URL}, that Nutch was trying to crawl.
+ * The given URL that Nutch was trying to crawl.
* @param data
* The byte data, returned from the crawl, if any.
* @return The correctly, automatically guessed {@link MimeType} name.
*/
public String autoResolveContentType(String typeName, String url, byte[] data) {
+ String retType = null;
+ String magicType = null;
MimeType type = null;
String cleanedMimeType = null;
@@ -161,59 +173,65 @@
.getMimeType(url) : type;
}
+ retType= type.getName();
+
// if magic is enabled use mime magic to guess if the mime type returned
// from the magic guess is different than the one that's already set so far
// if it is, and it's not the default mime type, then go with the mime type
// returned by the magic
if (this.mimeMagic) {
- MimeType magicType = this.mimeTypes.getMimeType(data);
- if (magicType != null && !magicType.getName().equals(MimeTypes.OCTET_STREAM)
- && !magicType.getName().equals(MimeTypes.PLAIN_TEXT)
- && type != null && !type.getName().equals(magicType.getName())) {
+ magicType = tika.detect(data);
+
+ // MimeTypes.getMimeType(byte[]) is deprecated in Tika 1.0; see https://issues.apache.org/jira/browse/NUTCH-1230
+ //MimeType magicType = this.mimeTypes.getMimeType(data);
+ if (magicType != null && !magicType.equals(MimeTypes.OCTET_STREAM)
+ && !magicType.equals(MimeTypes.PLAIN_TEXT)
+ && retType != null && !retType.equals(magicType)) {
+
// If magic enabled and the current mime type differs from that of the
// one returned from the magic, take the magic mimeType
- type = magicType;
+ retType = magicType;
}
// if type is STILL null after all the resolution strategies, go for the
// default type
- if (type == null) {
+ if (retType == null) {
try {
- type = this.mimeTypes.forName(MimeTypes.OCTET_STREAM);
+ retType = MimeTypes.OCTET_STREAM;
} catch (Exception ignore) {
}
}
}
- return type.getName();
+ return retType;
}
/**
* Facade interface to Tika's underlying {@link MimeTypes#getMimeType(String)}
* method.
- *
+ *
* @param url
* A string representation of the document {@link URL} to sense the
* {@link MimeType} for.
* @return An appropriate {@link MimeType}, identified from the given
* Document url in string form.
*/
- public MimeType getMimeType(String url) {
- return this.mimeTypes.getMimeType(url);
+ public String getMimeType(String url) {
+ return tika.detect(url);
}
/**
* A facade interface to Tika's underlying {@link MimeTypes#forName(String)}
* method.
- *
+ *
* @param name
* The name of a valid {@link MimeType} in the Tika mime registry.
* @return The object representation of the {@link MimeType}, if it exists,
* or null otherwise.
*/
- public MimeType forName(String name) {
+ public String forName(String name) {
try {
- return this.mimeTypes.forName(name);
+ return this.mimeTypes.forName(name).toString();
} catch (MimeTypeException e) {
LOG.error("Exception getting mime type by name: [" + name
+ "]: Message: " + e.getMessage());
@@ -224,14 +242,21 @@
/**
* Facade interface to Tika's underlying {@link MimeTypes#getMimeType(File)}
* method.
- *
+ *
* @param f
* The {@link File} to sense the {@link MimeType} for.
* @return The {@link MimeType} of the given {@link File}, or null if it
* cannot be determined.
*/
- public MimeType getMimeType(File f) {
- return this.mimeTypes.getMimeType(f);
+ public String getMimeType(File f) {
+ try {
+ return tika.detect(f);
+ } catch (Exception e) {
+ LOG.error("Exception getting mime type for file: [" + f.getPath()
+ + "]: Message: " + e.getMessage());
+ return null;
+ }
}
+
}