Index: CHANGES.txt
===================================================================
--- CHANGES.txt	(revision 620222)
+++ CHANGES.txt	(working copy)
@@ -202,6 +202,8 @@
 
 71. NUTCH-607 - Update build.xml to include tika jar when building war (kubes)
 
+72. NUTCH-608 - Upgrade nutch to use released apache-tika-0.1-incubating (mattmann)
+
 Release 0.9 - 2007-04-02
 
  1. Changed log4j confiquration to log to stdout on commandline
Index: lib/tika-0.1-dev.jar
===================================================================
Cannot display: file marked as binary type.
svn:mime-type = application/octet-stream
Index: lib/tika-0.1-incubating.jar
===================================================================
Cannot display: file marked as binary type.
svn:mime-type = application/octet-stream

Property changes on: lib/tika-0.1-incubating.jar
___________________________________________________________________
Name: svn:mime-type
   + application/octet-stream

Index: src/java/org/apache/nutch/parse/ParserFactory.java
===================================================================
--- src/java/org/apache/nutch/parse/ParserFactory.java	(revision 620222)
+++ src/java/org/apache/nutch/parse/ParserFactory.java	(working copy)
@@ -36,12 +36,9 @@
 import org.apache.nutch.plugin.PluginRuntimeException;
 import org.apache.nutch.plugin.PluginRepository;
 import org.apache.nutch.util.LogUtil;
+import org.apache.nutch.util.MimeUtil;
 import org.apache.nutch.util.ObjectCache;
 
-// Tika imports
-import org.apache.tika.mime.MimeType;
-import org.apache.tika.mime.MimeTypeException;
-
 
 /** Creates and caches {@link Parser} plugins.*/
 public final class ParserFactory {
@@ -222,15 +219,8 @@
     ObjectCache objectCache = ObjectCache.get(conf);
     // First of all, tries to clean the content-type
     String type = null;
-    try {
-      type = MimeType.clean(contentType);
-    } catch (MimeTypeException mte) {
-      if (LOG.isDebugEnabled()) {
-        LOG.debug("Could not clean the content-type [" + contentType +
-                  "], Reason is [" + mte + "]. Using its raw version...");
-      }
-      type = contentType;
-    }
+    type = MimeUtil.cleanMimeType(contentType);
+
 
     List<Extension> extensions = (List<Extension>) objectCache.getObject(type);
 
Index: src/java/org/apache/nutch/protocol/Content.java
===================================================================
--- src/java/org/apache/nutch/protocol/Content.java	(revision 620222)
+++ src/java/org/apache/nutch/protocol/Content.java	(working copy)
@@ -38,14 +38,9 @@
 
 //Nutch imports
 import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.util.MimeUtil;
 import org.apache.nutch.util.NutchConfiguration;
 
-//Tika imports
-import org.apache.tika.mime.MimeType;
-import org.apache.tika.mime.MimeTypeException;
-import org.apache.tika.mime.MimeUtils;
-
-
 public final class Content implements Writable{
 
   public static final String DIR_NAME = "content";
@@ -64,9 +59,7 @@
 
   private Metadata metadata;
 
-  private boolean mimeTypeMagic;
-
-  private static MimeUtils mimeTypes;
+  private static MimeUtil mimeTypes;
 
   public Content() {
     metadata = new Metadata();
@@ -88,9 +81,9 @@
     this.base = base;
     this.content = content;
     this.metadata = metadata;
-    this.mimeTypeMagic = conf.getBoolean("mime.type.magic", true);
+
     if(this.mimeTypes == null){
-      this.mimeTypes = new MimeUtils(conf.get("mime.types.file"), this.mimeTypeMagic);
+      this.mimeTypes = new MimeUtil(conf);
     }
     this.contentType = getContentType(contentType, url, content);
   }
@@ -289,41 +282,7 @@
   }
 
   private String getContentType(String typeName, String url, byte[] data) {
-    MimeType type = null;
-    String cleanedMimeType = null;
-
-    try {
-      cleanedMimeType = MimeType.clean(typeName);
-    } catch (MimeTypeException mte) {
-      // Seems to be a malformed mime type name...
-    }
-
-    // first try to get the type from the cleaned type name
-    type = cleanedMimeType != null ? this.mimeTypes.getRepository().forName(
-        cleanedMimeType) : null;
-
-    // if returned null, then try url resolution
-    if (type == null) {
-      // If no mime-type header, or cannot find a corresponding registered
-      // mime-type, then guess a mime-type from the url pattern
-      type = this.mimeTypes.getRepository().getMimeType(url) != null ? this.mimeTypes
-          .getRepository().getMimeType(url)
-          : type;
-    }
-
-    // if magic is enabled use mime magic to guess if the mime type returned
-    // from the magic guess is different than the one that's already set so far
-    // if it is, go with the mime type returned by the magic
-    if (this.mimeTypeMagic) {
-      MimeType magicType = this.mimeTypes.getRepository().getMimeType(data);
-      if (magicType != null && !type.getName().equals(magicType.getName())) {
-        // If magic enabled and the current mime type differs from that of the
-        // one returned from the magic, take the magic mimeType
-
-        type = magicType;
-      }
-    }
+    return this.mimeTypes.autoResolveContentType(typeName, url, data);
+  }
 
-    return type.getName();
-  }
 }
Index: src/java/org/apache/nutch/util/MimeUtil.java
===================================================================
--- src/java/org/apache/nutch/util/MimeUtil.java	(revision 0)
+++ src/java/org/apache/nutch/util/MimeUtil.java	(revision 0)
@@ -0,0 +1,212 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.util;
+
+// JDK imports
+import java.io.File;
+import java.util.logging.Logger;
+
+// Hadoop imports
+import org.apache.hadoop.conf.Configuration;
+
+// Tika imports
+import org.apache.tika.mime.MimeType;
+import org.apache.tika.mime.MimeTypeException;
+import org.apache.tika.mime.MimeTypes;
+import org.apache.tika.mime.MimeTypesFactory;
+
+/**
+ * @author mattmann
+ * @since NUTCH-608
+ * 
+ * <p>
+ * This is a facade class to insulate Nutch from its underlying Mime Type
+ * substrate library, <a href="http://incubator.apache.org/tika/">Apache Tika</a>.
+ * Any mime handling code should be placed in this utility class, and hidden
+ * from the Nutch classes that rely on it.
+ * </p>
+ */
+public final class MimeUtil {
+
+  private static final String SEPARATOR = ";";
+
+  /* our Tika mime type registry */
+  private MimeTypes mimeTypes;
+
+  /* whether or not mime magic should be employed */
+  private boolean mimeMagic;
+
+  /* our log stream */
+  private static final Logger LOG = Logger.getLogger(MimeUtil.class.getName());
+
+  public MimeUtil(Configuration conf) {
+    this.mimeTypes = MimeTypesFactory.create(conf
+        .getConfResourceAsInputStream(conf.get("mime.types.file")));
+    this.mimeMagic = conf.getBoolean("mime.type.magic", true);
+  }
+
+  /**
+   * Cleans a mime type string by extracting the bare {@link MimeType} name
+   * from a string of the form:
+   * 
+   * <pre>
+   *      &lt;primary type&gt;/&lt;sub type&gt; ; &lt;optional params&gt;
+   * </pre>
+   * 
+   * @param origType
+   *          The original mime type string to be cleaned; may be null.
+   * @return The <code>primary/sub</code> type name with any parameters and
+   *         surrounding whitespace removed, or null if origType is null.
+   */
+  public static String cleanMimeType(String origType) {
+    if (origType == null) {
+      return null;
+    }
+
+    // Cut at the first ';' by index rather than String.split(): split() drops
+    // trailing empty tokens, so "text/html;" used to keep its ';'.
+    int paramStart = origType.indexOf(SEPARATOR);
+    String bareType = paramStart < 0 ? origType
+        : origType.substring(0, paramStart);
+
+    // "text/html ; charset=UTF-8" must yield "text/html", not "text/html ".
+    return bareType.trim();
+  }
+
+  /**
+   * A facade for trying all the mime type resolution strategies available
+   * within Tika. First, the mime type provided in <code>typeName</code> is
+   * cleaned with {@link #cleanMimeType(String)} and looked up by name in the
+   * underlying Tika {@link MimeTypes} registry. If no {@link MimeType} is
+   * found that way, or only the default type is found, resolution by
+   * <code>url</code> is tried. Finally, if <code>mime.type.magic</code> is
+   * enabled and magic detection on <code>data</code> yields a non-default
+   * type that differs from the one resolved so far, the magic type wins. If
+   * nothing could be resolved at all, the name of the default type,
+   * {@link MimeTypes#DEFAULT}, is returned.
+   * 
+   * @param typeName
+   *          The original mime type, as reported by the protocol layer.
+   * @param url
+   *          The url, in string form, that Nutch was trying to crawl.
+   * @param data
+   *          The byte data, returned from the crawl, if any.
+   * @return The name of the automatically resolved {@link MimeType}; never null.
+   */
+  public String autoResolveContentType(String typeName, String url, byte[] data) {
+    MimeType type = null;
+
+    // first try to get the type from the cleaned type name; a null typeName
+    // (no content-type reported) simply skips this step
+    String cleanedMimeType = MimeUtil.cleanMimeType(typeName);
+    if (cleanedMimeType != null) {
+      try {
+        type = this.mimeTypes.forName(cleanedMimeType);
+      } catch (MimeTypeException mte) {
+        // Seems to be a malformed mime type name; fall through to the
+        // url-based resolution below.
+        type = null;
+      }
+    }
+
+    // if returned null, or if it's the default type then try url resolution
+    if (type == null || type.getName().equals(MimeTypes.DEFAULT)) {
+      // If no mime-type header, or cannot find a corresponding registered
+      // mime-type, then guess a mime-type from the url pattern. The lookup is
+      // done once, and the previous value is kept when nothing matches.
+      MimeType urlType = this.mimeTypes.getMimeType(url);
+      if (urlType != null) {
+        type = urlType;
+      }
+    }
+
+    // if magic is enabled use mime magic to guess if the mime type returned
+    // from the magic guess is different than the one that's already set so far
+    // if it is, and it's not the default mime type, then go with the mime type
+    // returned by the magic
+    if (this.mimeMagic) {
+      MimeType magicType = this.mimeTypes.getMimeType(data);
+      if (magicType != null && !magicType.getName().equals(MimeTypes.DEFAULT)
+          && (type == null || !type.getName().equals(magicType.getName()))) {
+        // Magic found a concrete type that either disagrees with the type
+        // resolved so far or is the only candidate (nothing was resolved
+        // by name or url), so take the magic mimeType.
+        type = magicType;
+      }
+    }
+
+    // If type is STILL null after all the resolution strategies, go for the
+    // default type. This fallback must run whether or not magic is enabled:
+    // when it lived inside the magic branch, a null type with
+    // mime.type.magic=false hit a NullPointerException on type.getName().
+    // MimeTypes.DEFAULT is the default type's name, so it is returned
+    // directly instead of looking the type up just to read its name back.
+    if (type == null) {
+      return MimeTypes.DEFAULT;
+    }
+
+    return type.getName();
+  }
+
+  /**
+   * Facade interface to Tika's underlying {@link MimeTypes#getMimeType(String)}
+   * method.
+   * 
+   * @param url
+   *          A string representation of the document {@link URL} to sense the
+   *          {@link MimeType} for.
+   * @return An appropriate {@link MimeType}, identified from the given
+   *         Document url in string form.
+   */
+  public MimeType getMimeType(String url) {
+    return this.mimeTypes.getMimeType(url);
+  }
+
+  /**
+   * A facade interface to Tika's underlying {@link MimeTypes#forName(String)}
+   * method.
+   * 
+   * @param name
+   *          The name of a valid {@link MimeType} in the Tika mime registry.
+   * @return The object representation of the {@link MimeType}, if it exists,
+   *         or null otherwise.
+   */
+  public MimeType forName(String name) {
+    try {
+      return this.mimeTypes.forName(name);
+    } catch (MimeTypeException e) {
+      LOG.warning("Exception getting mime type by name: [" + name
+          + "]: Message: " + e.getMessage());
+      return null;
+    }
+  }
+
+  /**
+   * Facade interface to Tika's underlying {@link MimeTypes#getMimeType(File)}
+   * method.
+   * 
+   * @param f
+   *          The {@link File} to sense the {@link MimeType} for.
+   * @return The {@link MimeType} of the given {@link File}, or null if it
+   *         cannot be determined.
+   */
+  public MimeType getMimeType(File f) {
+    return this.mimeTypes.getMimeType(f);
+  }
+
+}

Property changes on: src/java/org/apache/nutch/util/MimeUtil.java
___________________________________________________________________
Name: svn:eol-style
   + native

Index: src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
===================================================================
--- src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java	(revision 620222)
+++ src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java	(working copy)
@@ -24,8 +24,6 @@
 import org.apache.oro.text.regex.MatchResult;
 import org.apache.oro.text.regex.MalformedPatternException;
 import org.apache.tika.mime.MimeType;
-import org.apache.tika.mime.MimeTypeException;
-import org.apache.tika.mime.MimeUtils;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
@@ -46,6 +44,7 @@
 import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.crawl.Inlinks;
 import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.util.MimeUtil;
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.io.Text;
@@ -79,7 +78,7 @@
   private boolean MAGIC;
 
   /** Get the MimeTypes resolver instance. */
-  private static MimeUtils MIME; 
+  private static MimeUtil MIME; 
   
   public Document filter(Document doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
     throws IndexingException {
@@ -193,14 +192,9 @@
         // } else {
         //   contentType = MIME.getMimeType(url);
         // }
-        mimeType = MIME.getRepository().getMimeType(url);
+        mimeType = MIME.getMimeType(url);
     } else {
-        try {
-            mimeType = new MimeType(contentType);
-        } catch (MimeTypeException e) {
-            if (LOG.isWarnEnabled()) { LOG.warn(url + e.toString()); }
-            mimeType = null;
-        }
+            mimeType = MIME.forName(contentType);
     }
         
     // Checks if we solved the content-type.
@@ -209,8 +203,8 @@
     }
 
     contentType = mimeType.getName();
-    String primaryType = mimeType.getPrimaryType();
-    String subType = mimeType.getSubType();
+    String primaryType = contentType.split("/", 2)[0]; // "text" of "text/html"
+    String subType = contentType.substring(contentType.indexOf('/') + 1); // "html"
     // leave this for future improvement
     //MimeTypeParameterList parameterList = mimeType.getParameters()
 
@@ -279,9 +273,8 @@
 
   public void setConf(Configuration conf) {
     this.conf = conf;
-    MAGIC = conf.getBoolean("mime.type.magic", true);
-    if(MIME == null)
-      MIME = new MimeUtils(getConf().get("mime.types.file"), MAGIC);
+    if (MIME == null)
+      MIME = new MimeUtil(conf);
   }
 
   public Configuration getConf() {
Index: src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java
===================================================================
--- src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java	(revision 620222)
+++ src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java	(working copy)
@@ -42,10 +42,8 @@
 import org.apache.nutch.parse.ParseException;
 import org.apache.nutch.parse.Outlink;
 import org.apache.nutch.protocol.Content;
+import org.apache.nutch.util.MimeUtil;
 
-// Tika imports
-import org.apache.tika.mime.MimeUtils;
-
 
 
 /**
@@ -55,7 +53,7 @@
 public class ZipTextExtractor {
   
   /** Get the MimeTypes resolver instance. */
-  private static MimeUtils MIME;
+  private static MimeUtil MIME;
   
   public static final Log LOG = LogFactory.getLog(ZipTextExtractor.class);
 
@@ -64,9 +62,9 @@
   
   /** Creates a new instance of ZipTextExtractor */
   public ZipTextExtractor(Configuration conf) {
-      this.conf = conf;
-      if(this.MIME == null)
-        this.MIME = new MimeUtils(conf.get("mime.types.file"),conf.getBoolean("mime.type.magic", true));
+    this.conf = conf;
+    if (this.MIME == null)
+      this.MIME = new MimeUtil(conf);
   }
   
   public String extractText(InputStream input, String url, List outLinksList) throws IOException {
@@ -96,7 +94,7 @@
         int i = fname.lastIndexOf('.');
         if (i != -1) {
           // Trying to resolve the Mime-Type
-          String contentType = MIME.getRepository().getMimeType(fname).getName();
+          String contentType = MIME.getMimeType(fname).getName();
           try {
             Metadata metadata = new Metadata();
             metadata.set(Response.CONTENT_LENGTH, Long.toString(entry.getSize()));
Index: src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java
===================================================================
--- src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java	(revision 620222)
+++ src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java	(working copy)
@@ -26,6 +26,7 @@
 // Nutch imports
 import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.protocol.Content;
+import org.apache.nutch.util.MimeUtil;
 import org.apache.nutch.metadata.Metadata;
 import org.apache.nutch.net.protocols.HttpDateFormat;
 import org.apache.nutch.net.protocols.Response;
@@ -32,7 +33,6 @@
 
 // Tika imports
 import org.apache.tika.mime.MimeType;
-import org.apache.tika.mime.MimeUtils;
 
 // Hadoop imports
 import org.apache.hadoop.conf.Configuration;
@@ -76,7 +76,7 @@
   private final File file;
   private Configuration conf;
   
-  private static MimeUtils MIME;
+  private static MimeUtil MIME;
 
   /** Returns the response code. */
   public int getCode() { return code; }
@@ -103,7 +103,7 @@
     this.conf = conf;
     
     if(MIME == null)
-      MIME = new MimeUtils(conf.get("mime.types.file"),conf.getBoolean("mime.type.magic", true));
+      MIME = new MimeUtil(conf);
 
     if (!"file".equals(url.getProtocol()))
       throw new FileException("Not a file url:" + url);
@@ -210,7 +210,7 @@
     headers.set(Response.LAST_MODIFIED, HttpDateFormat.toString(f
         .lastModified()));
     
-    MimeType mimeType = MIME.getRepository().getMimeType(f);
+    MimeType mimeType = MIME.getMimeType(f);
     String mimeTypeString = mimeType != null ? mimeType.getName() : "";
     headers.set(Response.CONTENT_TYPE, mimeTypeString);