Index: CHANGES.txt
===================================================================
--- CHANGES.txt	(revision 620222)
+++ CHANGES.txt	(working copy)
@@ -202,6 +202,8 @@
 
 71. NUTCH-607 - Update build.xml to include tika jar when building war (kubes)
 
+72. NUTCH-608 - Upgrade nutch to use released apache-tika-0.1-incubating (mattmann)
+
 Release 0.9 - 2007-04-02
 
  1. Changed log4j confiquration to log to stdout on commandline
Index: lib/tika-0.1-dev.jar
===================================================================
Cannot display: file marked as binary type.
svn:mime-type = application/octet-stream
Index: lib/tika-0.1-incubating.jar
===================================================================
Cannot display: file marked as binary type.
svn:mime-type = application/octet-stream

Property changes on: lib/tika-0.1-incubating.jar
___________________________________________________________________
Name: svn:mime-type
   + application/octet-stream

Index: src/java/org/apache/nutch/parse/ParserFactory.java
===================================================================
--- src/java/org/apache/nutch/parse/ParserFactory.java	(revision 620222)
+++ src/java/org/apache/nutch/parse/ParserFactory.java	(working copy)
@@ -36,26 +36,27 @@
 import org.apache.nutch.plugin.PluginRuntimeException;
 import org.apache.nutch.plugin.PluginRepository;
 import org.apache.nutch.util.LogUtil;
+import org.apache.nutch.util.MimeUtil;
 import org.apache.nutch.util.ObjectCache;
 
 // Tika imports
-import org.apache.tika.mime.MimeType;
-import org.apache.tika.mime.MimeTypeException;
+import org.apache.tika.mime.MimeTypesFactory;
 
+/** Creates and caches {@link Parser} plugins. */
+public final class ParserFactory {
 
-/** Creates and caches {@link Parser} plugins.*/
-public final class ParserFactory {
-  
   public static final Log LOG = LogFactory.getLog(ParserFactory.class);
-  
+
   /** Wildcard for default plugins. */
   public static final String DEFAULT_PLUGIN = "*";
-  
+
   /** Empty extension list for caching purposes. */
   private final List EMPTY_EXTENSION_LIST = Collections.EMPTY_LIST;
-  
+
   private Configuration conf;
+
   private ExtensionPoint extensionPoint;
+
   private ParsePluginList parsePluginList;
 
   public ParserFactory(Configuration conf) {
@@ -63,10 +64,12 @@
     ObjectCache objectCache = ObjectCache.get(conf);
     this.extensionPoint = PluginRepository.get(conf).getExtensionPoint(
         Parser.X_POINT_ID);
-    this.parsePluginList = (ParsePluginList)objectCache.getObject(ParsePluginList.class.getName());
+    this.parsePluginList = (ParsePluginList) objectCache
+        .getObject(ParsePluginList.class.getName());
     if (this.parsePluginList == null) {
       this.parsePluginList = new ParsePluginsReader().parse(conf);
-      objectCache.setObject(ParsePluginList.class.getName(), this.parsePluginList);
+      objectCache.setObject(ParsePluginList.class.getName(),
+          this.parsePluginList);
     }
 
     if (this.extensionPoint == null) {
@@ -76,30 +79,31 @@
       throw new RuntimeException(
           "Parse Plugins preferences could not be loaded.");
     }
-  }                      
-  
-   
+  }
+
   /**
    * Function returns an array of {@link Parser}s for a given content type.
-   *
+   * 
    * The function consults the internal list of parse plugins for the
-   * ParserFactory to determine the list of pluginIds, then gets the
-   * appropriate extension points to instantiate as {@link Parser}s.
-   *
-   * @param contentType The contentType to return the <code>Array</code>
-   *                    of {@link Parser}s for.
-   * @param url The url for the content that may allow us to get the type from
-   *            the file suffix.
-   * @return An <code>Array</code> of {@link Parser}s for the given contentType.
-   *         If there were plugins mapped to a contentType via the
-   *         <code>parse-plugins.xml</code> file, but never enabled via
-   *         the <code>plugin.includes</code> Nutch conf, then those plugins
-   *         won't be part of this array, i.e., they will be skipped.
-   *         So, if the ordered list of parsing plugins for
-   *         <code>text/plain</code> was <code>[parse-text,parse-html,
-   *         parse-rtf]</code>, and only <code>parse-html</code> and
-   *         <code>parse-rtf</code> were enabled via
-   *         <code>plugin.includes</code>, then this ordered Array would
+   * ParserFactory to determine the list of pluginIds, then gets the appropriate
+   * extension points to instantiate as {@link Parser}s.
+   * 
+   * @param contentType
+   *          The contentType to return the <code>Array</code> of
+   *          {@link Parser}s for.
+   * @param url
+   *          The url for the content that may allow us to get the type from the
+   *          file suffix.
+   * @return An <code>Array</code> of {@link Parser}s for the given
+   *         contentType. If there were plugins mapped to a contentType via the
+   *         <code>parse-plugins.xml</code> file, but never enabled via the
+   *         <code>plugin.includes</code> Nutch conf, then those plugins won't
+   *         be part of this array, i.e., they will be skipped. So, if the
+   *         ordered list of parsing plugins for <code>text/plain</code> was
+   *         <code>[parse-text,parse-html,
+   *         parse-rtf]</code>, and only
+   *         <code>parse-html</code> and <code>parse-rtf</code> were enabled
+   *         via <code>plugin.includes</code>, then this ordered Array would
    *         consist of two {@link Parser} interfaces,
    *         <code>[parse-html, parse-rtf]</code>.
    */
@@ -104,17 +108,17 @@
    *         <code>[parse-html, parse-rtf]</code>.
    */
   public Parser[] getParsers(String contentType, String url)
-  throws ParserNotFound {
-    
+      throws ParserNotFound {
+
     List<Parser> parsers = null;
     List parserExts = null;
-    
+
     ObjectCache objectCache = ObjectCache.get(conf);
-    
+
     // TODO once the MimeTypes is available
     // parsers = getExtensions(MimeUtils.map(contentType));
     // if (parsers != null) {
-    //   return parsers;
+    // return parsers;
     // }
     // Last Chance: Guess content-type from file url...
     // parsers = getExtensions(MimeUtils.getMimeType(url));
@@ -125,11 +129,11 @@
     }
 
     parsers = new Vector<Parser>(parserExts.size());
-    for (Iterator i=parserExts.iterator(); i.hasNext(); ){
+    for (Iterator i = parserExts.iterator(); i.hasNext();) {
       Extension ext = (Extension) i.next();
       Parser p = null;
       try {
-        //check to see if we've cached this parser instance yet
+        // check to see if we've cached this parser instance yet
         p = (Parser) objectCache.getObject(ext.getId());
         if (p == null) {
           // go ahead and instantiate it and then cache it
@@ -134,7 +138,7 @@
         if (p == null) {
           // go ahead and instantiate it and then cache it
           p = (Parser) ext.getExtensionInstance();
-          objectCache.setObject(ext.getId(),p);
+          objectCache.setObject(ext.getId(), p);
         }
         parsers.add(p);
       } catch (PluginRuntimeException e) {
@@ -141,16 +145,15 @@
         if (LOG.isWarnEnabled()) {
           e.printStackTrace(LogUtil.getWarnStream(LOG));
           LOG.warn("ParserFactory:PluginRuntimeException when "
-                 + "initializing parser plugin "
-                 + ext.getDescriptor().getPluginId()
-                 + " instance in getParsers "
-                 + "function: attempting to continue instantiating parsers");
+              + "initializing parser plugin "
+              + ext.getDescriptor().getPluginId() + " instance in getParsers "
+              + "function: attempting to continue instantiating parsers");
         }
       }
     }
-    return parsers.toArray(new Parser[]{});
+    return parsers.toArray(new Parser[] {});
   }
-    
+
   /**
    * Function returns a {@link Parser} instance with the specified
    * <code>extId</code>, representing its extension ID. If the Parser
@@ -156,20 +159,22 @@
    * <code>extId</code>, representing its extension ID. If the Parser
    * instance isn't found, then the function throws a
    * <code>ParserNotFound</code> exception. If the function is able to find
-   * the {@link Parser} in the internal <code>PARSER_CACHE</code> then it
-   * will return the already instantiated Parser. Otherwise, if it has to
-   * instantiate the Parser itself , then this function will cache that Parser
-   * in the internal <code>PARSER_CACHE</code>.
+   * the {@link Parser} in the internal <code>PARSER_CACHE</code> then it will
+   * return the already instantiated Parser. Otherwise, if it has to instantiate
+   * the Parser itself, then this function will cache that Parser in the
+   * internal <code>PARSER_CACHE</code>.
    * 
-   * @param id The string extension ID (e.g.,
-   *        "org.apache.nutch.parse.rss.RSSParser",
-   *        "org.apache.nutch.parse.rtf.RTFParseFactory") of the {@link Parser}
-   *        implementation to return.
+   * @param id
+   *          The string extension ID (e.g.,
+   *          "org.apache.nutch.parse.rss.RSSParser",
+   *          "org.apache.nutch.parse.rtf.RTFParseFactory") of the
+   *          {@link Parser} implementation to return.
    * @return A {@link Parser} implementation specified by the parameter
    *         <code>id</code>.
-   * @throws ParserNotFound If the Parser is not found (i.e., registered with
-   *         the extension point), or if the there a
-   *         {@link PluginRuntimeException} instantiating the {@link Parser}.
+   * @throws ParserNotFound
+   *           If the Parser is not found (i.e., registered with the extension
+   *           point), or if there is a {@link PluginRuntimeException}
+   *           instantiating the {@link Parser}.
    */
   public Parser getParserById(String id) throws ParserNotFound {
 
@@ -177,7 +182,7 @@
     Extension parserExt = null;
 
     ObjectCache objectCache = ObjectCache.get(conf);
-    
+
     if (id != null) {
       parserExt = getExtension(extensions, id);
     }
@@ -188,12 +193,12 @@
     if (parserExt == null) {
       throw new ParserNotFound("No Parser Found for id [" + id + "]");
     }
-    
-    // first check the cache	    	   
+
+    // first check the cache
     if (objectCache.getObject(parserExt.getId()) != null) {
       return (Parser) objectCache.getObject(parserExt.getId());
 
-    // if not found in cache, instantiate the Parser    
+      // if not found in cache, instantiate the Parser
     } else {
       try {
         Parser p = (Parser) parserExt.getExtensionInstance();
@@ -201,9 +206,9 @@
         return p;
       } catch (PluginRuntimeException e) {
         if (LOG.isWarnEnabled()) {
-          LOG.warn("Canno initialize parser " +
-                   parserExt.getDescriptor().getPluginId() +
-                   " (cause: " + e.toString());
+          LOG.warn("Canno initialize parser "
+              + parserExt.getDescriptor().getPluginId() + " (cause: "
+              + e.toString());
         }
         throw new ParserNotFound("Cannot init parser for id [" + id + "]");
       }
@@ -209,16 +214,17 @@
       }
     }
   }
-  
+
   /**
    * Finds the best-suited parse plugin for a given contentType.
    * 
-   * @param contentType Content-Type for which we seek a parse plugin.
-   * @return a list of extensions to be used for this contentType.
-   *         If none, returns <code>null</code>.
+   * @param contentType
+   *          Content-Type for which we seek a parse plugin.
+   * @return a list of extensions to be used for this contentType. If none,
+   *         returns <code>null</code>.
    */
   protected List<Extension> getExtensions(String contentType) {
-    
+
     ObjectCache objectCache = ObjectCache.get(conf);
     // First of all, tries to clean the content-type
     String type = null;
@@ -223,11 +229,13 @@
     // First of all, tries to clean the content-type
     String type = null;
     try {
-      type = MimeType.clean(contentType);
-    } catch (MimeTypeException mte) {
+      type = MimeTypesFactory.create(
+          conf.getConfResourceAsInputStream(conf.get("mime.types.file")))
+          .forName(MimeUtil.cleanMimeType(contentType)).getName();
+    } catch (Exception mte) {
       if (LOG.isDebugEnabled()) {
-        LOG.debug("Could not clean the content-type [" + contentType +
-                  "], Reason is [" + mte + "]. Using its raw version...");
+        LOG.debug("Could not clean the content-type [" + contentType
+            + "], Reason is [" + mte + "]. Using its raw version...");
       }
       type = contentType;
     }
@@ -239,7 +247,7 @@
     if (extensions == EMPTY_EXTENSION_LIST) {
       return null;
     }
-    
+
     if (extensions == null) {
       extensions = findExtensions(type);
       if (extensions != null) {
@@ -245,8 +253,8 @@
       if (extensions != null) {
         objectCache.setObject(type, extensions);
       } else {
-      	// Put the empty extension list into cache
-      	// to remember we don't know any related extension.
+        // Put the empty extension list into cache
+        // to remember we don't know any related extension.
         objectCache.setObject(type, EMPTY_EXTENSION_LIST);
       }
     }
@@ -252,29 +260,31 @@
     }
     return extensions;
   }
-  
+
   /**
    * searches a list of suitable parse plugins for the given contentType.
-   * <p>It first looks for a preferred plugin defined in the parse-plugin
-   * file.  If none is found, it returns a list of default plugins.
+   * <p>
+   * It first looks for a preferred plugin defined in the parse-plugin file. If
+   * none is found, it returns a list of default plugins.
    * 
-   * @param contentType Content-Type for which we seek a parse plugin.
-   * @return List - List of extensions to be used for this contentType.
-   *                If none, returns null.
+   * @param contentType
+   *          Content-Type for which we seek a parse plugin.
+   * @return List - List of extensions to be used for this contentType. If none,
+   *         returns null.
    */
   private List<Extension> findExtensions(String contentType) {
-    
+
     Extension[] extensions = this.extensionPoint.getExtensions();
-    
+
     // Look for a preferred plugin.
-    List<String> parsePluginList =
-      this.parsePluginList.getPluginList(contentType);
-    List<Extension> extensionList =
-      matchExtensions(parsePluginList, extensions, contentType);
+    List<String> parsePluginList = this.parsePluginList
+        .getPluginList(contentType);
+    List<Extension> extensionList = matchExtensions(parsePluginList,
+        extensions, contentType);
     if (extensionList != null) {
       return extensionList;
     }
-    
+
     // If none found, look for a default plugin.
     parsePluginList = this.parsePluginList.getPluginList(DEFAULT_PLUGIN);
     return matchExtensions(parsePluginList, extensions, DEFAULT_PLUGIN);
@@ -279,30 +289,33 @@
     parsePluginList = this.parsePluginList.getPluginList(DEFAULT_PLUGIN);
     return matchExtensions(parsePluginList, extensions, DEFAULT_PLUGIN);
   }
-  
+
   /**
    * Tries to find a suitable parser for the given contentType.
    * <ol>
-   * <li>It checks if a parser which accepts the contentType
-   * can be found in the <code>plugins</code> list;</li>
-   * <li>If this list is empty, it tries to find amongst the loaded
-   * extensions whether some of them might suit and warns the user.</li>
+   * <li>It checks if a parser which accepts the contentType can be found in
+   * the <code>plugins</code> list;</li>
+   * <li>If this list is empty, it tries to find amongst the loaded extensions
+   * whether some of them might suit and warns the user.</li>
    * </ol>
-   * @param plugins List of candidate plugins.
-   * @param extensions Array of loaded extensions.
-   * @param contentType Content-Type for which we seek a parse plugin.
-   * @return List - List of extensions to be used for this contentType.
-   *                If none, returns null.
+   * 
+   * @param plugins
+   *          List of candidate plugins.
+   * @param extensions
+   *          Array of loaded extensions.
+   * @param contentType
+   *          Content-Type for which we seek a parse plugin.
+   * @return List - List of extensions to be used for this contentType. If none,
+   *         returns null.
    */
   private List<Extension> matchExtensions(List<String> plugins,
-                               Extension[] extensions,
-                               String contentType) {
-    
+      Extension[] extensions, String contentType) {
+
     List<Extension> extList = new ArrayList<Extension>();
     if (plugins != null) {
-      
+
       for (String parsePluginId : plugins) {
-        
+
         Extension ext = getExtension(extensions, parsePluginId, contentType);
         // the extension returned may be null
         // that means that it was not enabled in the plugin.includes
@@ -308,16 +321,16 @@
         // that means that it was not enabled in the plugin.includes
         // nutch conf property, but it was mapped in the
         // parse-plugins.xml
-        // file. 
+        // file.
         // OR it was enabled in plugin.includes, but the plugin's plugin.xml
         // file does not claim that the plugin supports the specified mimeType
         // in either case, LOG the appropriate error message to WARN level
-        
+
         if (ext == null) {
-          //try to get it just by its pluginId
+          // try to get it just by its pluginId
           ext = getExtension(extensions, parsePluginId);
-          
-          if (LOG.isWarnEnabled()) { 
+
+          if (LOG.isWarnEnabled()) {
             if (ext != null) {
               // plugin was enabled via plugin.includes
               // its plugin.xml just doesn't claim to support that
@@ -322,17 +335,17 @@
               // plugin was enabled via plugin.includes
               // its plugin.xml just doesn't claim to support that
               // particular mimeType
-              LOG.warn("ParserFactory:Plugin: " + parsePluginId +
-                       " mapped to contentType " + contentType +
-                       " via parse-plugins.xml, but " + "its plugin.xml " +
-                       "file does not claim to support contentType: " +
-                       contentType);
+              LOG.warn("ParserFactory:Plugin: " + parsePluginId
+                  + " mapped to contentType " + contentType
+                  + " via parse-plugins.xml, but " + "its plugin.xml "
+                  + "file does not claim to support contentType: "
+                  + contentType);
             } else {
               // plugin wasn't enabled via plugin.includes
-              LOG.warn("ParserFactory: Plugin: " + parsePluginId + 
-                       " mapped to contentType " + contentType +
-                       " via parse-plugins.xml, but not enabled via " +
-                       "plugin.includes in nutch-default.xml");                     
+              LOG.warn("ParserFactory: Plugin: " + parsePluginId
+                  + " mapped to contentType " + contentType
+                  + " via parse-plugins.xml, but not enabled via "
+                  + "plugin.includes in nutch-default.xml");
             }
           }
         }
@@ -342,7 +355,7 @@
           extList.add(ext);
         }
       }
-      
+
     } else {
       // okay, there were no list of plugins defined for
       // this mimeType, however, there may be plugins registered
@@ -351,29 +364,28 @@
       // so, iterate through the list of extensions and if you find
       // any extensions where this is the case, throw a
       // NotMappedParserException
-      
-      for (int i=0; i<extensions.length; i++) {
+
+      for (int i = 0; i < extensions.length; i++) {
         if (extensions[i].getAttribute("contentType") != null
-            && extensions[i].getAttribute("contentType").equals(
-                contentType)) {
+            && extensions[i].getAttribute("contentType").equals(contentType)) {
           extList.add(extensions[i]);
         }
       }
-      
+
       if (extList.size() > 0) {
         if (LOG.isInfoEnabled()) {
-          LOG.info("The parsing plugins: " + extList +
-                   " are enabled via the plugin.includes system " +
-                   "property, and all claim to support the content type " +
-                   contentType + ", but they are not mapped to it  in the " +
-                   "parse-plugins.xml file");
+          LOG.info("The parsing plugins: " + extList
+              + " are enabled via the plugin.includes system "
+              + "property, and all claim to support the content type "
+              + contentType + ", but they are not mapped to it  in the "
+              + "parse-plugins.xml file");
         }
       } else if (LOG.isDebugEnabled()) {
-        LOG.debug("ParserFactory:No parse plugins mapped or enabled for " +
-                  "contentType " + contentType);
+        LOG.debug("ParserFactory:No parse plugins mapped or enabled for "
+            + "contentType " + contentType);
       }
     }
-    
+
     return (extList.size() > 0) ? extList : null;
   }
 
@@ -378,14 +390,13 @@
   }
 
   private boolean match(Extension extension, String id, String type) {
-    return ((id.equals(extension.getId())) &&
-            (type.equals(extension.getAttribute("contentType")) ||
-             type.equals(DEFAULT_PLUGIN)));
+    return ((id.equals(extension.getId())) && (type.equals(extension
+        .getAttribute("contentType")) || type.equals(DEFAULT_PLUGIN)));
   }
-  
+
   /** Get an extension from its id and supported content-type. */
   private Extension getExtension(Extension[] list, String id, String type) {
-    for (int i=0; i<list.length; i++) {
+    for (int i = 0; i < list.length; i++) {
       if (match(list[i], id, type)) {
         return list[i];
       }
@@ -392,9 +403,9 @@
     }
     return null;
   }
-    
+
   private Extension getExtension(Extension[] list, String id) {
-    for (int i=0; i<list.length; i++) {
+    for (int i = 0; i < list.length; i++) {
       if (id.equals(list[i].getId())) {
         return list[i];
       }
@@ -401,7 +412,7 @@
     }
     return null;
   }
-  
+
   private Extension getExtensionFromAlias(Extension[] list, String id) {
     return getExtension(list, parsePluginList.getAliases().get(id));
   }
Index: src/java/org/apache/nutch/protocol/Content.java
===================================================================
--- src/java/org/apache/nutch/protocol/Content.java	(revision 620222)
+++ src/java/org/apache/nutch/protocol/Content.java	(working copy)
@@ -17,7 +17,7 @@
 
 package org.apache.nutch.protocol;
 
-//JDK imports
+// JDK imports
 import java.io.ByteArrayInputStream;
 import java.io.DataInput;
 import java.io.DataInputStream;
@@ -26,7 +26,7 @@
 import java.util.Arrays;
 import java.util.zip.InflaterInputStream;
 
-//Hadoop imports
+// Hadoop imports
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
@@ -36,17 +36,18 @@
 import org.apache.hadoop.io.VersionMismatchException;
 import org.apache.hadoop.io.Writable;
 
-//Nutch imports
+// Nutch imports
 import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.util.MimeUtil;
 import org.apache.nutch.util.NutchConfiguration;
 
-//Tika imports
+// Tika imports
 import org.apache.tika.mime.MimeType;
 import org.apache.tika.mime.MimeTypeException;
-import org.apache.tika.mime.MimeUtils;
-
+import org.apache.tika.mime.MimeTypes;
+import org.apache.tika.mime.MimeTypesFactory;
 
-public final class Content implements Writable{
+public final class Content implements Writable {
 
   public static final String DIR_NAME = "content";
 
@@ -66,7 +67,7 @@
 
   private boolean mimeTypeMagic;
 
-  private static MimeUtils mimeTypes;
+  private static MimeTypes mimeTypes;
 
   public Content() {
     metadata = new Metadata();
@@ -89,8 +90,10 @@
     this.content = content;
     this.metadata = metadata;
     this.mimeTypeMagic = conf.getBoolean("mime.type.magic", true);
-    if(this.mimeTypes == null){
-      this.mimeTypes = new MimeUtils(conf.get("mime.types.file"), this.mimeTypeMagic);
+    if (this.mimeTypes == null) {
+      this.mimeTypes = MimeTypesFactory.create(conf
+          .getConfResourceAsInputStream(conf.get("mime.types.file")));
+
     }
     this.contentType = getContentType(contentType, url, content);
   }
@@ -129,11 +132,11 @@
       metadata.readFields(in); // read meta data
       break;
     default:
-      throw new VersionMismatchException((byte)2, oldVersion);
+      throw new VersionMismatchException((byte) 2, oldVersion);
     }
 
   }
-  
+
   public final void readFields(DataInput in) throws IOException {
     metadata.clear();
     int sizeOrVersion = in.readInt();
@@ -151,7 +154,7 @@
         metadata.readFields(in);
         break;
       default:
-        throw new VersionMismatchException((byte)VERSION, (byte)version);
+        throw new VersionMismatchException((byte) VERSION, (byte) version);
       }
     } else { // size
       byte[] compressed = new byte[sizeOrVersion];
@@ -157,8 +160,8 @@
       byte[] compressed = new byte[sizeOrVersion];
       in.readFully(compressed, 0, compressed.length);
       ByteArrayInputStream deflated = new ByteArrayInputStream(compressed);
-      DataInput inflater =
-        new DataInputStream(new InflaterInputStream(deflated));
+      DataInput inflater = new DataInputStream(
+          new InflaterInputStream(deflated));
       readFieldsCompressed(inflater);
     }
   }
@@ -192,8 +195,9 @@
     return url;
   }
 
-  /** The base url for relative links contained in the content.
-   * Maybe be different from url if the request redirected.
+  /**
+   * The base url for relative links contained in the content. May be
+   * different from url if the request redirected.
    */
   public String getBaseUrl() {
     return base;
@@ -208,7 +212,9 @@
     this.content = content;
   }
 
-  /** The media type of the retrieved content.
+  /**
+   * The media type of the retrieved content.
+   * 
    * @see <a href="http://www.iana.org/assignments/media-types/">
    *      http://www.iana.org/assignments/media-types/</a>
    */
@@ -293,7 +299,9 @@
     String cleanedMimeType = null;
 
     try {
-      cleanedMimeType = MimeType.clean(typeName);
+      cleanedMimeType = MimeUtil.cleanMimeType(typeName) != null ? this.mimeTypes
+          .forName(MimeUtil.cleanMimeType(typeName)).getName()
+          : null;
     } catch (MimeTypeException mte) {
       // Seems to be a malformed mime type name...
     }
@@ -299,16 +307,20 @@
     }
 
     // first try to get the type from the cleaned type name
-    type = cleanedMimeType != null ? this.mimeTypes.getRepository().forName(
-        cleanedMimeType) : null;
+    try {
+      type = cleanedMimeType != null ? this.mimeTypes.forName(cleanedMimeType)
+          : null;
+    } catch (MimeTypeException e) {
+      type = null;
+    }
 
-    // if returned null, then try url resolution
-    if (type == null) {
+    // if returned null, or if it's the default type then try url resolution
+    if (type == null
+        || (type != null && type.getName().equals(MimeTypes.DEFAULT))) {
       // If no mime-type header, or cannot find a corresponding registered
       // mime-type, then guess a mime-type from the url pattern
-      type = this.mimeTypes.getRepository().getMimeType(url) != null ? this.mimeTypes
-          .getRepository().getMimeType(url)
-          : type;
+      type = this.mimeTypes.getMimeType(url) != null ? this.mimeTypes
+          .getMimeType(url) : type;
     }
 
     // if magic is enabled use mime magic to guess if the mime type returned
@@ -313,14 +325,24 @@
 
     // if magic is enabled use mime magic to guess if the mime type returned
     // from the magic guess is different than the one that's already set so far
-    // if it is, go with the mime type returned by the magic
+    // if it is, and it's not the default mime type, then go with the mime type
+    // returned by the magic
     if (this.mimeTypeMagic) {
-      MimeType magicType = this.mimeTypes.getRepository().getMimeType(data);
-      if (magicType != null && !type.getName().equals(magicType.getName())) {
+      MimeType magicType = this.mimeTypes.getMimeType(data);
+      if (magicType != null && !magicType.getName().equals(MimeTypes.DEFAULT)
+          && type != null && !type.getName().equals(magicType.getName())) {
         // If magic enabled and the current mime type differs from that of the
         // one returned from the magic, take the magic mimeType
+        type = magicType;
+      }
 
-        type = magicType;
+      // if type is STILL null after all the resolution strategies, go for the
+      // default type
+      if (type == null) {
+        try {
+          type = this.mimeTypes.forName(MimeTypes.DEFAULT);
+        } catch (Exception ignore) {
+        }
       }
     }
 
@@ -326,4 +348,5 @@
 
     return type.getName();
   }
+
 }
Index: src/java/org/apache/nutch/util/MimeUtil.java
===================================================================
--- src/java/org/apache/nutch/util/MimeUtil.java	(revision 0)
+++ src/java/org/apache/nutch/util/MimeUtil.java	(revision 0)
@@ -0,0 +1,50 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.util;
+
+/**
+ * @author mattmann
+ * @since NUTCH-608
+ * 
+ * <p>
+ * This is a utility class to replace the basic functionality of the now
+ * non-existent MimeType.clean method, that used to exist in Tika. Once <a
+ * href="https://issues.apache.org/jira/browse/TIKA-121">TIKA-121</a> is
+ * addressed, and Nutch subsequently upgraded, then this class may
+ * disappear.
+ * </p>
+ */
+public final class MimeUtil {
+
+  private static final String SEPARATOR = ";";
+
+  /** Not instantiable: static helpers only. */
+  private MimeUtil() {
+  }
+
+  /**
+   * Strips parameters from a content type: returns the trimmed text before
+   * the first ';' (so "text/html;" yields "text/html"), or null for null.
+   */
+  public static String cleanMimeType(String origType) {
+    if (origType == null) return null;
+    int cut = origType.indexOf(SEPARATOR);
+    return (cut < 0 ? origType : origType.substring(0, cut)).trim();
+  }
+
+}

Property changes on: src/java/org/apache/nutch/util/MimeUtil.java
___________________________________________________________________
Name: svn:eol-style
   + native

Index: src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
===================================================================
--- src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java	(revision 620222)
+++ src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java	(working copy)
@@ -25,7 +25,9 @@
 import org.apache.oro.text.regex.MalformedPatternException;
 import org.apache.tika.mime.MimeType;
 import org.apache.tika.mime.MimeTypeException;
-import org.apache.tika.mime.MimeUtils;
+import org.apache.tika.mime.MimeTypes;
+import org.apache.tika.mime.MimeTypesFactory;
+
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
@@ -46,6 +48,7 @@
 import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.crawl.Inlinks;
 import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.util.MimeUtil;
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.io.Text;
@@ -50,6 +53,7 @@
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.io.Text;
 
+import java.io.IOException;
 import java.text.ParseException;
 import java.text.SimpleDateFormat;
 
@@ -79,7 +83,7 @@
   private boolean MAGIC;
 
   /** Get the MimeTypes resolver instance. */
-  private static MimeUtils MIME; 
+  private static MimeTypes MIME; 
   
   public Document filter(Document doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
     throws IndexingException {
@@ -193,10 +197,10 @@
         // } else {
         //   contentType = MIME.getMimeType(url);
         // }
-        mimeType = MIME.getRepository().getMimeType(url);
+        mimeType = MIME.getMimeType(url);
     } else {
         try {
-            mimeType = new MimeType(contentType);
+            mimeType = MIME.forName(MimeUtil.cleanMimeType(contentType));
         } catch (MimeTypeException e) {
             if (LOG.isWarnEnabled()) { LOG.warn(url + e.toString()); }
             mimeType = null;
@@ -209,8 +213,8 @@
     }
 
     contentType = mimeType.getName();
-    String primaryType = mimeType.getPrimaryType();
-    String subType = mimeType.getSubType();
+    String primaryType = mimeType.getSuperType().getName();
+    String subType = mimeType.getSubTypes().first().getName();
     // leave this for future improvement
     //MimeTypeParameterList parameterList = mimeType.getParameters()
 
@@ -280,8 +284,10 @@
   public void setConf(Configuration conf) {
     this.conf = conf;
     MAGIC = conf.getBoolean("mime.type.magic", true);
-    if(MIME == null)
-      MIME = new MimeUtils(getConf().get("mime.types.file"), MAGIC);
+    if (MIME == null) {
+      MIME = MimeTypesFactory.create(getConf().getConfResourceAsInputStream(
+          conf.get("mime.types.file")));
+    }
   }
 
   public Configuration getConf() {
Index: src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java
===================================================================
--- src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java	(revision 620222)
+++ src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java	(working copy)
@@ -44,7 +44,8 @@
 import org.apache.nutch.protocol.Content;
 
 // Tika imports
-import org.apache.tika.mime.MimeUtils;
+import org.apache.tika.mime.MimeTypes;
+import org.apache.tika.mime.MimeTypesFactory;
 
 
 
@@ -55,7 +56,7 @@
 public class ZipTextExtractor {
   
   /** Get the MimeTypes resolver instance. */
-  private static MimeUtils MIME;
+  private static MimeTypes MIME;
   
   public static final Log LOG = LogFactory.getLog(ZipTextExtractor.class);
 
@@ -64,9 +65,10 @@
   
   /** Creates a new instance of ZipTextExtractor */
   public ZipTextExtractor(Configuration conf) {
-      this.conf = conf;
-      if(this.MIME == null)
-        this.MIME = new MimeUtils(conf.get("mime.types.file"),conf.getBoolean("mime.type.magic", true));
+    this.conf = conf;
+    if (this.MIME == null)
+      this.MIME = MimeTypesFactory.create(conf
+          .getConfResourceAsInputStream(conf.get("mime.types.file")));
   }
   
   public String extractText(InputStream input, String url, List outLinksList) throws IOException {
@@ -96,7 +98,7 @@
         int i = fname.lastIndexOf('.');
         if (i != -1) {
           // Trying to resolve the Mime-Type
-          String contentType = MIME.getRepository().getMimeType(fname).getName();
+          String contentType = MIME.getMimeType(fname).getName();
           try {
             Metadata metadata = new Metadata();
             metadata.set(Response.CONTENT_LENGTH, Long.toString(entry.getSize()));
Index: src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java
===================================================================
--- src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java	(revision 620222)
+++ src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java	(working copy)
@@ -32,7 +32,8 @@
 
 // Tika imports
 import org.apache.tika.mime.MimeType;
-import org.apache.tika.mime.MimeUtils;
+import org.apache.tika.mime.MimeTypes;
+import org.apache.tika.mime.MimeTypesFactory;
 
 // Hadoop imports
 import org.apache.hadoop.conf.Configuration;
@@ -76,7 +77,7 @@
   private final File file;
   private Configuration conf;
   
-  private static MimeUtils MIME;
+  private static MimeTypes MIME;
 
   /** Returns the response code. */
   public int getCode() { return code; }
@@ -103,7 +104,7 @@
     this.conf = conf;
     
     if(MIME == null)
-      MIME = new MimeUtils(conf.get("mime.types.file"),conf.getBoolean("mime.type.magic", true));
+      MIME = MimeTypesFactory.create(conf.getConfResourceAsInputStream(conf.get("mime.types.file")));
 
     if (!"file".equals(url.getProtocol()))
       throw new FileException("Not a file url:" + url);
@@ -210,7 +211,7 @@
     headers.set(Response.LAST_MODIFIED, HttpDateFormat.toString(f
         .lastModified()));
     
-    MimeType mimeType = MIME.getRepository().getMimeType(f);
+    MimeType mimeType = MIME.getMimeType(f);
     String mimeTypeString = mimeType != null ? mimeType.getName() : "";
     headers.set(Response.CONTENT_TYPE, mimeTypeString);
 
