From c20f8ab47d10dd5d3a8c68ce7372cf74e4b8e38a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=9C=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=92=D0=B0=D0=BB=D1=8F=D0=BD=D1=81=D0=BA=D0=B8=D0=B9?= <max.valjanski@gmail.com>
Date: Thu, 16 Dec 2010 20:35:36 +0300
Subject: [PATCH] TIKA-573: add MimeType.getExtension().
 Extensions are taken from filename patterns

---
 .../src/main/java/org/apache/tika/cli/TikaCLI.java |   17 ++++++++++++++---
 .../main/java/org/apache/tika/mime/MimeType.java   |   15 +++++++++++++++
 .../main/java/org/apache/tika/mime/Patterns.java   |    6 +++++-
 .../java/org/apache/tika/mime/PatternsTest.java    |    7 +++++++
 4 files changed, 41 insertions(+), 4 deletions(-)

diff --git a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
index 19f5505..bec7be6 100644
--- a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
+++ b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
@@ -38,6 +38,7 @@ import org.apache.log4j.Level;
 import org.apache.log4j.Logger;
 import org.apache.log4j.SimpleLayout;
 import org.apache.log4j.WriterAppender;
+import org.apache.tika.config.TikaConfig;
 import org.apache.tika.detect.DefaultDetector;
 import org.apache.tika.detect.Detector;
 import org.apache.tika.exception.TikaException;
@@ -50,6 +51,7 @@ import org.apache.tika.language.ProfilingHandler;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.mime.MediaTypeRegistry;
+import org.apache.tika.mime.MimeTypeException;
 import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.CompositeParser;
 import org.apache.tika.parser.ParseContext;
@@ -474,26 +476,35 @@ public class TikaCLI {
 
     private static class FileEmbeddedDocumentExtractor implements EmbeddedDocumentExtractor {
         private int count = 0;
+        private final TikaConfig config = TikaConfig.getDefaultConfig();
 
         public boolean shouldParseEmbedded(Metadata metadata) {
             return true;
         }
 
-        public void parseEmbedded(InputStream inputStream, ContentHandler contentHandler, Metadata metadata, boolean b) throws SAXException, IOException {
+        public void parseEmbedded(InputStream inputStream, ContentHandler contentHandler, Metadata metadata, boolean outputHtml) throws SAXException, IOException {
             String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
 
             if (name == null) {
                 name = Integer.toString(count);
             }
 
+            String contentType = metadata.get(Metadata.CONTENT_TYPE);
+
+            if (name.indexOf('.')==-1 && contentType!=null) {
+                try {
+                    name += config.getMimeRepository().forName(contentType).getExtension();
+                } catch (MimeTypeException e) {
+                    e.printStackTrace();
+                }
+            }
+
             File outputFile = new File(name);
             if (outputFile.exists()) {
                 System.err.println("File '"+name+"' already exists; skipping");
                 return;
             }
 
-            String contentType = metadata.get(Metadata.CONTENT_TYPE);
-
             System.out.println("Extracting '"+name+"' ("+contentType+")");
 
             FileOutputStream os = new FileOutputStream(outputFile);
diff --git a/tika-core/src/main/java/org/apache/tika/mime/MimeType.java b/tika-core/src/main/java/org/apache/tika/mime/MimeType.java
index 402812c..0852245 100644
--- a/tika-core/src/main/java/org/apache/tika/mime/MimeType.java
+++ b/tika-core/src/main/java/org/apache/tika/mime/MimeType.java
@@ -86,6 +86,9 @@ public final class MimeType implements Comparable<MimeType>, Serializable {
     /** The minimum length of data to provides for magic analyzis */
     private int minLength = 0;
 
+    /** Preferred file extension including the leading dot (e.g. ".doc"), or "" when none has been set. */
+    private String extension = "";
+
     /**
      * Creates a media type with the give name and containing media type
      * registry. The name is expected to be valid and normalized to lower
@@ -300,4 +303,16 @@ public final class MimeType implements Comparable<MimeType>, Serializable {
         return type.toString();
     }
 
+    /**
+     * Returns the preferred file extension recorded for this media type.
+     *
+     * @return the extension including its leading dot, or "" if none was set
+     */
+    public String getExtension() {
+        return extension;
+    }
+
+    void setExtension(String extension) { // package-private; value is stored as-is (no null check)
+        this.extension = extension;
+    }
 }
diff --git a/tika-core/src/main/java/org/apache/tika/mime/Patterns.java b/tika-core/src/main/java/org/apache/tika/mime/Patterns.java
index d0caa3e..284fc2f 100644
--- a/tika-core/src/main/java/org/apache/tika/mime/Patterns.java
+++ b/tika-core/src/main/java/org/apache/tika/mime/Patterns.java
@@ -100,7 +100,11 @@ class Patterns implements Serializable {
                 addName(pattern, type);
             } else if (pattern.startsWith("*") && pattern.indexOf('*', 1) == -1
                     && pattern.indexOf('?') == -1 && pattern.indexOf('[') == -1) {
-                addExtension(pattern.substring(1), type);
+                String extension = pattern.substring(1);
+                addExtension(extension, type);
+                if (type.getExtension().isEmpty()) {
+                    type.setExtension(extension);
+                }
             } else {
                 addGlob(compile(pattern), type);
             }
diff --git a/tika-parsers/src/test/java/org/apache/tika/mime/PatternsTest.java b/tika-parsers/src/test/java/org/apache/tika/mime/PatternsTest.java
index 4d3ed8d..7ca2bbc 100644
--- a/tika-parsers/src/test/java/org/apache/tika/mime/PatternsTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/mime/PatternsTest.java
@@ -66,4 +66,11 @@ public class PatternsTest extends TestCase {
             // expected result
         }
     }
+
+    public void testExtension() throws MimeTypeException {
+        MimeType doc = types.forName("application/vnd.ms-word");
+        patterns.add("*.doc", doc);
+        // "*.doc" is a plain "*"+suffix glob, so its ".doc" suffix should become the extension
+        assertEquals(".doc", doc.getExtension());
+    }
 }
-- 
1.7.3.2

