Index: CHANGES.txt
===================================================================
--- CHANGES.txt	(revision 1355796)
+++ CHANGES.txt	(working copy)
@@ -37,6 +37,10 @@
     --password=X command line option to specify the password that Tika CLI
     should use for opening encrypted documents (TIKA-943).
 
+  * PDF documents embedded in a Microsoft Office document are now
+    extracted correctly (previously they were incorrectly extracted as
+    Microsoft Works files) (TIKA-948).
+
 Release 1.1 - 3/7/2012
 ---------------------------------
 
Index: tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java
===================================================================
--- tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java	(revision 1355796)
+++ tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java	(working copy)
@@ -59,6 +59,11 @@
         0x53, 0x74, 0x61, 0x72, 0x44, 0x72, 0x61, 0x77
     };
 
+    /** An ASCII String "Acrobat Document" */
+    private static final byte[] ADOBE_ACROBAT = new byte[] {
+        0x41, 0x63, 0x72, 0x6f, 0x62, 0x61, 0x74, 0x20, 0x44, 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74
+    };
+
     /** The OLE base file format */
     public static final MediaType OLE = application("x-tika-msoffice");
     
@@ -85,6 +90,9 @@
 
     /** Microsoft Works */
     public static final MediaType WPS = application("vnd.ms-works");
+
+    /** Adobe Portable Document Format (PDF) */
+    public static final MediaType PDF = application("pdf");
     
     /** Microsoft Works Spreadsheet 7.0 */
     public static final MediaType XLR = application("x-tika-msworks-spreadsheet");
@@ -186,25 +194,35 @@
      */
     protected static MediaType detect(Set<String> names, DirectoryEntry root) {
         if (names != null) {
+            byte[] compObjBytes;
+            if (root != null) {
+                compObjBytes = getCompObjContents(root);
+            } else {
+                compObjBytes = null;
+            }
+
             if (names.contains("StarCalcDocument")) {
                 // Star Office Calc
                 return SDC;
             } else if (names.contains("StarWriterDocument")) {
                 return SDW;
             } else if (names.contains("StarDrawDocument3")) {
-                if (root == null) {
-                    /*
-                     * This is either StarOfficeDraw or StarOfficeImpress, we have
-                     * to consult the CompObj to distinguish them, if this method is
-                     * called in "legacy mode", without the root, just return
-                     * x-tika-msoffice. The one-argument method is only for backward
-                     * compatibility, if someone calls old API he/she can get the
-                     * old result.
-                     */
-                    return OLE;
-                } else {
-                    return processStarDrawOrImpress(root);
+                if (root != null && compObjBytes != null) {
+                    if (arrayContains(compObjBytes, STAR_DRAW)) {
+                        return SDA;
+                    } else if (arrayContains(compObjBytes, STAR_IMPRESS)) {
+                        return SDD;
+                    }
                 }
+                /*
+                 * This is either StarOfficeDraw or StarOfficeImpress, we have
+                 * to consult the CompObj to distinguish them, if this method is
+                 * called in "legacy mode", without the root, just return
+                 * x-tika-msoffice. The one-argument method is only for backward
+                 * compatibility, if someone calls old API he/she can get the
+                 * old result.
+                 */
+                return OLE;
             } else if (names.contains("WksSSWorkBook")) {
                 // This check has to be before names.contains("Workbook")
                 // Works 7.0 spreadsheet files contain both
@@ -236,25 +254,29 @@
             } else if (names.contains("MatOST")) {
                 // this occurs on older Works Word Processor files (versions 3.0 and 4.0)
                 return WPS;
+            } else if (names.contains("CONTENTS") && compObjBytes != null && arrayContains(compObjBytes, ADOBE_ACROBAT)) {
+                // Embedded PDF; its payload is read from the CONTENTS stream. TODO: handle other non-office types too!
+                return PDF;
             } else if (names.contains("CONTENTS") && names.contains("SPELLING")) {
-               // Newer Works files
-               return WPS;
+                // Newer Works files
+                return WPS;
             } else if (names.contains("CONTENTS") && names.contains("\u0001CompObj")) {
-               // Normally an older Works file
-               return WPS;
+                // Normally an older Works file
+                return WPS;
             } else if (names.contains("CONTENTS")) {
-               // CONTENTS without SPELLING nor CompObj normally means some sort
-               //  of embedded non-office file inside an OLE2 document
-               // This is most commonly triggered on nested directories
-               return OLE;
+                // CONTENTS without SPELLING nor CompObj normally means some sort
+                //  of embedded non-office file inside an OLE2 document
+                // This is most commonly triggered on nested
+                //  directories
+                return OLE;
             } else if (names.contains("\u0001CompObj") &&
                   (names.contains("Props") || names.contains("Props9") || names.contains("Props12"))) {
-               // Could be Project, look for common name patterns
-               for (String name : names) {
-                  if (mppDataMatch.matcher(name).matches()) {
-                     return MPP;
-                  }
-               }
+                // Could be Project, look for common name patterns
+                for (String name : names) {
+                   if (mppDataMatch.matcher(name).matches()) {
+                      return MPP;
+                   }
+                }
             } else if (names.contains("PerfectOffice_MAIN")) {
                 if (names.contains("SlideShow")) {
                     return MediaType.application("x-corelpresentations"); // .shw
@@ -276,24 +298,19 @@
         return OLE;
     }
 
-    private static MediaType processStarDrawOrImpress(DirectoryEntry root) {
+    static byte[] getCompObjContents(DirectoryEntry root) {
+        /*
+         * Returns the raw bytes of the CompObj stream, or null if it cannot
+         * be read. The bytes contain a normal ASCII name of the application
+         * used to create this file; callers search them for that name.
+         */
         try {
             Entry e = root.getEntry("\u0001CompObj");
             if (e != null && e.isDocumentEntry()) {
                 DocumentNode dn = (DocumentNode)e;
                 DocumentInputStream stream = new DocumentInputStream(dn);
-                byte [] bytes = IOUtils.toByteArray(stream);
-                /*
-                 * This array contains a string with a normal ASCII name of the
-                 * application used to create this file. We want to search for that
-                 * name.
-                 */
-                if ( arrayContains(bytes, STAR_DRAW) ) {
-                    return SDA;
-                } else if (arrayContains(bytes, STAR_IMPRESS)) {
-                    return SDD;
-                }
-            } 
+                return IOUtils.toByteArray(stream);
+            }
         } catch (Exception e) {
             /*
              * "root.getEntry" can throw FileNotFoundException. The code inside
@@ -305,9 +322,10 @@
              * x-tika-msoffice
              */
         }
-        return OLE;
+
+        return null;
     }
-    
+
     // poor man's search for byte arrays, replace with some library call if
     // you know one without adding new dependencies
     private static boolean arrayContains(byte [] larger, byte [] smaller) {
Index: tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
===================================================================
--- tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java	(revision 1355796)
+++ tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java	(working copy)
@@ -17,22 +17,26 @@
 package org.apache.tika.parser.microsoft;
 
 import java.io.IOException;
+import java.util.HashSet;
+import java.util.Set;
 
 import org.apache.poi.poifs.filesystem.DirectoryEntry;
 import org.apache.poi.poifs.filesystem.DirectoryNode;
 import org.apache.poi.poifs.filesystem.DocumentEntry;
 import org.apache.poi.poifs.filesystem.DocumentInputStream;
+import org.apache.poi.poifs.filesystem.DocumentNode;
 import org.apache.poi.poifs.filesystem.Entry;
 import org.apache.poi.poifs.filesystem.Ole10Native;
 import org.apache.poi.poifs.filesystem.Ole10NativeException;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.extractor.EmbeddedDocumentExtractor;
 import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
+import org.apache.tika.io.IOUtils;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.microsoft.OfficeParser.POIFSDocumentType;
 import org.apache.tika.parser.pkg.ZipContainerDetector;
 import org.apache.tika.sax.XHTMLContentHandler;
 import org.xml.sax.SAXException;
@@ -101,11 +105,20 @@
 
         // What kind of document is it?
         Metadata metadata = new Metadata();
-        POIFSDocumentType type = POIFSDocumentType.detectType(dir);
+
+        // NOTE: logic copied from OfficeParser.POIFSDocumentType.detectType
+        Set<String> names = new HashSet<String>();
+        for (Entry entry : dir) {
+            names.add(entry.getName());
+        }
+        MediaType type = POIFSContainerDetector.detect(names, dir);
+
+        POIFSDocumentType officeType = POIFSDocumentType.typeFromMediaType(type);
+
         TikaInputStream embedded = null;
 
         try {
-            if (type == POIFSDocumentType.OLE10_NATIVE) {
+            if (officeType == POIFSDocumentType.OLE10_NATIVE) {
                 try {
                     // Try to un-wrap the OLE10Native record:
                     Ole10Native ole = Ole10Native.createFromEmbeddedOleObject((DirectoryNode)dir);
@@ -116,9 +129,14 @@
                 } catch (Ole10NativeException ex) {
                     // Not a valid OLE10Native record, skip it
                 }
+            } else if (officeType == POIFSDocumentType.UNKNOWN && type == POIFSContainerDetector.PDF) {
+                // TODO: handle other non-office documents too!
+                metadata.set(Metadata.CONTENT_TYPE, type.toString());
+                metadata.set(Metadata.RESOURCE_NAME_KEY, dir.getName() + ".pdf");
+                embedded = TikaInputStream.get(new DocumentInputStream((DocumentNode) dir.getEntry("CONTENTS")));
             } else {
-                metadata.set(Metadata.CONTENT_TYPE, type.getType().toString());
-                metadata.set(Metadata.RESOURCE_NAME_KEY, dir.getName() + '.' + type.getExtension());
+                metadata.set(Metadata.CONTENT_TYPE, officeType.getType().toString());
+                metadata.set(Metadata.RESOURCE_NAME_KEY, dir.getName() + '.' + officeType.getExtension());
             }
 
             // Should we parse it?
Index: tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
===================================================================
--- tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java	(revision 1355796)
+++ tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java	(working copy)
@@ -109,15 +109,18 @@
         }
 
         public static POIFSDocumentType detectType(NPOIFSFileSystem fs) {
-           return detectType(fs.getRoot());
-       }
+            return detectType(fs.getRoot());
+        }
 
         public static POIFSDocumentType detectType(DirectoryEntry node) {
             Set<String> names = new HashSet<String>();
             for (Entry entry : node) {
                 names.add(entry.getName());
             }
-            MediaType type = POIFSContainerDetector.detect(names, node);
+            return typeFromMediaType(POIFSContainerDetector.detect(names, node));
+        }
+
+        public static POIFSDocumentType typeFromMediaType(MediaType type) {
             for (POIFSDocumentType poifsType : values()) {
                if (type.equals(poifsType.type)) {
                   return poifsType;
Index: tika-parsers/src/test/resources/test-documents/EmbeddedPDF.doc
===================================================================
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Index: tika-parsers/src/test/resources/test-documents/EmbeddedPDF.doc
===================================================================
--- tika-parsers/src/test/resources/test-documents/EmbeddedPDF.doc	(revision 1355796)
+++ tika-parsers/src/test/resources/test-documents/EmbeddedPDF.doc	(working copy)

Property changes on: tika-parsers/src/test/resources/test-documents/EmbeddedPDF.doc
___________________________________________________________________
Added: svn:mime-type
## -0,0 +1 ##
+application/octet-stream
\ No newline at end of property
Index: tika-parsers/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java
===================================================================
--- tika-parsers/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java	(revision 1355796)
+++ tika-parsers/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java	(working copy)
@@ -271,4 +271,19 @@
         assertTrue(handler.mediaTypes.contains(new MediaType("image", "jpeg")));
         assertTrue(handler.mediaTypes.contains(new MediaType("image", "png")));
     }
+
+    // TIKA-948: the PDF embedded in EmbeddedPDF.doc must be reported as a PDF
+    public void testEmbeddedPDF() throws Exception {
+        ContainerExtractor extractor = new ParserContainerExtractor();
+        TrackingHandler handler;
+
+        handler = process("EmbeddedPDF.doc", extractor, false);
+        assertEquals(2, handler.filenames.size());
+        assertEquals(2, handler.mediaTypes.size());
+
+        assertEquals("image1.emf", handler.filenames.get(0));
+
+        assertEquals("_1402837031.pdf", handler.filenames.get(1));
+        assertEquals(TYPE_PDF, handler.mediaTypes.get(1));
+    }
 }