Index: pdfbox/src/test/resources/org/apache/pdfbox/tika/testPDFPackage.pdf =================================================================== Cannot display: file marked as a binary type. svn:mime-type = application/octet-stream Index: pdfbox/src/test/resources/org/apache/pdfbox/tika/testPDFPackage.pdf =================================================================== --- pdfbox/src/test/resources/org/apache/pdfbox/tika/testPDFPackage.pdf (revision 1334387) +++ pdfbox/src/test/resources/org/apache/pdfbox/tika/testPDFPackage.pdf (working copy) Property changes on: pdfbox/src/test/resources/org/apache/pdfbox/tika/testPDFPackage.pdf ___________________________________________________________________ Added: svn:mime-type ## -0,0 +1 ## +application/octet-stream \ No newline at end of property Index: pdfbox/src/test/java/org/apache/pdfbox/tika/PDFParserTest.java =================================================================== --- pdfbox/src/test/java/org/apache/pdfbox/tika/PDFParserTest.java (revision 1334387) +++ pdfbox/src/test/java/org/apache/pdfbox/tika/PDFParserTest.java (working copy) @@ -259,6 +259,12 @@ substringCount("
", xml)); } + public void testEmbeddedPDFs() throws Exception { + String xml = getXML("testPDFPackage.pdf").xml; + assertContains("PDF1", xml); + assertContains("PDF2", xml); + } + private static int substringCount(String needle, String haystack) { int upto = -1; int count = 0; @@ -398,11 +404,12 @@ handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml"); handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "no"); handler.setResult(new StreamResult(sw)); - + ParseContext context = new ParseContext(); + context.set(Parser.class, parser); // Try with a document containing various tables and formattings InputStream input = PDFParserTest.class.getResourceAsStream(filename); try { - parser.parse(input, handler, metadata, new ParseContext()); + parser.parse(input, handler, metadata, context); return new XMLResult(sw.toString(), metadata); } finally { input.close(); Index: pdfbox/src/main/java/org/apache/pdfbox/tika/PDFParser.java =================================================================== --- pdfbox/src/main/java/org/apache/pdfbox/tika/PDFParser.java (revision 1334387) +++ pdfbox/src/main/java/org/apache/pdfbox/tika/PDFParser.java (working copy) @@ -22,6 +22,7 @@ import java.util.Calendar; import java.util.Collections; import java.util.List; +import java.util.Map; import java.util.Set; import org.apache.pdfbox.cos.COSArray; @@ -29,15 +30,24 @@ import org.apache.pdfbox.cos.COSName; import org.apache.pdfbox.cos.COSString; import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.pdmodel.PDDocumentCatalog; import org.apache.pdfbox.pdmodel.PDDocumentInformation; +import org.apache.pdfbox.pdmodel.PDDocumentNameDictionary; +import org.apache.pdfbox.pdmodel.PDEmbeddedFilesNameTreeNode; +import org.apache.pdfbox.pdmodel.common.filespecification.PDComplexFileSpecification; +import org.apache.pdfbox.pdmodel.common.filespecification.PDEmbeddedFile; import org.apache.tika.exception.TikaException; +import org.apache.tika.extractor.EmbeddedDocumentExtractor; +import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor; import org.apache.tika.io.CloseShieldInputStream; +import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.PagedText; import org.apache.tika.metadata.Property; import org.apache.tika.mime.MediaType; import org.apache.tika.parser.AbstractParser; import org.apache.tika.parser.ParseContext; +import org.apache.tika.sax.EmbeddedContentHandler; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; @@ -47,8 +57,11 @@ * This parser can process also encrypted PDF documents if the required * password is given as a part of the input metadata associated with a * document. If no password is given, then this parser will try decrypting - * the document using the empty password that's often used with PDFs. - * + * the document using the empty password that's often used with PDFs. If + * the PDF contains any embedded documents (for example as part of a PDF + * package) then this parser will use the {@link EmbeddedDocumentExtractor} + * to handle them. + * @since Apache PDFBox 1.7.0 */ public class PDFParser extends AbstractParser { @@ -116,11 +129,56 @@ PDF2XHTML.process(pdfDocument, handler, metadata, extractAnnotationText, enableAutoSpace, suppressDuplicateOverlappingText, sortByPosition); + + extractEmbeddedDocuments(context, pdfDocument, handler); + } finally { pdfDocument.close(); } } + private void extractEmbeddedDocuments(ParseContext context, PDDocument document, ContentHandler handler) + throws IOException, SAXException, TikaException { + + PDDocumentCatalog catalog = document.getDocumentCatalog(); + PDDocumentNameDictionary names = catalog.getNames(); + if (names != null) { + + PDEmbeddedFilesNameTreeNode embeddedFiles = names.getEmbeddedFiles(); + if (embeddedFiles != null) { + + EmbeddedDocumentExtractor embeddedExtractor = context.get(EmbeddedDocumentExtractor.class); + if (embeddedExtractor == null) { + embeddedExtractor = new ParsingEmbeddedDocumentExtractor(context); + } + + for (Map.Entry