/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.pdf; |
|
package org.apache.tika.parser.pdf; |
|
|
|
import java.io.FileOutputStream; |
+- |
|
import java.io.IOException; |
= |
import java.io.IOException; |
import java.io.InputStream; |
+- |
|
import java.io.Writer; |
= |
import java.io.Writer; |
import java.text.SimpleDateFormat; |
|
import java.text.SimpleDateFormat; |
import java.util.Calendar; |
|
import java.util.Calendar; |
import java.util.Iterator; |
|
import java.util.Iterator; |
import java.util.List; |
|
import java.util.List; |
import java.util.ListIterator; |
|
import java.util.ListIterator; |
import java.util.Map; |
|
import java.util.Map; |
import java.util.TreeMap; |
|
import java.util.TreeMap; |
|
|
|
import org.apache.pdfbox.pdmodel.PDDocument; |
|
import org.apache.pdfbox.pdmodel.PDDocument; |
import org.apache.pdfbox.pdmodel.PDDocumentCatalog; |
|
import org.apache.pdfbox.pdmodel.PDDocumentCatalog; |
import org.apache.pdfbox.pdmodel.PDDocumentNameDictionary; |
|
import org.apache.pdfbox.pdmodel.PDDocumentNameDictionary; |
import org.apache.pdfbox.pdmodel.PDEmbeddedFilesNameTreeNode; |
|
import org.apache.pdfbox.pdmodel.PDEmbeddedFilesNameTreeNode; |
import org.apache.pdfbox.pdmodel.PDPage; |
|
import org.apache.pdfbox.pdmodel.PDPage; |
import org.apache.pdfbox.pdmodel.common.COSObjectable; |
|
import org.apache.pdfbox.pdmodel.common.COSObjectable; |
import org.apache.pdfbox.pdmodel.common.PDNameTreeNode; |
|
import org.apache.pdfbox.pdmodel.common.PDNameTreeNode; |
import org.apache.pdfbox.pdmodel.common.filespecification.PDComplexFileSpecification; |
|
import org.apache.pdfbox.pdmodel.common.filespecification.PDComplexFileSpecification; |
import org.apache.pdfbox.pdmodel.common.filespecification.PDEmbeddedFile; |
|
import org.apache.pdfbox.pdmodel.common.filespecification.PDEmbeddedFile; |
import org.apache.pdfbox.pdmodel.interactive.action.type.PDAction; |
|
import org.apache.pdfbox.pdmodel.interactive.action.type.PDAction; |
import org.apache.pdfbox.pdmodel.interactive.action.type.PDActionURI; |
|
import org.apache.pdfbox.pdmodel.interactive.action.type.PDActionURI; |
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation; |
+- |
|
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationFileAttachment; |
|
|
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationLink; |
= |
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationLink; |
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationMarkup; |
|
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationMarkup; |
import org.apache.pdfbox.pdmodel.interactive.digitalsignature.PDSignature; |
|
import org.apache.pdfbox.pdmodel.interactive.digitalsignature.PDSignature; |
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline; |
|
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline; |
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem; |
|
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem; |
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineNode; |
|
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineNode; |
import org.apache.pdfbox.pdmodel.interactive.form.PDAcroForm; |
|
import org.apache.pdfbox.pdmodel.interactive.form.PDAcroForm; |
import org.apache.pdfbox.pdmodel.interactive.form.PDField; |
|
import org.apache.pdfbox.pdmodel.interactive.form.PDField; |
import org.apache.pdfbox.pdmodel.interactive.form.PDSignatureField; |
|
import org.apache.pdfbox.pdmodel.interactive.form.PDSignatureField; |
import org.apache.pdfbox.util.PDFTextStripper; |
|
import org.apache.pdfbox.util.PDFTextStripper; |
import org.apache.pdfbox.util.TextPosition; |
|
import org.apache.pdfbox.util.TextPosition; |
import org.apache.tika.exception.TikaException; |
|
import org.apache.tika.exception.TikaException; |
import org.apache.tika.extractor.EmbeddedDocumentExtractor; |
|
import org.apache.tika.extractor.EmbeddedDocumentExtractor; |
import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor; |
|
import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor; |
import org.apache.tika.io.IOExceptionWithCause; |
|
import org.apache.tika.io.IOExceptionWithCause; |
import org.apache.tika.io.TikaInputStream; |
|
import org.apache.tika.io.TikaInputStream; |
import org.apache.tika.metadata.Metadata; |
|
import org.apache.tika.metadata.Metadata; |
import org.apache.tika.parser.ParseContext; |
|
import org.apache.tika.parser.ParseContext; |
import org.apache.tika.sax.EmbeddedContentHandler; |
|
import org.apache.tika.sax.EmbeddedContentHandler; |
import org.apache.tika.sax.XHTMLContentHandler; |
|
import org.apache.tika.sax.XHTMLContentHandler; |
import org.xml.sax.ContentHandler; |
|
import org.xml.sax.ContentHandler; |
import org.xml.sax.SAXException; |
|
import org.xml.sax.SAXException; |
import org.xml.sax.helpers.AttributesImpl; |
|
import org.xml.sax.helpers.AttributesImpl; |
|
|
|
/**
 * Utility class that overrides the {@link PDFTextStripper} functionality
 * to produce semi-structured XHTML SAX events instead of a plain text
 * stream.
 */
class PDF2XHTML extends PDFTextStripper { |
|
class PDF2XHTML extends PDFTextStripper { |
|
|
|
/** |
|
/** |
* format used for signature dates |
|
* format used for signature dates |
*/ |
|
*/ |
private final SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ"); |
|
private final SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ"); |
|
|
|
/** |
|
/** |
* Maximum recursive depth during AcroForm processing. |
|
* Maximum recursive depth during AcroForm processing. |
* Prevents theoretical AcroForm recursion bomb. |
|
* Prevents theoretical AcroForm recursion bomb. |
*/ |
|
*/ |
private final static int MAX_ACROFORM_RECURSIONS = 10; |
|
private final static int MAX_ACROFORM_RECURSIONS = 10; |
|
|
|
|
|
|
// TODO: remove once PDFBOX-1130 is fixed: |
|
// TODO: remove once PDFBOX-1130 is fixed: |
private boolean inParagraph = false; |
|
private boolean inParagraph = false; |
|
|
|
/** |
|
/** |
* Converts the given PDF document (and related metadata) to a stream |
|
* Converts the given PDF document (and related metadata) to a stream |
* of XHTML SAX events sent to the given content handler. |
|
* of XHTML SAX events sent to the given content handler. |
* |
|
* |
* @param document PDF document |
|
* @param document PDF document |
* @param handler SAX content handler |
|
* @param handler SAX content handler |
* @param metadata PDF metadata |
|
* @param metadata PDF metadata |
* @throws SAXException if the content handler fails to process SAX events |
|
* @throws SAXException if the content handler fails to process SAX events |
* @throws TikaException if the PDF document can not be processed |
|
* @throws TikaException if the PDF document can not be processed |
*/ |
|
*/ |
public static void process( |
|
public static void process( |
PDDocument document, ContentHandler handler, ParseContext context, Metadata metadata, |
|
PDDocument document, ContentHandler handler, ParseContext context, Metadata metadata, |
PDFParserConfig config) |
|
PDFParserConfig config) |
throws SAXException, TikaException { |
|
throws SAXException, TikaException { |
try { |
|
try { |
// Extract text using a dummy Writer as we override the |
|
// Extract text using a dummy Writer as we override the |
// key methods to output to the given content |
|
// key methods to output to the given content |
// handler. |
|
// handler. |
PDF2XHTML pdf2XHTML = new PDF2XHTML(handler, context, metadata, config); |
|
PDF2XHTML pdf2XHTML = new PDF2XHTML(handler, context, metadata, config); |
|
|
|
pdf2XHTML.writeText(document, new Writer() { |
|
pdf2XHTML.writeText(document, new Writer() { |
@Override |
|
@Override |
public void write(char[] cbuf, int off, int len) { |
|
public void write(char[] cbuf, int off, int len) { |
} |
|
} |
@Override |
|
@Override |
public void flush() { |
|
public void flush() { |
} |
|
} |
@Override |
|
@Override |
public void close() { |
|
public void close() { |
} |
|
} |
}); |
|
}); |
|
|
|
} catch (IOException e) { |
|
} catch (IOException e) { |
if (e.getCause() instanceof SAXException) { |
|
if (e.getCause() instanceof SAXException) { |
throw (SAXException) e.getCause(); |
|
throw (SAXException) e.getCause(); |
} else { |
|
} else { |
throw new TikaException("Unable to extract PDF content", e); |
|
throw new TikaException("Unable to extract PDF content", e); |
} |
|
} |
} |
|
} |
} |
|
} |
|
|
|
// Handler exactly as supplied by the caller (not wrapped).
private final ContentHandler originalHandler;
private final ParseContext context;
// XHTML wrapper around the caller's handler; used for all page output.
private final XHTMLContentHandler handler;
private final PDFParserConfig config;

/**
 * Creates a converter that reports to the given handler and configures the
 * inherited text stripper from the supplied parser configuration.
 *
 * @param handler SAX content handler that receives the output
 * @param context parse context, kept for embedded document handling
 * @param metadata metadata passed to the XHTML handler wrapper
 * @param config PDF parser configuration
 * @throws IOException declared for the superclass constructor
 */
private PDF2XHTML(ContentHandler handler, ParseContext context, Metadata metadata,
        PDFParserConfig config)
        throws IOException {
    this.originalHandler = handler;
    this.handler = new XHTMLContentHandler(handler, metadata);
    this.context = context;
    //source of config (derives from context or PDFParser?) is
    //already determined in PDFParser.  No need to check context here.
    this.config = config;

    setForceParsing(true);
    setSortByPosition(config.getSortByPosition());
    // With auto-spacing disabled no separator is written between words.
    setWordSeparator(config.getEnableAutoSpace() ? " " : "");
    // TODO: maybe expose setting these too:
    //setAverageCharTolerance(1.0f);
    //setSpacingTolerance(1.0f);
    setSuppressDuplicateOverlappingText(config.getSuppressDuplicateOverlappingText());
}
|
|
|
void extractBookmarkText() throws SAXException { |
|
void extractBookmarkText() throws SAXException { |
PDDocumentOutline outline = document.getDocumentCatalog().getDocumentOutline(); |
|
PDDocumentOutline outline = document.getDocumentCatalog().getDocumentOutline(); |
if (outline != null) { |
|
if (outline != null) { |
extractBookmarkText(outline); |
|
extractBookmarkText(outline); |
} |
|
} |
} |
|
} |
|
|
|
void extractBookmarkText(PDOutlineNode bookmark) throws SAXException { |
|
void extractBookmarkText(PDOutlineNode bookmark) throws SAXException { |
PDOutlineItem current = bookmark.getFirstChild(); |
|
PDOutlineItem current = bookmark.getFirstChild(); |
if (current != null) { |
|
if (current != null) { |
handler.startElement("ul"); |
|
handler.startElement("ul"); |
while (current != null) { |
|
while (current != null) { |
handler.startElement("li"); |
|
handler.startElement("li"); |
handler.characters(current.getTitle()); |
|
handler.characters(current.getTitle()); |
handler.endElement("li"); |
|
handler.endElement("li"); |
// Recurse: |
|
// Recurse: |
extractBookmarkText(current); |
|
extractBookmarkText(current); |
current = current.getNextSibling(); |
|
current = current.getNextSibling(); |
} |
|
} |
handler.endElement("ul"); |
|
handler.endElement("ul"); |
} |
|
} |
} |
|
} |
|
|
|
@Override |
|
@Override |
protected void startDocument(PDDocument pdf) throws IOException { |
|
protected void startDocument(PDDocument pdf) throws IOException { |
try { |
|
try { |
handler.startDocument(); |
|
handler.startDocument(); |
} catch (SAXException e) { |
|
} catch (SAXException e) { |
throw new IOExceptionWithCause("Unable to start a document", e); |
|
throw new IOExceptionWithCause("Unable to start a document", e); |
} |
|
} |
} |
|
} |
|
|
|
@Override |
|
@Override |
protected void endDocument(PDDocument pdf) throws IOException { |
|
protected void endDocument(PDDocument pdf) throws IOException { |
try { |
|
try { |
// Extract text for any bookmarks: |
|
// Extract text for any bookmarks: |
extractBookmarkText(); |
|
extractBookmarkText(); |
extractEmbeddedDocuments(pdf, originalHandler); |
|
extractEmbeddedDocuments(pdf, originalHandler); |
|
|
|
//extract acroform data at end of doc |
|
//extract acroform data at end of doc |
if (config.getExtractAcroFormContent() == true){ |
|
if (config.getExtractAcroFormContent() == true){ |
extractAcroForm(pdf, handler); |
|
extractAcroForm(pdf, handler); |
} |
|
} |
handler.endDocument(); |
|
handler.endDocument(); |
} catch (TikaException e){ |
|
} catch (TikaException e){ |
throw new IOExceptionWithCause("Unable to end a document", e); |
|
throw new IOExceptionWithCause("Unable to end a document", e); |
} catch (SAXException e) { |
|
} catch (SAXException e) { |
throw new IOExceptionWithCause("Unable to end a document", e); |
|
throw new IOExceptionWithCause("Unable to end a document", e); |
} |
|
} |
} |
|
} |
|
|
|
@Override |
|
@Override |
protected void startPage(PDPage page) throws IOException { |
|
protected void startPage(PDPage page) throws IOException { |
try { |
|
try { |
handler.startElement("div", "class", "page"); |
|
handler.startElement("div", "class", "page"); |
} catch (SAXException e) { |
|
} catch (SAXException e) { |
throw new IOExceptionWithCause("Unable to start a page", e); |
|
throw new IOExceptionWithCause("Unable to start a page", e); |
} |
|
} |
writeParagraphStart(); |
|
writeParagraphStart(); |
} |
|
} |
|
|
|
@Override |
|
@Override |
protected void endPage(PDPage page) throws IOException { |
|
protected void endPage(PDPage page) throws IOException { |
|
|
|
try { |
|
try { |
writeParagraphEnd(); |
|
writeParagraphEnd(); |
// TODO: remove once PDFBOX-1143 is fixed: |
|
// TODO: remove once PDFBOX-1143 is fixed: |
if (config.getExtractAnnotationText()) { |
|
if (config.getExtractAnnotationText()) { |
for(Object o : page.getAnnotations()) { |
|
for(Object o : page.getAnnotations()) { |
if( o instanceof PDAnnotationLink ) { |
|
if( o instanceof PDAnnotationLink ) { |
PDAnnotationLink annotationlink = (PDAnnotationLink) o; |
|
PDAnnotationLink annotationlink = (PDAnnotationLink) o; |
if (annotationlink.getAction() != null) { |
|
if (annotationlink.getAction() != null) { |
PDAction action = annotationlink.getAction(); |
|
PDAction action = annotationlink.getAction(); |
if( action instanceof PDActionURI ) { |
|
if( action instanceof PDActionURI ) { |
PDActionURI uri = (PDActionURI) action; |
|
PDActionURI uri = (PDActionURI) action; |
String link = uri.getURI(); |
|
String link = uri.getURI(); |
if (link != null) { |
|
if (link != null) { |
handler.startElement("div", "class", "annotation"); |
|
handler.startElement("div", "class", "annotation"); |
handler.startElement("a", "href", link); |
|
handler.startElement("a", "href", link); |
handler.endElement("a"); |
|
handler.endElement("a"); |
handler.endElement("div"); |
|
handler.endElement("div"); |
} |
|
} |
} |
|
} |
} |
|
} |
} |
|
} |
|
|
|
if (o instanceof PDAnnotationMarkup) { |
|
if (o instanceof PDAnnotationMarkup) { |
PDAnnotationMarkup annot = (PDAnnotationMarkup) o; |
|
PDAnnotationMarkup annot = (PDAnnotationMarkup) o; |
String title = annot.getTitlePopup(); |
|
String title = annot.getTitlePopup(); |
String subject = annot.getSubject(); |
|
String subject = annot.getSubject(); |
String contents = annot.getContents(); |
|
String contents = annot.getContents(); |
// TODO: maybe also annot.getRichContents()? |
|
// TODO: maybe also annot.getRichContents()? |
if (title != null || subject != null || contents != null) { |
|
if (title != null || subject != null || contents != null) { |
handler.startElement("div", "class", "annotation"); |
|
handler.startElement("div", "class", "annotation"); |
|
|
|
if (title != null) { |
|
if (title != null) { |
handler.startElement("div", "class", "annotationTitle"); |
|
handler.startElement("div", "class", "annotationTitle"); |
handler.characters(title); |
|
handler.characters(title); |
handler.endElement("div"); |
|
handler.endElement("div"); |
} |
|
} |
|
|
|
if (subject != null) { |
|
if (subject != null) { |
handler.startElement("div", "class", "annotationSubject"); |
|
handler.startElement("div", "class", "annotationSubject"); |
handler.characters(subject); |
|
handler.characters(subject); |
handler.endElement("div"); |
|
handler.endElement("div"); |
} |
|
} |
|
|
|
if (contents != null) { |
|
if (contents != null) { |
handler.startElement("div", "class", "annotationContents"); |
|
handler.startElement("div", "class", "annotationContents"); |
handler.characters(contents); |
|
handler.characters(contents); |
handler.endElement("div"); |
|
handler.endElement("div"); |
} |
|
} |
|
|
|
handler.endElement("div"); |
|
handler.endElement("div"); |
} |
|
} |
} |
|
} |
} |
|
} |
} |
|
} |
handler.endElement("div"); |
|
handler.endElement("div"); |
} catch (SAXException e) { |
|
} catch (SAXException e) { |
throw new IOExceptionWithCause("Unable to end a page", e); |
|
throw new IOExceptionWithCause("Unable to end a page", e); |
} |
|
} |
} |
|
} |
|
|
|
@Override |
|
@Override |
protected void writeParagraphStart() throws IOException { |
|
protected void writeParagraphStart() throws IOException { |
// TODO: remove once PDFBOX-1130 is fixed |
|
// TODO: remove once PDFBOX-1130 is fixed |
if (inParagraph) { |
|
if (inParagraph) { |
// Close last paragraph |
|
// Close last paragraph |
writeParagraphEnd(); |
|
writeParagraphEnd(); |
} |
|
} |
assert !inParagraph; |
|
assert !inParagraph; |
inParagraph = true; |
|
inParagraph = true; |
try { |
|
try { |
handler.startElement("p"); |
|
handler.startElement("p"); |
} catch (SAXException e) { |
|
} catch (SAXException e) { |
throw new IOExceptionWithCause("Unable to start a paragraph", e); |
|
throw new IOExceptionWithCause("Unable to start a paragraph", e); |
} |
|
} |
} |
|
} |
|
|
|
@Override |
|
@Override |
protected void writeParagraphEnd() throws IOException { |
|
protected void writeParagraphEnd() throws IOException { |
// TODO: remove once PDFBOX-1130 is fixed |
|
// TODO: remove once PDFBOX-1130 is fixed |
if (!inParagraph) { |
|
if (!inParagraph) { |
writeParagraphStart(); |
|
writeParagraphStart(); |
} |
|
} |
assert inParagraph; |
|
assert inParagraph; |
inParagraph = false; |
|
inParagraph = false; |
try { |
|
try { |
handler.endElement("p"); |
|
handler.endElement("p"); |
} catch (SAXException e) { |
|
} catch (SAXException e) { |
throw new IOExceptionWithCause("Unable to end a paragraph", e); |
|
throw new IOExceptionWithCause("Unable to end a paragraph", e); |
} |
|
} |
} |
|
} |
|
|
|
@Override |
|
@Override |
protected void writeString(String text) throws IOException { |
|
protected void writeString(String text) throws IOException { |
try { |
|
try { |
handler.characters(text); |
|
handler.characters(text); |
} catch (SAXException e) { |
|
} catch (SAXException e) { |
throw new IOExceptionWithCause( |
|
throw new IOExceptionWithCause( |
"Unable to write a string: " + text, e); |
|
"Unable to write a string: " + text, e); |
} |
|
} |
} |
|
} |
|
|
|
@Override |
|
@Override |
protected void writeCharacters(TextPosition text) throws IOException { |
|
protected void writeCharacters(TextPosition text) throws IOException { |
try { |
|
try { |
handler.characters(text.getCharacter()); |
|
handler.characters(text.getCharacter()); |
} catch (SAXException e) { |
|
} catch (SAXException e) { |
throw new IOExceptionWithCause( |
|
throw new IOExceptionWithCause( |
"Unable to write a character: " + text.getCharacter(), e); |
|
"Unable to write a character: " + text.getCharacter(), e); |
} |
|
} |
} |
|
} |
|
|
|
@Override |
|
@Override |
protected void writeWordSeparator() throws IOException { |
|
protected void writeWordSeparator() throws IOException { |
try { |
|
try { |
handler.characters(getWordSeparator()); |
|
handler.characters(getWordSeparator()); |
} catch (SAXException e) { |
|
} catch (SAXException e) { |
throw new IOExceptionWithCause( |
|
throw new IOExceptionWithCause( |
"Unable to write a space character", e); |
|
"Unable to write a space character", e); |
} |
|
} |
} |
|
} |
|
|
|
@Override |
|
@Override |
protected void writeLineSeparator() throws IOException { |
|
protected void writeLineSeparator() throws IOException { |
try { |
|
try { |
handler.newline(); |
|
handler.newline(); |
} catch (SAXException e) { |
|
} catch (SAXException e) { |
throw new IOExceptionWithCause( |
|
throw new IOExceptionWithCause( |
"Unable to write a newline character", e); |
|
"Unable to write a newline character", e); |
} |
|
} |
} |
|
} |
|
|
|
private void extractEmbeddedDocuments(PDDocument document, ContentHandler handler) |
|
private void extractEmbeddedDocuments(PDDocument document, ContentHandler handler) |
throws IOException, SAXException, TikaException { |
|
throws IOException, SAXException, TikaException { |
// PDDocumentCatalog catalog = document.getDocumentCatalog(); |
<> |
PDDocumentCatalog catalog = document.getDocumentCatalog(); |
// PDDocumentNameDictionary names = catalog.getNames(); |
|
PDDocumentNameDictionary names = catalog.getNames(); |
// if (names == null){ |
|
if (names == null){ |
// return; |
|
return; |
// } |
|
} |
// PDEmbeddedFilesNameTreeNode embeddedFiles = names.getEmbeddedFiles(); |
|
PDEmbeddedFilesNameTreeNode embeddedFiles = names.getEmbeddedFiles(); |
|
|
|
PDEmbeddedFile embeddedFile = null; |
|
|
EmbeddedDocumentExtractor embeddedExtractor = null; |
|
|
Metadata metadata = null; |
|
|
boolean outputHtml = true; |
|
|
InputStream stream = null; |
|
|
|
|
|
List<PDPage> allPages = document.getDocumentCatalog().getAllPages(); |
|
|
for (PDPage pdPage : allPages) |
|
|
{ |
|
|
List<PDAnnotation> annotations = pdPage.getAnnotations(); |
|
|
for (PDAnnotation ann : annotations) |
|
|
{ |
|
|
if (ann instanceof PDAnnotationFileAttachment) |
|
|
{ |
|
|
PDAnnotationFileAttachment fann = (PDAnnotationFileAttachment) ann; |
|
|
PDComplexFileSpecification fileSpec = (PDComplexFileSpecification) fann.getFile(); |
|
|
embeddedFile = fileSpec.getEmbeddedFile(); |
|
|
|
|
|
if (embeddedFile == null) { |
|
if (embeddedFiles == null) { |
return; |
|
return; |
} |
|
} |
|
|
|
embeddedExtractor = context.get(EmbeddedDocumentExtractor.class); |
|
EmbeddedDocumentExtractor embeddedExtractor = context.get(EmbeddedDocumentExtractor.class); |
if (embeddedExtractor == null) { |
|
if (embeddedExtractor == null) { |
embeddedExtractor = new ParsingEmbeddedDocumentExtractor(context); |
|
embeddedExtractor = new ParsingEmbeddedDocumentExtractor(context); |
} |
|
} |
|
|
|
metadata = new Metadata(); |
|
|
outputHtml = true; |
|
|
stream = embeddedFile.createInputStream(); |
|
|
try{ |
|
|
embeddedExtractor.parseEmbedded(stream, handler, metadata, outputHtml); |
|
|
}finally{ |
|
|
stream.close(); |
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Map<String, COSObjectable> embeddedFileNames = embeddedFiles.getNames(); |
|
Map<String, COSObjectable> embeddedFileNames = embeddedFiles.getNames(); |
// //For now, try to get the embeddedFileNames out of embeddedFiles or its kids. |
|
//For now, try to get the embeddedFileNames out of embeddedFiles or its kids. |
// //This code follows: pdfbox/examples/pdmodel/ExtractEmbeddedFiles.java |
|
//This code follows: pdfbox/examples/pdmodel/ExtractEmbeddedFiles.java |
// //If there is a need we could add a fully recursive search to find a non-null |
|
//If there is a need we could add a fully recursive search to find a non-null |
// //Map<String, COSObjectable> that contains the doc info. |
|
//Map<String, COSObjectable> that contains the doc info. |
// if (embeddedFileNames != null){ |
|
if (embeddedFileNames != null){ |
// processEmbeddedDocNames(embeddedFileNames, embeddedExtractor); |
|
processEmbeddedDocNames(embeddedFileNames, embeddedExtractor); |
// } else { |
|
} else { |
// List<PDNameTreeNode> kids = embeddedFiles.getKids(); |
|
List<PDNameTreeNode> kids = embeddedFiles.getKids(); |
// if (kids == null){ |
|
if (kids == null){ |
// return; |
|
return; |
// } |
|
} |
// for (PDNameTreeNode n : kids){ |
|
for (PDNameTreeNode n : kids){ |
// Map<String, COSObjectable> childNames = n.getNames(); |
|
Map<String, COSObjectable> childNames = n.getNames(); |
// if (childNames != null){ |
|
if (childNames != null){ |
// processEmbeddedDocNames(childNames, embeddedExtractor); |
|
processEmbeddedDocNames(childNames, embeddedExtractor); |
// } |
|
} |
// } |
|
} |
// } |
|
} |
} |
= |
} |
|
|
|
|
|
|
@SuppressWarnings("unused") |
<> |
|
private void processEmbeddedDocNames(Map<String, COSObjectable> embeddedFileNames, |
|
private void processEmbeddedDocNames(Map<String, COSObjectable> embeddedFileNames, |
EmbeddedDocumentExtractor embeddedExtractor) throws IOException, SAXException, TikaException { |
= |
EmbeddedDocumentExtractor embeddedExtractor) throws IOException, SAXException, TikaException { |
if (embeddedFileNames == null){ |
|
if (embeddedFileNames == null){ |
return; |
|
return; |
} |
|
} |
for (Map.Entry<String,COSObjectable> ent : embeddedFileNames.entrySet()) { |
|
for (Map.Entry<String,COSObjectable> ent : embeddedFileNames.entrySet()) { |
PDComplexFileSpecification spec = (PDComplexFileSpecification) ent.getValue(); |
|
PDComplexFileSpecification spec = (PDComplexFileSpecification) ent.getValue(); |
PDEmbeddedFile file = spec.getEmbeddedFile(); |
|
PDEmbeddedFile file = spec.getEmbeddedFile(); |
|
|
|
Metadata metadata = new Metadata(); |
|
Metadata metadata = new Metadata(); |
// TODO: other metadata? |
|
// TODO: other metadata? |
metadata.set(Metadata.RESOURCE_NAME_KEY, ent.getKey()); |
|
metadata.set(Metadata.RESOURCE_NAME_KEY, ent.getKey()); |
metadata.set(Metadata.CONTENT_TYPE, file.getSubtype()); |
|
metadata.set(Metadata.CONTENT_TYPE, file.getSubtype()); |
metadata.set(Metadata.CONTENT_LENGTH, Long.toString(file.getSize())); |
|
metadata.set(Metadata.CONTENT_LENGTH, Long.toString(file.getSize())); |
|
|
|
if (embeddedExtractor.shouldParseEmbedded(metadata)) { |
|
if (embeddedExtractor.shouldParseEmbedded(metadata)) { |
TikaInputStream stream = TikaInputStream.get(file.createInputStream()); |
|
TikaInputStream stream = TikaInputStream.get(file.createInputStream()); |
try { |
|
try { |
embeddedExtractor.parseEmbedded( |
|
embeddedExtractor.parseEmbedded( |
stream, |
|
stream, |
new EmbeddedContentHandler(handler), |
|
new EmbeddedContentHandler(handler), |
metadata, false); |
|
metadata, false); |
} finally { |
|
} finally { |
stream.close(); |
|
stream.close(); |
} |
|
} |
} |
|
} |
} |
|
} |
} |
|
} |
private void extractAcroForm(PDDocument pdf, XHTMLContentHandler handler) throws IOException, |
|
private void extractAcroForm(PDDocument pdf, XHTMLContentHandler handler) throws IOException, |
SAXException { |
|
SAXException { |
//Thank you, Ben Litchfield, for org.apache.pdfbox.examples.fdf.PrintFields |
|
//Thank you, Ben Litchfield, for org.apache.pdfbox.examples.fdf.PrintFields |
//this code derives from Ben's code |
|
//this code derives from Ben's code |
PDDocumentCatalog catalog = pdf.getDocumentCatalog(); |
|
PDDocumentCatalog catalog = pdf.getDocumentCatalog(); |
|
|
|
if (catalog == null) |
|
if (catalog == null) |
return; |
|
return; |
|
|
|
PDAcroForm form = catalog.getAcroForm(); |
|
PDAcroForm form = catalog.getAcroForm(); |
if (form == null) |
|
if (form == null) |
return; |
|
return; |
|
|
|
@SuppressWarnings("rawtypes") |
|
@SuppressWarnings("rawtypes") |
List fields = form.getFields(); |
|
List fields = form.getFields(); |
|
|
|
if (fields == null) |
|
if (fields == null) |
return; |
|
return; |
|
|
|
@SuppressWarnings("rawtypes") |
|
@SuppressWarnings("rawtypes") |
ListIterator itr = fields.listIterator(); |
|
ListIterator itr = fields.listIterator(); |
|
|
|
if (itr == null) |
|
if (itr == null) |
return; |
|
return; |
|
|
|
handler.startElement("div", "class", "acroform"); |
|
handler.startElement("div", "class", "acroform"); |
handler.startElement("ol"); |
|
handler.startElement("ol"); |
while (itr.hasNext()){ |
|
while (itr.hasNext()){ |
Object obj = itr.next(); |
|
Object obj = itr.next(); |
if (obj != null && obj instanceof PDField){ |
|
if (obj != null && obj instanceof PDField){ |
processAcroField((PDField)obj, handler, 0); |
|
processAcroField((PDField)obj, handler, 0); |
} |
|
} |
} |
|
} |
handler.endElement("ol"); |
|
handler.endElement("ol"); |
handler.endElement("div"); |
|
handler.endElement("div"); |
} |
|
} |
|
|
|
private void processAcroField(PDField field, XHTMLContentHandler handler, final int recurseDepth) |
|
private void processAcroField(PDField field, XHTMLContentHandler handler, final int recurseDepth) |
throws SAXException, IOException { |
|
throws SAXException, IOException { |
|
|
|
if (recurseDepth >= MAX_ACROFORM_RECURSIONS){ |
|
if (recurseDepth >= MAX_ACROFORM_RECURSIONS){ |
return; |
|
return; |
} |
|
} |
|
|
|
addFieldString(field, handler); |
|
addFieldString(field, handler); |
|
|
|
@SuppressWarnings("rawtypes") |
|
@SuppressWarnings("rawtypes") |
List kids = field.getKids(); |
|
List kids = field.getKids(); |
if(kids != null){ |
|
if(kids != null){ |
|
|
|
@SuppressWarnings("rawtypes") |
|
@SuppressWarnings("rawtypes") |
Iterator kidsIter = kids.iterator(); |
|
Iterator kidsIter = kids.iterator(); |
if (kidsIter == null){ |
|
if (kidsIter == null){ |
return; |
|
return; |
} |
|
} |
int r = recurseDepth+1; |
|
int r = recurseDepth+1; |
handler.startElement("ol"); |
|
handler.startElement("ol"); |
while(kidsIter.hasNext()){ |
|
while(kidsIter.hasNext()){ |
Object pdfObj = kidsIter.next(); |
|
Object pdfObj = kidsIter.next(); |
if(pdfObj != null && pdfObj instanceof PDField){ |
|
if(pdfObj != null && pdfObj instanceof PDField){ |
PDField kid = (PDField)pdfObj; |
|
PDField kid = (PDField)pdfObj; |
//recurse |
|
//recurse |
processAcroField(kid, handler, r); |
|
processAcroField(kid, handler, r); |
} |
|
} |
} |
|
} |
handler.endElement("ol"); |
|
handler.endElement("ol"); |
} |
|
} |
} |
|
} |
private void addFieldString(PDField field, XHTMLContentHandler handler) throws SAXException{ |
|
private void addFieldString(PDField field, XHTMLContentHandler handler) throws SAXException{ |
//Pick partial name to present in content and altName for attribute |
|
//Pick partial name to present in content and altName for attribute |
//Ignoring FullyQualifiedName for now |
|
//Ignoring FullyQualifiedName for now |
String partName = field.getPartialName(); |
|
String partName = field.getPartialName(); |
String altName = field.getAlternateFieldName(); |
|
String altName = field.getAlternateFieldName(); |
|
|
|
StringBuilder sb = new StringBuilder(); |
|
StringBuilder sb = new StringBuilder(); |
AttributesImpl attrs = new AttributesImpl(); |
|
AttributesImpl attrs = new AttributesImpl(); |
|
|
|
if (partName != null){ |
|
if (partName != null){ |
sb.append(partName).append(": "); |
|
sb.append(partName).append(": "); |
} |
|
} |
if (altName != null){ |
|
if (altName != null){ |
attrs.addAttribute("", "altName", "altName", "CDATA", altName); |
|
attrs.addAttribute("", "altName", "altName", "CDATA", altName); |
} |
|
} |
//return early if PDSignature field |
|
//return early if PDSignature field |
if (field instanceof PDSignatureField){ |
|
if (field instanceof PDSignatureField){ |
handleSignature(attrs, (PDSignatureField)field, handler); |
|
handleSignature(attrs, (PDSignatureField)field, handler); |
return; |
|
return; |
} |
|
} |
try { |
|
try { |
//getValue can throw an IOException if there is no value |
|
//getValue can throw an IOException if there is no value |
String value = field.getValue(); |
|
String value = field.getValue(); |
if (value != null && ! value.equals("null")){ |
|
if (value != null && ! value.equals("null")){ |
sb.append(value); |
|
sb.append(value); |
} |
|
} |
} catch (IOException e) { |
|
} catch (IOException e) { |
//swallow |
|
//swallow |
} |
|
} |
|
|
|
if (attrs.getLength() > 0 || sb.length() > 0){ |
|
if (attrs.getLength() > 0 || sb.length() > 0){ |
handler.startElement("li", attrs); |
|
handler.startElement("li", attrs); |
handler.characters(sb.toString()); |
|
handler.characters(sb.toString()); |
handler.endElement("li"); |
|
handler.endElement("li"); |
} |
|
} |
} |
|
} |
|
|
|
private void handleSignature(AttributesImpl parentAttributes, PDSignatureField sigField, |
|
private void handleSignature(AttributesImpl parentAttributes, PDSignatureField sigField, |
XHTMLContentHandler handler) throws SAXException{ |
|
XHTMLContentHandler handler) throws SAXException{ |
|
|
|
|
|
|
PDSignature sig = sigField.getSignature(); |
|
PDSignature sig = sigField.getSignature(); |
if (sig == null){ |
|
if (sig == null){ |
return; |
|
return; |
} |
|
} |
Map<String, String> vals= new TreeMap<String, String>(); |
|
Map<String, String> vals= new TreeMap<String, String>(); |
vals.put("name", sig.getName()); |
|
vals.put("name", sig.getName()); |
vals.put("contactInfo", sig.getContactInfo()); |
|
vals.put("contactInfo", sig.getContactInfo()); |
vals.put("location", sig.getLocation()); |
|
vals.put("location", sig.getLocation()); |
vals.put("reason", sig.getReason()); |
|
vals.put("reason", sig.getReason()); |
|
|
|
Calendar cal = sig.getSignDate(); |
|
Calendar cal = sig.getSignDate(); |
if (cal != null){ |
|
if (cal != null){ |
dateFormat.setTimeZone(cal.getTimeZone()); |
|
dateFormat.setTimeZone(cal.getTimeZone()); |
vals.put("date", dateFormat.format(cal.getTime())); |
|
vals.put("date", dateFormat.format(cal.getTime())); |
} |
|
} |
//see if there is any data |
|
//see if there is any data |
int nonNull = 0; |
|
int nonNull = 0; |
for (String val : vals.keySet()){ |
|
for (String val : vals.keySet()){ |
if (val != null && ! val.equals("")){ |
|
if (val != null && ! val.equals("")){ |
nonNull++; |
|
nonNull++; |
} |
|
} |
} |
|
} |
//if there is, process it |
|
//if there is, process it |
if (nonNull > 0){ |
|
if (nonNull > 0){ |
handler.startElement("li", parentAttributes); |
|
handler.startElement("li", parentAttributes); |
|
|
|
AttributesImpl attrs = new AttributesImpl(); |
|
AttributesImpl attrs = new AttributesImpl(); |
attrs.addAttribute("", "type", "type", "CDATA", "signaturedata"); |
|
attrs.addAttribute("", "type", "type", "CDATA", "signaturedata"); |
|
|
|
handler.startElement("ol", attrs); |
|
handler.startElement("ol", attrs); |
for (Map.Entry<String, String> e : vals.entrySet()){ |
|
for (Map.Entry<String, String> e : vals.entrySet()){ |
if (e.getValue() == null || e.getValue().equals("")){ |
|
if (e.getValue() == null || e.getValue().equals("")){ |
continue; |
|
continue; |
} |
|
} |
attrs = new AttributesImpl(); |
|
attrs = new AttributesImpl(); |
attrs.addAttribute("", "signdata", "signdata", "CDATA", e.getKey()); |
|
attrs.addAttribute("", "signdata", "signdata", "CDATA", e.getKey()); |
handler.startElement("li", attrs); |
|
handler.startElement("li", attrs); |
handler.characters(e.getValue()); |
|
handler.characters(e.getValue()); |
handler.endElement("li"); |
|
handler.endElement("li"); |
} |
|
} |
handler.endElement("ol"); |
|
handler.endElement("ol"); |
handler.endElement("li"); |
|
handler.endElement("li"); |
} |
|
} |
} |
|
} |
} |
|
} |