Index: tika-core/src/main/java/org/apache/tika/sax/WriteOutContentHandler.java
===================================================================
--- tika-core/src/main/java/org/apache/tika/sax/WriteOutContentHandler.java (revision 1627940)
+++ tika-core/src/main/java/org/apache/tika/sax/WriteOutContentHandler.java (working copy)
@@ -50,6 +50,11 @@
private int writeCount = 0;
/**
+ * Set to true once the write limit has been reached
+ */
+ private boolean writeLimitReached = false;
+
+ /**
* Creates a content handler that writes content up to the given
* write limit to the given content handler.
*
@@ -138,6 +143,7 @@
} else {
super.characters(ch, start, writeLimit - writeCount);
writeCount = writeLimit;
+ writeLimitReached = true;
throw new WriteLimitReachedException(
"Your document contained more than " + writeLimit
+ " characters, and so your requested limit has been"
@@ -156,6 +162,7 @@
} else {
super.ignorableWhitespace(ch, start, writeLimit - writeCount);
writeCount = writeLimit;
+ writeLimitReached = true;
throw new WriteLimitReachedException(
"Your document contained more than " + writeLimit
+ " characters, and so your requested limit has been"
@@ -173,31 +180,26 @@
* @param t throwable
* @return <code>true</code> if the write limit was reached,
* <code>false</code> otherwise
+ *
+ * @deprecated since Tika 1.6; use {@link #isWriteLimitReached()} instead.
+ * This implementation ignores the given Throwable and simply delegates
+ * to {@link #isWriteLimitReached()}.
+ *
*/
+ @Deprecated
public boolean isWriteLimitReached(Throwable t) {
- if (t instanceof WriteLimitReachedException) {
- return tag.equals(((WriteLimitReachedException) t).tag);
- } else {
- return t.getCause() != null && isWriteLimitReached(t.getCause());
- }
+ return isWriteLimitReached();
}
-
+
/**
- * The exception used as a signal when the write limit has been reached.
+ * Returns true if the limit has been reached, false otherwise.
+ *
+ * @since Apache Tika 1.6
+ * @return <code>true</code> if the write limit was reached,
+ * <code>false</code> otherwise
*/
- private static class WriteLimitReachedException extends SAXException {
-
- /** Serial version UID */
- private static final long serialVersionUID = -1850581945459429943L;
-
- /** Serializable tag of the handler that caused this exception */
- private final Serializable tag;
-
- public WriteLimitReachedException(String message, Serializable tag) {
- super(message);
- this.tag = tag;
- }
-
+ public boolean isWriteLimitReached() {
+ return writeLimitReached;
}
}
Index: tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
===================================================================
--- tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java (revision 1627940)
+++ tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java (working copy)
@@ -52,6 +52,7 @@
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.PasswordProvider;
+import org.apache.tika.sax.WriteLimitReachedException;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
@@ -157,7 +158,13 @@
metadata.set(Metadata.CONTENT_TYPE, "application/pdf");
extractMetadata(pdfDocument, metadata);
if (handler != null) {
- PDF2XHTML.process(pdfDocument, handler, context, metadata, localConfig);
+ try {
+ PDF2Text.process(pdfDocument, handler, context, metadata, localConfig);
+ } catch (WriteLimitReachedException x) {
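+ // Reaching the handler's write limit is an expected way to stop early,
+ // so the exception is deliberately swallowed here.
+ // NOTE(review): callers no longer see this exception - confirm intended.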
+ }
}
} finally {
Index: tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
===================================================================
--- tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java (revision 1627940)
+++ tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java (working copy)
@@ -144,21 +144,21 @@
*
- * @param pdf2XHTML
+ * @param pdf2text the PDF2Text instance to configure
*/
- public void configure(PDF2XHTML pdf2XHTML) {
- pdf2XHTML.setForceParsing(true);
- pdf2XHTML.setSortByPosition(getSortByPosition());
+ public void configure(PDF2Text pdf2text) {
+ pdf2text.setForceParsing(true);
+ pdf2text.setSortByPosition(getSortByPosition());
if (getEnableAutoSpace()) {
- pdf2XHTML.setWordSeparator(" ");
+ pdf2text.setWordSeparator(" ");
} else {
- pdf2XHTML.setWordSeparator("");
+ pdf2text.setWordSeparator("");
}
if (getAverageCharTolerance() != null) {
- pdf2XHTML.setAverageCharTolerance(getAverageCharTolerance());
+ pdf2text.setAverageCharTolerance(getAverageCharTolerance());
}
if (getSpacingTolerance() != null) {
- pdf2XHTML.setSpacingTolerance(getSpacingTolerance());
+ pdf2text.setSpacingTolerance(getSpacingTolerance());
}
- pdf2XHTML.setSuppressDuplicateOverlappingText(getSuppressDuplicateOverlappingText());
+ pdf2text.setSuppressDuplicateOverlappingText(getSuppressDuplicateOverlappingText());
}
Index: tika-parsers/src/test/java/org/apache/tika/TikaTest.java
===================================================================
--- tika-parsers/src/test/java/org/apache/tika/TikaTest.java (revision 1627940)
+++ tika-parsers/src/test/java/org/apache/tika/TikaTest.java (working copy)
@@ -16,9 +16,7 @@
*/
package org.apache.tika;
-import org.apache.tika.extractor.EmbeddedResourceHandler;
-import org.apache.tika.io.IOUtils;
-import org.apache.tika.io.TikaInputStream;
+
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
@@ -38,6 +36,9 @@
import java.util.HashSet;
import java.util.List;
import java.util.Set;
+import org.apache.tika.extractor.EmbeddedResourceHandler;
+import org.apache.tika.io.IOUtils;
+import org.apache.tika.io.TikaInputStream;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
@@ -102,6 +103,10 @@
protected XMLResult getXML(String filePath) throws Exception {
return getXML(getResourceAsStream("/test-documents/" + filePath), new AutoDetectParser(), new Metadata());
}
+
+ protected String getText(String filePath) throws Exception {
+ return getText(getResourceAsStream("/test-documents/" + filePath), new AutoDetectParser(), new Metadata());
+ }
protected XMLResult getXML(InputStream input, Parser parser, Metadata metadata) throws Exception {
ParseContext context = new ParseContext();
Index: tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
===================================================================
--- tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java (revision 1627940)
+++ tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java (working copy)
@@ -16,6 +16,7 @@
*/
package org.apache.tika.parser.pdf;
+
import org.apache.tika.TikaTest;
import org.apache.tika.extractor.ContainerExtractor;
import org.apache.tika.extractor.DocumentSelector;
@@ -910,7 +911,30 @@
}
-
+ @Test
+ public void testLimitTextToParse() throws Exception {
+ // Without a write limit the whole document text is extracted.
+ ContentHandler handler = new BodyContentHandler();
+ java.io.InputStream stream = getResourceAsStream("/test-documents/testPDF.pdf");
+ try {
+ new PDFParser().parse(stream, handler, new Metadata(), new ParseContext());
+ } finally {
+ stream.close();
+ }
+ assertEquals(1067, handler.toString().length());
+
+ // With a 500 character limit parse() must return normally, leaving
+ // exactly 500 characters in the handler.
+ handler = new BodyContentHandler(500);
+ stream = getResourceAsStream("/test-documents/testPDF.pdf");
+ try {
+ new PDFParser().parse(stream, handler, new Metadata(), new ParseContext());
+ } finally {
+ stream.close();
+ }
+ assertEquals(500, handler.toString().length());
+ }
+
@Test
public void testInlineConfig() throws Exception {
Index: tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java
===================================================================
--- tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java (revision 1627940)
+++ tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java (working copy)
@@ -91,7 +91,7 @@
@Test
public void testUmlautSpacesExtraction2() throws Exception {
- String content = getText("testRTFUmlautSpaces2.rtf");
+ String content = getResultText("testRTFUmlautSpaces2.rtf");
content = content.replaceAll("\\s+", "");
assertEquals("\u00DCbersicht", content);
}
@@ -98,7 +98,7 @@
@Test
public void testUnicodeUCNControlWordCharacterDoublingExtraction() throws Exception {
- String content = getText("testRTFUnicodeUCNControlWordCharacterDoubling.rtf");
+ String content = getResultText("testRTFUnicodeUCNControlWordCharacterDoubling.rtf");
assertContains("\u5E74", content);
assertContains("\u5ff5", content);
@@ -109,13 +109,13 @@
@Test
public void testHexEscapeInsideWord() throws Exception {
- String content = getText("testRTFHexEscapeInsideWord.rtf");
+ String content = getResultText("testRTFHexEscapeInsideWord.rtf");
assertContains("ESP\u00cdRITO", content);
}
@Test
public void testWindowsCodepage1250() throws Exception {
- String content = getText("testRTFWindowsCodepage1250.rtf");
+ String content = getResultText("testRTFWindowsCodepage1250.rtf");
assertContains("za\u017c\u00f3\u0142\u0107 g\u0119\u015bl\u0105 ja\u017a\u0144", content);
assertContains("ZA\u017b\u00d3\u0141\u0106 G\u0118\u015aL\u0104 JA\u0179\u0143", content);
}
@@ -131,7 +131,7 @@
@Test
public void testTableCellSeparation2() throws Exception {
- String content = getText("testRTFTableCellSeparation2.rtf");
+ String content = getResultText("testRTFTableCellSeparation2.rtf");
// TODO: why do we insert extra whitespace...?
content = content.replaceAll("\\s+"," ");
assertContains("Station Fax", content);
@@ -175,7 +175,7 @@
@Test
public void testGothic() throws Exception {
- String content = getText("testRTFUnicodeGothic.rtf");
+ String content = getResultText("testRTFUnicodeGothic.rtf");
assertContains("\uD800\uDF32\uD800\uDF3f\uD800\uDF44\uD800\uDF39\uD800\uDF43\uD800\uDF3A", content);
}
@@ -231,7 +231,7 @@
@Test
public void testTextWithCurlyBraces() throws Exception {
- String content = getText("testRTFWithCurlyBraces.rtf");
+ String content = getResultText("testRTFWithCurlyBraces.rtf");
assertContains("{ some text inside curly brackets }", content);
}
@@ -597,7 +597,7 @@
return new Result(content, metadata);
}
- private String getText(String filename) throws Exception {
+ private String getResultText(String filename) throws Exception {
return getResult(filename).text;
}
}