Index: tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
===================================================================
--- tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java (revision 0)
+++ tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java (working copy)
@@ -0,0 +1,136 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.ocr;
+
+import java.io.File;
+import java.io.Serializable;
+
+/**
+ * Configuration for TesseractOCRParser.
+ *
+ * This allows enabling TesseractOCRParser and setting its parameters:
+ * <pre>
+ * TesseractOCRConfig config = new TesseractOCRConfig();
+ * config.setTesseractPath(tesseractFolder);
+ * parseContext.set(TesseractOCRConfig.class, config);
+ * </pre>
+ *
+ *
+ */
+public class TesseractOCRConfig implements Serializable {
+
+    private static final long serialVersionUID = -4861942486845757891L;
+
+    // Path to tesseract installation folder (ends with a separator); empty if on system path.
+    private String tesseractPath = "";
+
+    // Language dictionary to be used.
+    private String language = "eng";
+
+    // Tesseract page segmentation mode.
+    private String pageSegMode = "1";
+
+    // Minimum file size to submit file to ocr.
+    private int minFileSizeToOcr = 0;
+
+    // Maximum file size to submit file to ocr.
+    private int maxFileSizeToOcr = Integer.MAX_VALUE;
+
+    // Maximum time (seconds) to wait for the ocring process termination.
+    private int timeout = 120;
+
+    /** @see #setTesseractPath(String tesseractPath) */
+    public String getTesseractPath() {
+        return tesseractPath;
+    }
+
+    /**
+     * Set tesseract installation folder, needed if it is not on system path.
+     * Null or empty means "on system path"; otherwise a trailing file separator is ensured.
+     */
+    public void setTesseractPath(String tesseractPath) {
+        String path = (tesseractPath == null) ? "" : tesseractPath;
+        boolean needsSeparator = !path.isEmpty() && !path.endsWith(File.separator);
+        this.tesseractPath = needsSeparator ? path + File.separator : path;
+    }
+
+    /** @see #setLanguage(String language) */
+    public String getLanguage() {
+        return language;
+    }
+
+    /**
+     * Set tesseract language dictionary to be used. Default is "eng".
+     * Multiple languages may be specified, separated by plus characters.
+     */
+    public void setLanguage(String language) {
+        this.language = language;
+    }
+
+    /** @see #setPageSegMode(String pageSegMode) */
+    public String getPageSegMode() {
+        return pageSegMode;
+    }
+
+    /**
+     * Set tesseract page segmentation mode.
+     * Default is 1 = Automatic page segmentation with OSD (Orientation and Script Detection)
+     */
+    public void setPageSegMode(String pageSegMode) {
+        this.pageSegMode = pageSegMode;
+    }
+
+    /** @see #setMinFileSizeToOcr(int minFileSizeToOcr) */
+    public int getMinFileSizeToOcr() {
+        return minFileSizeToOcr;
+    }
+
+    /**
+     * Set minimum file size to submit file to ocr; smaller files are not ocred.
+     * Default is 0.
+     */
+    public void setMinFileSizeToOcr(int minFileSizeToOcr) {
+        this.minFileSizeToOcr = minFileSizeToOcr;
+    }
+
+    /** @see #setMaxFileSizeToOcr(int maxFileSizeToOcr) */
+    public int getMaxFileSizeToOcr() {
+        return maxFileSizeToOcr;
+    }
+
+    /**
+     * Set maximum file size to submit file to ocr; larger files are not ocred.
+     * Default is Integer.MAX_VALUE.
+     */
+    public void setMaxFileSizeToOcr(int maxFileSizeToOcr) {
+        this.maxFileSizeToOcr = maxFileSizeToOcr;
+    }
+
+    /**
+     * Set maximum time (seconds) to wait for the ocring process to terminate.
+     * Default value is 120s.
+     */
+    public void setTimeout(int timeout) {
+        this.timeout = timeout;
+    }
+
+    /** @see #setTimeout(int timeout) */
+    public int getTimeout() {
+        return timeout;
+    }
+
+}
Index: tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
===================================================================
--- tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java (revision 0)
+++ tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java (working copy)
@@ -0,0 +1,285 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.ocr;
+
+import java.awt.Graphics2D;
+import java.awt.Image;
+import java.awt.image.BufferedImage;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+import java.util.concurrent.Callable;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.FutureTask;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
+
+import javax.imageio.ImageIO;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.IOUtils;
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * TesseractOCRParser powered by tesseract-ocr engine.
+ * To enable this parser, create a {@link TesseractOCRConfig}
+ * object and pass it through a ParseContext.
+ * Tesseract-ocr must be installed and on system path or
+ * the path to its root folder must be provided:
+ * <pre>
+ * TesseractOCRConfig config = new TesseractOCRConfig();
+ * //Needed if tesseract is not on system path
+ * config.setTesseractPath(tesseractFolder);
+ * parseContext.set(TesseractOCRConfig.class, config);
+ * </pre>
+ *
+ *
+ */
+public class TesseractOCRParser extends AbstractParser {
+
+ private static final long serialVersionUID = 1L;
+
+    private static final Set<MediaType> SUPPORTED_TYPES = getTypes();
+
+    // Builds the set of image media types this parser declares as supported.
+    private static Set<MediaType> getTypes() {
+        Set<MediaType> supportedTypes = new HashSet<MediaType>();
+        supportedTypes.add(MediaType.image("png"));
+        supportedTypes.add(MediaType.image("jpeg"));
+        supportedTypes.add(MediaType.image("tiff"));
+        supportedTypes.add(MediaType.image("x-ms-bmp"));
+        supportedTypes.add(MediaType.image("gif"));
+        // The set is shared static state: hand out a read-only view so callers cannot alter it.
+        return java.util.Collections.unmodifiableSet(supportedTypes);
+    }
+
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES; // the same fixed set, whatever the context
+    }
+
+    private void setEnv(TesseractOCRConfig config, ProcessBuilder pb) {
+        String tesseractPath = config.getTesseractPath();
+        if (!tesseractPath.isEmpty()) { // only when an installation folder was configured
+            pb.environment().put("TESSDATA_PREFIX", tesseractPath);
+        }
+    }
+
+    /** Runs OCR on an in-memory image by first writing it to a temporary PNG file. */
+    public void parse(Image image, ContentHandler handler, Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        TemporaryResources tmp = new TemporaryResources();
+        try {
+            BufferedImage bImage = new BufferedImage(
+                    image.getWidth(null), image.getHeight(null), BufferedImage.TYPE_INT_RGB);
+            Graphics2D g2 = bImage.createGraphics();
+            g2.drawImage(image, 0, 0, null);
+            g2.dispose();
+            File file = tmp.createTemporaryFile();
+            FileOutputStream fos = new FileOutputStream(file);
+            try {
+                ImageIO.write(bImage, "png", fos);
+            } finally {
+                // Close before the file is read again, so no write handle stays open on it.
+                fos.close();
+            }
+            bImage = null;
+            TikaInputStream tis = TikaInputStream.get(file);
+            try {
+                parse(tis, handler, metadata, context);
+            } finally {
+                tis.close();
+            }
+        } finally {
+            // Dispose last: by now no stream of this method refers to the temporary file.
+            tmp.dispose();
+        }
+    }
+
+    @Override
+    public void parse(
+            InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        TesseractOCRConfig config = context.get(TesseractOCRConfig.class);
+        if (config == null) config = new TesseractOCRConfig(); // fall back to the defaults
+
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+
+        TemporaryResources tmp = new TemporaryResources();
+        File ocrText = null;
+        try {
+            TikaInputStream tikaStream = TikaInputStream.get(stream, tmp);
+            File input = tikaStream.getFile();
+            long size = tikaStream.getLength();
+
+            if (size >= config.getMinFileSizeToOcr() && size <= config.getMaxFileSizeToOcr()) {
+                File output = tmp.createTemporaryFile();
+                // Tesseract appends .txt to output file name. Remember that file before
+                // running, so that it is removed even when doOCR fails or times out.
+                ocrText = new File(output.getAbsolutePath() + ".txt");
+                doOCR(input, output, config);
+                if (ocrText.exists()) {
+                    extractOutput(new FileInputStream(ocrText), xhtml);
+                }
+            }
+        } finally {
+            try {
+                tmp.dispose();
+            } finally {
+                if (ocrText != null) {
+                    ocrText.delete();
+                }
+            }
+        }
+        xhtml.endDocument();
+    }
+
+    /**
+     * Run external tesseract-ocr process.
+     * @param input File to be ocred
+     * @param output File to collect ocr result
+     * @param config Configuration of tesseract-ocr engine
+     * @throws TikaException if the extraction timed out, was interrupted or failed while waiting
+     * @throws IOException if an input error occurred
+     */
+    private void doOCR(File input, File output, TesseractOCRConfig config)
+            throws IOException, TikaException {
+        String[] cmd = {config.getTesseractPath() + "tesseract",
+                input.getPath(),
+                output.getPath(),
+                "-l",
+                config.getLanguage(),
+                "-psm",
+                config.getPageSegMode()};
+
+        ProcessBuilder pb = new ProcessBuilder(cmd);
+        setEnv(config, pb);
+        final Process process = pb.start();
+
+        // Nothing is written to the child's stdin. Its stdout and stderr are handed
+        // to logStream, which reads them on separate threads.
+        process.getOutputStream().close();
+        InputStream out = process.getInputStream();
+        InputStream err = process.getErrorStream();
+
+        logStream("OCR MSG", out, input);
+        logStream("OCR ERROR", err, input);
+
+        FutureTask<Integer> waitTask = new FutureTask<Integer>(new Callable<Integer>() {
+            public Integer call() throws Exception {
+                return process.waitFor();
+            }
+        });
+
+        Thread waitThread = new Thread(waitTask);
+        waitThread.start();
+
+        try {
+            waitTask.get(config.getTimeout(), TimeUnit.SECONDS);
+        } catch (InterruptedException e) {
+            waitThread.interrupt();
+            process.destroy();
+            Thread.currentThread().interrupt();
+            throw new TikaException("TesseractOCRParser interrupted", e);
+        } catch (ExecutionException e) {
+            // Waiting for the process failed unexpectedly: do not hide it,
+            // stop the process and report the failure to the caller.
+            process.destroy();
+            throw new TikaException("TesseractOCRParser failed", e);
+        } catch (TimeoutException e) {
+            waitThread.interrupt();
+            process.destroy();
+            throw new TikaException("TesseractOCRParser timeout", e);
+        }
+    }
+
+
+    /**
+     * Copies the ocr result read from the given stream into the
+     * given XHTML content handler, wrapped in a single div element.
+     * The stream is closed once fully processed.
+     *
+     * @param stream Stream holding the result of ocr, read as UTF-8
+     * @param xhtml XHTML content handler
+     * @throws SAXException if the XHTML SAX events could not be handled
+     * @throws IOException if an input error occurred
+     */
+    private void extractOutput(InputStream stream, XHTMLContentHandler xhtml)
+            throws SAXException, IOException {
+        Reader ocrReader = new InputStreamReader(stream, "UTF-8");
+        try {
+            xhtml.startElement("div");
+            char[] chunk = new char[1024];
+            int read;
+            while ((read = ocrReader.read(chunk)) != -1) {
+                xhtml.characters(chunk, 0, read);
+            }
+            xhtml.endElement("div");
+        } finally {
+            ocrReader.close();
+        }
+    }
+
+    /**
+     * Starts a thread that reads the contents of the standard output
+     * or error stream of the given process to not block the process.
+     * The content is currently discarded. The stream is closed once fully processed.
+     *
+     * @param logType label of the stream ("OCR MSG" or "OCR ERROR"), currently unused
+     * @param stream process stream to drain
+     * @param file input file being ocred, currently unused
+     */
+    private void logStream(final String logType, final InputStream stream, final File file) {
+        new Thread() {
+            public void run() {
+                // The output is not logged anywhere yet, so it is drained as raw bytes
+                // instead of being decoded and accumulated in memory.
+                byte[] buffer = new byte[1024];
+                try {
+                    while (stream.read(buffer) != -1) {
+                        // discard
+                    }
+                } catch (IOException ignored) {
+                    // Best effort only: a failed read simply ends the draining.
+                } finally {
+                    IOUtils.closeQuietly(stream);
+                }
+            }
+        }.start();
+    }
+
+
+}
+
+
Index: tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRTest.java
===================================================================
--- tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRTest.java (revision 0)
+++ tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRTest.java (working copy)
@@ -0,0 +1,123 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.ocr;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.external.ExternalParser;
+import org.apache.tika.parser.pdf.PDFParserConfig;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+
+import java.io.InputStream;
+
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assume.assumeTrue;
+
+public class TesseractOCRTest extends TikaTest {
+
+    // Asks ExternalParser whether the tesseract command is usable; tests are skipped otherwise.
+    private boolean canRun(TesseractOCRConfig config) {
+        String tesseract = config.getTesseractPath() + "tesseract";
+        return ExternalParser.check(new String[] {tesseract});
+    }
+
+    @Test
+    public void testPDFOCR() throws Exception {
+        // Skip (rather than fail) when tesseract is not available on this machine.
+        TesseractOCRConfig ocrConfig = new TesseractOCRConfig();
+        assumeTrue(canRun(ocrConfig));
+
+        // Ask the PDF parser to extract inline images as well.
+        PDFParserConfig pdfConfig = new PDFParserConfig();
+        pdfConfig.setExtractInlineImages(true);
+
+        ParseContext parseContext = new ParseContext();
+        parseContext.set(TesseractOCRConfig.class, ocrConfig);
+        parseContext.set(Parser.class, new TesseractOCRParser());
+        parseContext.set(PDFParserConfig.class, pdfConfig);
+
+        Parser autoDetect = new AutoDetectParser();
+        BodyContentHandler body = new BodyContentHandler();
+        InputStream pdf = TesseractOCRTest.class.getResourceAsStream(
+                "/test-documents/testOCR.pdf");
+        try {
+            autoDetect.parse(pdf, body, new Metadata(), parseContext);
+            String text = body.toString();
+            assertTrue(text.contains("Happy New Year 2003!"));
+        } finally {
+            pdf.close();
+        }
+    }
+
+    @Test
+    public void testDOCXOCR() throws Exception {
+        // Skip (rather than fail) when tesseract is not available on this machine.
+        TesseractOCRConfig ocrConfig = new TesseractOCRConfig();
+        assumeTrue(canRun(ocrConfig));
+
+        ParseContext parseContext = new ParseContext();
+        parseContext.set(TesseractOCRConfig.class, ocrConfig);
+        parseContext.set(Parser.class, new TesseractOCRParser());
+
+        Parser autoDetect = new AutoDetectParser();
+        BodyContentHandler body = new BodyContentHandler();
+        InputStream docx = TesseractOCRTest.class.getResourceAsStream(
+                "/test-documents/testOCR.docx");
+
+        try {
+            autoDetect.parse(docx, body, new Metadata(), parseContext);
+            String text = body.toString();
+            // The first phrase is the one expected from the image; the others are document text.
+            assertTrue(text.contains("Happy New Year 2003!"));
+            assertTrue(text.contains("This is some text."));
+            assertTrue(text.contains("Here is an embedded image:"));
+        } finally {
+            docx.close();
+        }
+    }
+
+    @Test
+    public void testPPTXOCR() throws Exception {
+        // Skip (rather than fail) when tesseract is not available on this machine.
+        TesseractOCRConfig ocrConfig = new TesseractOCRConfig();
+        assumeTrue(canRun(ocrConfig));
+
+        ParseContext parseContext = new ParseContext();
+        parseContext.set(TesseractOCRConfig.class, ocrConfig);
+        parseContext.set(Parser.class, new TesseractOCRParser());
+
+        Parser autoDetect = new AutoDetectParser();
+        BodyContentHandler body = new BodyContentHandler();
+        InputStream pptx = TesseractOCRTest.class.getResourceAsStream(
+                "/test-documents/testOCR.pptx");
+
+        try {
+            // Parse once, then check the collected text for both expected phrases.
+            autoDetect.parse(pptx, body, new Metadata(), parseContext);
+            String text = body.toString();
+
+            assertTrue("Check for the image's text.", text.contains("Happy New Year 2003!"));
+            assertTrue("Check for the standard text.", text.contains("This is some text"));
+        } finally {
+            pptx.close();
+        }
+    }
+}
Index: tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
===================================================================
--- tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java (revision 1624766)
+++ tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java (working copy)
@@ -562,6 +562,8 @@
Set knownMetadataDiffs = new HashSet();
//PDFBox-1792/Tika-1203
knownMetadataDiffs.add("testAnnotations.pdf");
+ // Added for TIKA-93.
+ knownMetadataDiffs.add("testOCR.pdf");
//empty for now
Set knownContentDiffs = new HashSet();