Index: tika-core/src/main/java/org/apache/tika/parser/ImageMetadataParser.java
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
--- tika-core/src/main/java/org/apache/tika/parser/ImageMetadataParser.java	(revision )
+++ tika-core/src/main/java/org/apache/tika/parser/ImageMetadataParser.java	(revision )
@@ -0,0 +1,24 @@
+package org.apache.tika.parser;
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Abstract base class for parsers that extract image metadata, added so the OCR parsers can pick one.
+ * Extends AbstractParser (the image parsers' previous superclass) so they keep its inherited members.
+ */
+public abstract class ImageMetadataParser extends AbstractParser {
+}
Index: tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.ImageMetadataParser
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
--- tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.ImageMetadataParser	(revision )
+++ tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.ImageMetadataParser	(revision )
@@ -0,0 +1,19 @@
+#  Licensed to the Apache Software Foundation (ASF) under one or more
+#  contributor license agreements.  See the NOTICE file distributed with
+#  this work for additional information regarding copyright ownership.
+#  The ASF licenses this file to You under the Apache License, Version 2.0
+#  (the "License"); you may not use this file except in compliance with
+#  the License.  You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+org.apache.tika.parser.image.ImageParser
+org.apache.tika.parser.image.PSDParser
+org.apache.tika.parser.image.TiffParser
+org.apache.tika.parser.jpeg.JpegParser
Index: tika-parsers/src/main/java/org/apache/tika/parser/image/TiffParser.java
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
--- tika-parsers/src/main/java/org/apache/tika/parser/image/TiffParser.java	(date 1414169767000)
+++ tika-parsers/src/main/java/org/apache/tika/parser/image/TiffParser.java	(revision )
@@ -26,14 +26,14 @@
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ImageMetadataParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.image.xmp.JempboxExtractor;
 import org.apache.tika.sax.XHTMLContentHandler;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 
-public class TiffParser extends AbstractParser {
+public class TiffParser extends ImageMetadataParser {
 
     /** Serial version UID */
     private static final long serialVersionUID = -3941143576535464926L;
Index: tika-parsers/src/main/java/org/apache/tika/parser/image/ImageParser.java
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
--- tika-parsers/src/main/java/org/apache/tika/parser/image/ImageParser.java	(date 1414169767000)
+++ tika-parsers/src/main/java/org/apache/tika/parser/image/ImageParser.java	(revision )
@@ -16,6 +16,11 @@
  */
 package org.apache.tika.parser.image;
 
+import javax.imageio.IIOException;
+import javax.imageio.ImageIO;
+import javax.imageio.ImageReader;
+import javax.imageio.metadata.IIOMetadata;
+import javax.imageio.stream.ImageInputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.util.Arrays;
@@ -24,19 +29,13 @@
 import java.util.Iterator;
 import java.util.Set;
 
-import javax.imageio.IIOException;
-import javax.imageio.ImageIO;
-import javax.imageio.ImageReader;
-import javax.imageio.metadata.IIOMetadata;
-import javax.imageio.stream.ImageInputStream;
-
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.io.CloseShieldInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.Property;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ImageMetadataParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.sax.XHTMLContentHandler;
 import org.w3c.dom.NamedNodeMap;
@@ -44,7 +43,7 @@
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 
-public class ImageParser extends AbstractParser {
+public class ImageParser extends ImageMetadataParser {
 
     /** Serial version UID */
     private static final long serialVersionUID = 7852529269245520335L;
@@ -62,10 +61,12 @@
                 MediaType.image("x-icon"),
                 MediaType.image("x-xcf"))));
 
+    @Override
     public Set<MediaType> getSupportedTypes(ParseContext context) {
         return SUPPORTED_TYPES;
     }
 
+    @Override
     public void parse(
             InputStream stream, ContentHandler handler,
             Metadata metadata, ParseContext context)
Index: tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
--- tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser	(date 1414169767000)
+++ tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser	(revision )
@@ -24,12 +24,8 @@
 org.apache.tika.parser.font.AdobeFontMetricParser
 org.apache.tika.parser.font.TrueTypeParser
 org.apache.tika.parser.html.HtmlParser
-org.apache.tika.parser.image.ImageParser
-org.apache.tika.parser.image.PSDParser
-org.apache.tika.parser.image.TiffParser
 org.apache.tika.parser.iptc.IptcAnpaParser
 org.apache.tika.parser.iwork.IWorkPackageParser
-org.apache.tika.parser.jpeg.JpegParser
 org.apache.tika.parser.mail.RFC822Parser
 org.apache.tika.parser.mbox.MboxParser
 org.apache.tika.parser.mbox.OutlookPSTParser
\ No newline at end of file
Index: tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
--- tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java	(date 1414169767000)
+++ tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java	(revision )
@@ -16,6 +16,7 @@
  */
 package org.apache.tika.parser.ocr;
 
+import javax.imageio.ImageIO;
 import java.awt.Graphics2D;
 import java.awt.Image;
 import java.awt.image.BufferedImage;
@@ -26,9 +27,7 @@
 import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.io.Reader;
-import java.util.ArrayList;
 import java.util.HashSet;
-import java.util.List;
 import java.util.Map;
 import java.util.Set;
 import java.util.concurrent.Callable;
@@ -37,25 +36,19 @@
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.TimeoutException;
 
-import javax.imageio.ImageIO;
-
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.io.IOUtils;
 import org.apache.tika.io.TemporaryResources;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AbstractParser;
 import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.external.ExternalParser;
-import org.apache.tika.parser.image.ImageParser;
-import org.apache.tika.parser.image.PSDParser;
-import org.apache.tika.parser.image.TiffParser;
-import org.apache.tika.parser.jpeg.JpegParser;
+import org.apache.tika.parser.image.DefaultImageMetadataParser;
 import org.apache.tika.sax.XHTMLContentHandler;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
 
 /**
  * TesseractOCRParser powered by tesseract-ocr engine. To enable this parser,
@@ -71,7 +64,7 @@
  * 
  * 
  */
-public class TesseractOCRParser extends AbstractParser {
+public class TesseractOCRParser extends DefaultImageMetadataParser {
 
   private static final long serialVersionUID = 1L;
 
@@ -135,22 +128,32 @@
   public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
       throws IOException, SAXException, TikaException {
 
-    TesseractOCRConfig config = context.get(TesseractOCRConfig.class);
-    if (config == null)
-      config = new TesseractOCRConfig();
+      TesseractOCRConfig config = context.get(TesseractOCRConfig.class);
+      if (config == null)
+          config = new TesseractOCRConfig();
 
-    String[] checkCmd = { config.getTesseractPath() + "tesseract" };
+      String[] checkCmd = {config.getTesseractPath() + "tesseract"};
-    // If Tesseract is not on the path, do not try to run OCR.
+      // If Tesseract is not on the path, do not try to run OCR.
-    if (!ExternalParser.check(checkCmd))
+
+      if (!ExternalParser.check(checkCmd)) {
+          super.parse(stream, handler, metadata, context);
-      return;
+          return;
+      }
 
     XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
 
     TemporaryResources tmp = new TemporaryResources();
     File output = null;
+    InputStream metaIs = null;
     try {
       TikaInputStream tikaStream = TikaInputStream.get(stream, tmp);
       File input = tikaStream.getFile();
+      //get the metadata from the image file
+      metaIs = new FileInputStream(input);
+      super.parse(metaIs, new DefaultHandler(), metadata, context);
+      IOUtils.closeQuietly(metaIs);
+
+      //now do the OCR if appropriate
       long size = tikaStream.getLength();
 
       if (size >= config.getMinFileSizeToOcr() && size <= config.getMaxFileSizeToOcr()) {
@@ -168,6 +171,7 @@
 
     } finally {
       tmp.dispose();
+      IOUtils.closeQuietly(metaIs);
       if (output != null)
         output.delete();
 
Index: tika-parsers/src/main/java/org/apache/tika/parser/image/PSDParser.java
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
--- tika-parsers/src/main/java/org/apache/tika/parser/image/PSDParser.java	(date 1414169767000)
+++ tika-parsers/src/main/java/org/apache/tika/parser/image/PSDParser.java	(revision )
@@ -31,7 +31,7 @@
 import org.apache.tika.metadata.TIFF;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ImageMetadataParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.sax.XHTMLContentHandler;
 import org.xml.sax.ContentHandler;
@@ -43,7 +43,7 @@
  * Documentation on the file format is available from
  * http://www.adobe.com/devnet-apps/photoshop/fileformatashtml/PhotoshopFileFormats.htm
  */
-public class PSDParser extends AbstractParser {
+public class PSDParser extends ImageMetadataParser {
 
     /** Serial version UID */
     private static final long serialVersionUID = 883387734607994914L;
@@ -52,10 +52,12 @@
         Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
                 MediaType.image("vnd.adobe.photoshop"))));
 
+    @Override
     public Set<MediaType> getSupportedTypes(ParseContext context) {
         return SUPPORTED_TYPES;
     }
 
+    @Override
     public void parse(
             InputStream stream, ContentHandler handler,
             Metadata metadata, ParseContext context)
Index: tika-parsers/src/main/java/org/apache/tika/parser/jpeg/JpegParser.java
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
--- tika-parsers/src/main/java/org/apache/tika/parser/jpeg/JpegParser.java	(date 1414169767000)
+++ tika-parsers/src/main/java/org/apache/tika/parser/jpeg/JpegParser.java	(revision )
@@ -26,7 +26,7 @@
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ImageMetadataParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.image.ImageMetadataExtractor;
 import org.apache.tika.parser.image.xmp.JempboxExtractor;
@@ -34,7 +34,7 @@
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 
-public class JpegParser extends AbstractParser {
+public class JpegParser extends ImageMetadataParser {
 
     /** Serial version UID */
     private static final long serialVersionUID = -1355028253756234603L;
@@ -42,10 +42,12 @@
     private static final Set<MediaType> SUPPORTED_TYPES =
         Collections.singleton(MediaType.image("jpeg"));
 
+    @Override
     public Set<MediaType> getSupportedTypes(ParseContext context) {
         return SUPPORTED_TYPES;
     }
 
+    @Override
     public void parse(
             InputStream stream, ContentHandler handler,
             Metadata metadata, ParseContext context)
Index: tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
--- tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java	(date 1414169767000)
+++ tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java	(revision )
@@ -16,23 +16,34 @@
  */
 package org.apache.tika.parser.ocr;
 
+import java.io.InputStream;
+import java.util.List;
+
 import org.apache.tika.TikaTest;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.RecursiveParserWrapper;
 import org.apache.tika.parser.external.ExternalParser;
 import org.apache.tika.parser.pdf.PDFParserConfig;
-import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.BasicContentHandlerFactory;
+import org.junit.Before;
 import org.junit.Test;
+import org.xml.sax.helpers.DefaultHandler;
 
-import java.io.InputStream;
-
+import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertTrue;
-import static org.junit.Assume.assumeTrue;
 
 public class TesseractOCRParserTest extends TikaTest {
 
+    private static boolean canRun = false;
+
+    @Before
+    public void setCanRun() { // JUnit @Before hook: re-evaluated ahead of every test method
+        canRun = canRun(); // cache the availability check in the static flag that testBasicOCR reads
+    }
+
     public static boolean canRun() {
         TesseractOCRConfig config = new TesseractOCRConfig();
         TesseractOCRParserTest tesseractOCRTest = new TesseractOCRParserTest();
@@ -47,83 +58,77 @@
 
     @Test
     public void testPDFOCR() throws Exception {
-        TesseractOCRConfig config = new TesseractOCRConfig();
-        assumeTrue(canRun(config));
+        String resource = "/test-documents/testOCR.pdf";
+        String[] nonOCRContains = new String[0]; // no non-OCR text is asserted for the PDF fixture
 
-        Parser parser = new AutoDetectParser();
-        BodyContentHandler handler = new BodyContentHandler();
-        Metadata metadata = new Metadata();
-
-        PDFParserConfig pdfConfig = new PDFParserConfig();
-        pdfConfig.setExtractInlineImages(true);
-
-        ParseContext parseContext = new ParseContext();
-        parseContext.set(TesseractOCRConfig.class, config);
-        parseContext.set(Parser.class, new TesseractOCRParser());
-        parseContext.set(PDFParserConfig.class, pdfConfig);
-
-        InputStream stream = TesseractOCRParserTest.class.getResourceAsStream(
-                "/test-documents/testOCR.pdf");
-
-        try {
-            parser.parse(stream, handler, metadata, parseContext);
-            assertTrue(handler.toString().contains("Happy New Year 2003!"));
-        } finally {
-            stream.close();
+        testBasicOCR(resource, nonOCRContains, 2); // 2 = expected size of the wrapper's metadata list
-        }
+    }
-    }
-
     @Test
     public void testDOCXOCR() throws Exception {
-        TesseractOCRConfig config = new TesseractOCRConfig();
-        assumeTrue(canRun(config));
-
-        Parser parser = new AutoDetectParser();
-        BodyContentHandler handler = new BodyContentHandler();
-        Metadata metadata = new Metadata();
-
-        ParseContext parseContext = new ParseContext();
-        parseContext.set(TesseractOCRConfig.class, config);
-        parseContext.set(Parser.class, new TesseractOCRParser());
-
-        InputStream stream = TesseractOCRParserTest.class.getResourceAsStream(
-                "/test-documents/testOCR.docx");
-
-        try {
-            parser.parse(stream, handler, metadata, parseContext);
-
-            assertTrue(handler.toString().contains("Happy New Year 2003!"));
-            assertTrue(handler.toString().contains("This is some text."));
-            assertTrue(handler.toString().contains("Here is an embedded image:"));
-        } finally {
-            stream.close();
+        String resource = "/test-documents/testOCR.docx";
+        String[] nonOCRContains = { // text that must be found regardless of whether OCR ran
+            "This is some text.",
+            "Here is an embedded image:"
+        };
+        testBasicOCR(resource, nonOCRContains, 3); // 3 = expected size of the wrapper's metadata list
-        }
+    }
+    @Test
+    public void testPPTXOCR() throws Exception {
+        String resource = "/test-documents/testOCR.pptx";
+        String[] nonOCRContains = { // text that must be found regardless of whether OCR ran
+                "This is some text"
+        };
+        testBasicOCR(resource, nonOCRContains,3); // 3 = expected size of the wrapper's metadata list
     }
 
     @Test
-    public void testPPTXOCR() throws Exception {
+    public void testSingleImage() throws Exception {
+        Metadata m = getXML("testGIF.gif").metadata; // getXML is not defined in this hunk; presumably inherited from TikaTest - confirm
+        assertTrue(m.names().length > 20); // expects more than 20 metadata names for the standalone GIF
+        assertEquals("RGB", m.get("Chroma ColorSpaceType")); // spot-check one concrete metadata value
+    }
+
+    private void testBasicOCR(String resource, String[] nonOCRContains, int numMetadatas) throws Exception { // shared driver: parse resource, then check metadata count, extracted text and image metadata
         TesseractOCRConfig config = new TesseractOCRConfig();
-        assumeTrue(canRun(config));
 
-        Parser parser = new AutoDetectParser();
-        BodyContentHandler handler = new BodyContentHandler();
-        Metadata metadata = new Metadata();
+        Parser parser = new RecursiveParserWrapper(new AutoDetectParser(),
+                new BasicContentHandlerFactory(
+                        BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1));
 
+        PDFParserConfig pdfConfig = new PDFParserConfig();
+        pdfConfig.setExtractInlineImages(true);
+
         ParseContext parseContext = new ParseContext();
         parseContext.set(TesseractOCRConfig.class, config);
-        parseContext.set(Parser.class, new TesseractOCRParser());
+        parseContext.set(Parser.class, parser);
+        parseContext.set(PDFParserConfig.class, pdfConfig);
 
         InputStream stream = TesseractOCRParserTest.class.getResourceAsStream(
-                "/test-documents/testOCR.pptx");
+                resource);
 
         try {
-            parser.parse(stream, handler, metadata, parseContext);
-
-            assertTrue("Check for the image's text.", handler.toString().contains("Happy New Year 2003!"));
-            assertTrue("Check for the standard text.", handler.toString().contains("This is some text"));
+            parser.parse(stream, new DefaultHandler(), new Metadata(), parseContext); // handler and Metadata passed here are throwaway; results are read from the wrapper below
         } finally {
             stream.close();
         }
+        List<Metadata> metadataList = ((RecursiveParserWrapper)parser).getMetadata(); // presumably one entry per container/embedded document - confirm
+        assertEquals(numMetadatas, metadataList.size());
 
+        StringBuilder contents = new StringBuilder();
+        for (Metadata m : metadataList) {
+            contents.append(m.get(RecursiveParserWrapper.TIKA_CONTENT)); // concatenate the TIKA_CONTENT value of every entry
-    }
+        }
+        if (canRun) { // the OCR-derived text is only asserted when the canRun flag is set
+            assertTrue(contents.toString().contains("Happy New Year 2003!"));
+        }
+        for (String needle : nonOCRContains) { // these strings are asserted unconditionally
+            assertContains(needle, contents.toString());
+        }
+        assertTrue(metadataList.get(0).names().length > 10);
+        assertTrue(metadataList.get(1).names().length > 10);
+        //spot-check one concrete value from the second metadata entry
+        assertEquals("deflate", metadataList.get(1).get("Compression CompressionTypeName"));
+    }
+
+
 }
