Index: NOTICE.txt
===================================================================
--- NOTICE.txt	(revision 1565689)
+++ NOTICE.txt	(working copy)
@@ -13,3 +13,5 @@
 OpenCSV: Copyright 2005 Bytecode Pty Ltd. Licensed under the Apache License, Version 2.0
 
 IPTC Photo Metadata descriptions Copyright 2010 International Press Telecommunications Council.
+
+JavaOCR: http://sourceforge.net/projects/javaocr/ Licensed under the Apache License, Version 2.0
Index: tika-core/src/main/java/org/apache/tika/parser/OCRParser.java
===================================================================
--- tika-core/src/main/java/org/apache/tika/parser/OCRParser.java	(revision 0)
+++ tika-core/src/main/java/org/apache/tika/parser/OCRParser.java	(working copy)
@@ -0,0 +1,48 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import java.io.IOException;
+
+/**
+ * A {@link Parser} that can additionally parse an image that has already been
+ * decoded into a {@link java.awt.Image}; intended for optical character
+ * recognition (OCR) implementations.
+ */
+public interface OCRParser extends Parser {
+
+    /**
+     * Parses an already decoded image. Convenience alternative to
+     * {@link Parser#parse(java.io.InputStream, ContentHandler, Metadata, ParseContext)}
+     * for callers that hold an {@link java.awt.Image} rather than a stream.
+     *
+     * @param image    the image to parse
+     * @param handler  the {@link ContentHandler} that receives the extracted content
+     * @param metadata the {@link Metadata} associated with the image
+     * @param context  the {@link ParseContext} of the current parse
+     * @throws IOException   if the image cannot be read
+     * @throws SAXException  if the handler fails to process an event
+     * @throws TikaException if the image cannot be parsed
+     */
+    void parse(java.awt.Image image, ContentHandler handler, Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException;
+}
Index: tika-core/src/main/java/org/apache/tika/parser/ParseContext.java
===================================================================
--- tika-core/src/main/java/org/apache/tika/parser/ParseContext.java	(revision 1565689)
+++ tika-core/src/main/java/org/apache/tika/parser/ParseContext.java	(working copy)
@@ -43,8 +43,49 @@
 
     /** Map of objects in this context */
     private final Map<String, Object> context = new HashMap<String, Object>();
- 
-    /**
+
+    /**
+     * OCR parser made available to parsers through this context;
+     * {@code null} when none has been configured.
+     * NOTE(review): if ParseContext is Serializable this field needs to be
+     * transient (or OCRParser serializable) - confirm.
+     */
+    private OCRParser ocrParser;
+
+    /**
+     * Creates a context without an OCR parser.
+     */
+    public ParseContext() {
+    }
+
+    /**
+     * Creates a context that carries the given OCR parser.
+     *
+     * @param ocrParser the OCR parser to expose, may be {@code null}
+     */
+    public ParseContext(OCRParser ocrParser) {
+        this.ocrParser = ocrParser;
+    }
+
+    /**
+     * Returns the OCR parser carried by this context.
+     *
+     * @return the OCR parser, or {@code null} if none has been set
+     */
+    public OCRParser getOcrParser() {
+        return ocrParser;
+    }
+
+    /**
+     * Sets the OCR parser carried by this context.
+     *
+     * @param ocrParser the OCR parser to expose, may be {@code null}
+     */
+    public void setOcrParser(OCRParser ocrParser) {
+        this.ocrParser = ocrParser;
+    }
+
+    /**
      * Adds the given value to the context as an implementation of the given
      * interface.
      *
Index: tika-parent/pom.xml
===================================================================
--- tika-parent/pom.xml	(revision 1565689)
+++ tika-parent/pom.xml	(working copy)
@@ -263,6 +263,63 @@
           <target>${maven.compile.target}</target>
         </configuration>
       </plugin>
+      <plugin>
+        <groupId>org.jacoco</groupId>
+        <artifactId>jacoco-maven-plugin</artifactId>
+        <version>0.6.4.201312101107</version>
+        <configuration>
+          <haltOnFailure>false</haltOnFailure>
+          <rules>
+            <rule implementation="org.jacoco.maven.RuleConfiguration">
+              <element>BUNDLE</element>
+              <limits>
+                <limit implementation="org.jacoco.report.check.Limit">
+                  <counter>INSTRUCTION</counter>
+                  <value>COVEREDRATIO</value>
+                  <minimum>0.50</minimum>
+                </limit>
+                <!--<limit implementation="org.jacoco.report.check.Limit">
+                  <counter>CLASS</counter>
+                  <value>MISSEDCOUNT</value>
+                  <maximum>0</maximum>
+                </limit>-->
+              </limits>
+            </rule>
+          </rules>
+        </configuration>
+        <executions>
+          <execution>
+            <id>default-prepare-agent</id>
+            <goals>
+              <goal>prepare-agent</goal>
+            </goals>
+          </execution>
+          <execution>
+            <id>default-prepare-agent-integration</id>
+            <goals>
+              <goal>prepare-agent-integration</goal>
+            </goals>
+          </execution>
+          <execution>
+            <id>default-report</id>
+            <goals>
+              <goal>report</goal>
+            </goals>
+          </execution>
+          <execution>
+            <id>default-report-integration</id>
+            <goals>
+              <goal>report-integration</goal>
+            </goals>
+          </execution>
+          <execution>
+            <id>default-check</id>
+            <goals>
+              <goal>check</goal>
+            </goals>
+          </execution>
+        </executions>
+      </plugin>
     </plugins>
     <pluginManagement>
       <plugins>
Index: tika-parsers/pom.xml
===================================================================
--- tika-parsers/pom.xml	(revision 1565689)
+++ tika-parsers/pom.xml	(working copy)
@@ -197,6 +197,45 @@
       </exclusions>
     </dependency>
 
+    <!-- JavaOCR dependencies -->
+    <dependency>
+      <groupId>net.sourceforge.javaocr</groupId>
+      <artifactId>javaocr-core</artifactId>
+      <version>1.0</version>
+    </dependency>
+    <dependency>
+      <groupId>net.sourceforge.javaocr.plugins</groupId>
+      <artifactId>javaocr-plugin-awt</artifactId>
+      <version>1.0</version>
+    </dependency>
+    <dependency>
+      <groupId>net.sourceforge.javaocr.plugins</groupId>
+      <artifactId>javaocr-plugin-morphology</artifactId>
+      <version>1.0</version>
+    </dependency>
+    <dependency>
+      <groupId>net.sourceforge.javaocr.plugins</groupId>
+      <artifactId>javaocr-plugin-fir</artifactId>
+      <version>1.0</version>
+    </dependency>
+    <dependency>
+      <groupId>net.sourceforge.javaocr.plugins</groupId>
+      <artifactId>javaocr-plugin-moment</artifactId>
+      <version>1.0</version>
+    </dependency>
+    <dependency>
+      <groupId>net.sourceforge.javaocr.plugins</groupId>
+      <artifactId>javaocr-plugin-cluster</artifactId>
+      <version>1.0</version>
+    </dependency>
+    <!-- jai-imageio-core (https://github.com/stain/jai-imageio-core):
+         picks up new image file types for ImageIO
+    -->
+    <dependency>
+      <groupId>net.java.dev.jai-imageio</groupId>
+      <artifactId>jai-imageio-core-standalone</artifactId>
+      <version>1.2-pre-dr-b04-2013-04-23</version>
+    </dependency>
     <!-- Test dependencies -->
     <dependency>
       <groupId>junit</groupId>
@@ -298,4 +337,15 @@
   	<system>Jenkins</system>
   	<url>https://builds.apache.org/job/Tika-trunk/</url>
   </ciManagement>
+  <repositories>
+    <repository>
+      <id>mygrid-repository</id>
+      <name>myGrid Repository</name>
+      <url>http://www.mygrid.org.uk/maven/repository</url>
+      <releases />
+      <snapshots>
+        <enabled>false</enabled>
+      </snapshots>
+    </repository>
+  </repositories>
 </project>
Index: tika-parsers/src/main/java/org/apache/tika/parser/javaocr/JavaOCRParser.java
===================================================================
--- tika-parsers/src/main/java/org/apache/tika/parser/javaocr/JavaOCRParser.java	(revision 0)
+++ tika-parsers/src/main/java/org/apache/tika/parser/javaocr/JavaOCRParser.java	(working copy)
@@ -0,0 +1,219 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.javaocr;
+
+import net.sourceforge.javaocr.ocrPlugins.mseOCR.CharacterRange;
+import net.sourceforge.javaocr.ocrPlugins.mseOCR.OCRScanner;
+import net.sourceforge.javaocr.ocrPlugins.mseOCR.TrainingImage;
+import net.sourceforge.javaocr.ocrPlugins.mseOCR.TrainingImageLoader;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.IOExceptionWithCause;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.OCRParser;
+import org.apache.tika.parser.ParseContext;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import javax.imageio.ImageIO;
+import java.awt.*;
+import java.awt.image.BufferedImage;
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * A Parser that uses the JavaOCR project (http://sourceforge.net/projects/javaocr/)
+ * to extract text from images.
+ * <p>
+ * After construction, the parser must be trained (see {@link #train(File)} and
+ * {@link #train(Image, CharacterRange, String)}) before it is used.
+ * <p>
+ * See http://roncemer.com/software-development/java-ocr/
+ */
+public class JavaOCRParser implements OCRParser {
+
+    /**
+     * Locates the <code>_[MIN_INT]_[MAX_INT].</code> part of a training image file
+     * name; group 1 is the lower bound and group 2 the upper bound of the range.
+     */
+    public static final Pattern CHARACTER_RANGE_PATTERN = Pattern.compile("_(\\-?\\d+)_(\\-?\\d+)\\.");
+
+    /** Image types this parser reports as supported. */
+    private static final Set<MediaType> SUPPORTED_TYPES =
+            Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
+                    MediaType.image("jpeg"),
+                    MediaType.image("tiff"),
+                    MediaType.image("bmp"),
+                    MediaType.image("png")
+            )));
+
+    /** Scanner that receives the training images and performs the scans. */
+    private final OCRScanner scanner;
+
+    /** Ranges passed to the scanner on every scan; may be {@code null}. */
+    private final CharacterRange[] acceptableChars;
+
+    /**
+     * Creates an untrained parser that passes no acceptable-character ranges to the scanner.
+     */
+    public JavaOCRParser() {
+        this(null);
+    }
+
+    /**
+     * Creates an untrained parser.
+     *
+     * @param acceptableChars the character ranges handed to the scanner for every
+     *                        scan, or {@code null}
+     */
+    public JavaOCRParser(CharacterRange[] acceptableChars) {
+        this.scanner = new OCRScanner();
+        this.acceptableChars = acceptableChars;
+        ImageIO.scanForPlugins(); // see if we can load up any additional file types
+    }
+
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Scans the image and, if any text is recognised, sends it to the handler as
+     * a single {@code characters} event.
+     *
+     * @throws IOException if {@code image} is {@code null}
+     */
+    @Override
+    public void parse(Image image, ContentHandler handler, Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        if (image == null) {
+            throw new IOException("Unable to load the image from the stream");
+        }
+        String text = scanner.scan(image, 0, 0, 0, 0, acceptableChars);
+        if (text != null && !text.isEmpty()) {
+            handler.characters(text.toCharArray(), 0, text.length());
+        }
+    }
+
+    /**
+     * Decodes the stream with {@link ImageIO#read(InputStream)} and parses the
+     * resulting image.
+     *
+     * @throws IOException if the stream cannot be decoded into an image
+     */
+    @Override
+    public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        parse(ImageIO.read(stream), handler, metadata, context);
+    }
+
+    /**
+     * Clear out the training values
+     */
+    public void clearTraining() {
+        scanner.clearTrainingImages();
+    }
+
+    /**
+     * Train based off of a single image
+     *
+     * @param image     The Image to train off of
+     * @param range     The {@link CharacterRange} represented in the image
+     * @param imageName The name of the image.
+     * @throws IOException if the image is not loadable
+     */
+    public void train(Image image, CharacterRange range, String imageName) throws IOException {
+        HashMap<Character, ArrayList<TrainingImage>> trainingImageMap =
+                new HashMap<Character, ArrayList<TrainingImage>>();
+        new TrainingImageLoader().load(image, range, trainingImageMap, imageName);
+        scanner.addTrainingImages(trainingImageMap);
+    }
+
+    /**
+     * Load training images.  All files must be named with the form of:  prefix_[MIN_INT]_[MAX_INT].suffix, where
+     * MIN_INT and MAX_INT are integers representing the lower and upper bounds of the {@link CharacterRange}
+     * that the image contains training info for.  For instance, ascii_33_126.png contains
+     * training data for the character range from ! (character 33) to ~ (character 126)
+     * <p>
+     * The File passed in must be a directory.  This method will recurse through all subdirectories looking for training
+     * data.  See the Test resources javaocr-train directory for examples of training data.
+     *
+     * @param trainingImageDir The directory from which to load the images.
+     * @return The number of files trained; 0 if the argument is null or not a directory
+     * @throws IOException if a file name carries no valid character range or an image cannot be loaded
+     */
+    public int train(File trainingImageDir) throws IOException {
+        if (trainingImageDir == null || !trainingImageDir.isDirectory()) {
+            return 0;
+        }
+        File[] files = trainingImageDir.listFiles();
+        if (files == null) {
+            return 0;
+        }
+        int trained = 0;
+        TrainingImageLoader loader = new TrainingImageLoader();
+        HashMap<Character, ArrayList<TrainingImage>> trainingImageMap =
+                new HashMap<Character, ArrayList<TrainingImage>>();
+        for (File file : files) {
+            if (file.isDirectory()) {
+                trained += train(file); // recurse
+            } else {
+                loader.load(file.getAbsolutePath(), parseCharacterRange(file), trainingImageMap);
+                trained++;
+            }
+        }
+        scanner.addTrainingImages(trainingImageMap);
+        return trained;
+    }
+
+    /**
+     * Extracts the character range encoded in a training file's name.
+     *
+     * @param file the training image file
+     * @return the range described by the name
+     * @throws IOException if the name has no range, a bound does not fit an int,
+     *                     a bound is negative or the upper bound is below the lower one
+     */
+    private static CharacterRange parseCharacterRange(File file) throws IOException {
+        Matcher matcher = CHARACTER_RANGE_PATTERN.matcher(file.getName());
+        if (!matcher.find()) {
+            throw new IOException("Unable to find character range specification in file: " + file);
+        }
+        int min;
+        int max;
+        try {
+            min = Integer.parseInt(matcher.group(1));
+            max = Integer.parseInt(matcher.group(2));
+        } catch (NumberFormatException e) {
+            // digits that overflow an int must fail like any other bad range
+            throw new IOExceptionWithCause("Incorrect character range specification for file: " + file, e);
+        }
+        if (min < 0 || max < min) {
+            throw new IOException("Incorrect character range specification for file: " + file);
+        }
+        return new CharacterRange(min, max);
+    }
+}
Index: tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
===================================================================
--- tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java	(revision 1565689)
+++ tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java	(working copy)
@@ -26,15 +26,20 @@
 import java.util.Map;
 import java.util.TreeMap;
 
+import net.sourceforge.javaocr.ocrPlugins.mseOCR.OCRScanner;
 import org.apache.pdfbox.pdmodel.PDDocument;
 import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
 import org.apache.pdfbox.pdmodel.PDDocumentNameDictionary;
 import org.apache.pdfbox.pdmodel.PDEmbeddedFilesNameTreeNode;
 import org.apache.pdfbox.pdmodel.PDPage;
+import org.apache.pdfbox.pdmodel.PDResources;
 import org.apache.pdfbox.pdmodel.common.COSObjectable;
 import org.apache.pdfbox.pdmodel.common.PDNameTreeNode;
 import org.apache.pdfbox.pdmodel.common.filespecification.PDComplexFileSpecification;
 import org.apache.pdfbox.pdmodel.common.filespecification.PDEmbeddedFile;
+import org.apache.pdfbox.pdmodel.graphics.xobject.PDXObject;
+import org.apache.pdfbox.pdmodel.graphics.xobject.PDXObjectForm;
+import org.apache.pdfbox.pdmodel.graphics.xobject.PDXObjectImage;
 import org.apache.pdfbox.pdmodel.interactive.action.type.PDAction;
 import org.apache.pdfbox.pdmodel.interactive.action.type.PDActionURI;
 import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationLink;
@@ -54,6 +59,7 @@
 import org.apache.tika.io.IOExceptionWithCause;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.OCRParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.sax.EmbeddedContentHandler;
 import org.apache.tika.sax.XHTMLContentHandler;
@@ -128,8 +134,10 @@
     private final ParseContext context;
     private final XHTMLContentHandler handler;
     private final PDFParserConfig config;
+    private final Metadata metadata;
+    private final OCRParser ocrParser;
     
-    private PDF2XHTML(ContentHandler handler, ParseContext context, Metadata metadata, 
+    private PDF2XHTML(ContentHandler handler, ParseContext context, Metadata metadata,
             PDFParserConfig config)
             throws IOException {
         //source of config (derives from context or PDFParser?) is
@@ -138,6 +146,8 @@
         this.originalHandler = handler;
         this.context = context;
         this.handler = new XHTMLContentHandler(handler, metadata);
+        this.metadata = metadata;
+        this.ocrParser = context.getOcrParser();
         setForceParsing(true);
         setSortByPosition(config.getSortByPosition());
         if (config.getEnableAutoSpace()) {
@@ -209,9 +219,70 @@
         } catch (SAXException e) {
             throw new IOExceptionWithCause("Unable to start a page", e);
         }
+        if (ocrParser != null && config.getUseOcr()) {
+            // if we have OCR, then try to parse the page's images, but wrap the output in a div
+            try {
+                handler.startElement("div", "class", "ocr");
+            } catch (SAXException e) {
+                throw new IOExceptionWithCause("Unable to start OCR", e);
+            }
+            processResources(page.getResources());
+            try {
+                handler.endElement("div");
+            } catch (SAXException e) {
+                throw new IOExceptionWithCause("Unable to end OCR", e);
+            }
+        }
         writeParagraphStart();
     }
 
+    /**
+     * Runs OCR over every image XObject in the given resources, descending into
+     * form XObjects, which may hold further images.
+     * NOTE(review): there is no guard against self-referencing form XObjects.
+     *
+     * @param resources the resources to inspect, may be {@code null}
+     * @throws IOException if an image cannot be read or OCR of an image fails
+     */
+    private void processResources(PDResources resources) throws IOException {
+        if (resources == null) {
+            return;
+        }
+        Map<String, PDXObject> xobjects = resources.getXObjects();
+        if (xobjects == null) {
+            return;
+        }
+        for (PDXObject xobject : xobjects.values()) {
+            if (xobject instanceof PDXObjectImage) {
+                ocrImage((PDXObjectImage) xobject);
+            } else if (xobject instanceof PDXObjectForm) {
+                // maybe there are more images embedded in a form object
+                processResources(((PDXObjectForm) xobject).getResources());
+            }
+        }
+    }
+
+    /**
+     * Hands a single embedded image to the OCR parser.
+     *
+     * @param image the image XObject to recognise text in
+     * @throws IOException if the image cannot be read or the OCR parser fails
+     */
+    private void ocrImage(PDXObjectImage image) throws IOException {
+        java.awt.image.BufferedImage rgbImage = image.getRGBImage();
+        if (rgbImage == null) {
+            // no image was produced; skip it rather than fail the whole document
+            return;
+        }
+        try {
+            ocrParser.parse(rgbImage, handler, metadata, context);
+        } catch (SAXException e) {
+            throw new IOExceptionWithCause("Unable to OCR an embedded image", e);
+        } catch (TikaException e) {
+            throw new IOExceptionWithCause("Unable to OCR an embedded image", e);
+        }
+    }
+
     @Override
     protected void endPage(PDPage page) throws IOException {
 
Index: tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
===================================================================
--- tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java	(revision 1565689)
+++ tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java	(working copy)
@@ -63,6 +63,8 @@
     //True if acroform content should be extracted
     private boolean extractAcroFormContent = true;
 
+    private boolean useOcr = false;
+
     public PDFParserConfig(){
         init(this.getClass().getResourceAsStream("PDFParser.properties"));
     }
@@ -114,6 +116,9 @@
         setExtractAcroFormContent(
                 getProp(props.getProperty("extractAcroFormContent"),
                 getExtractAcroFormContent()));
+        setUseOcr(getProp(props.getProperty("useOcr"),
+                getUseOcr()));
+
     }
 
     
@@ -215,7 +220,25 @@
         this.useNonSequentialParser = useNonSequentialParser;
     }
 
+    /**
+     * Returns whether images embedded in PDF pages are passed to the OCR
+     * parser of the parse context, when one is set.
+     *
+     * @return {@code true} if OCR is enabled; the field defaults to {@code false}
+     */
+    public boolean getUseOcr() {
+        return useOcr;
+    }
+
+    /**
+     * Enables or disables OCR of images embedded in PDF pages.
+     *
+     * @param useOcr {@code true} to enable OCR
+     */
+    public void setUseOcr(boolean useOcr) {
+        this.useOcr = useOcr;
+    }
+
     private boolean getProp(String p, boolean defaultMissing){
         if (p == null){
             return defaultMissing;
         }
Index: tika-parsers/src/test/java/org/apache/tika/parser/javaocr/JavaOCRParserTest.java
===================================================================
--- tika-parsers/src/test/java/org/apache/tika/parser/javaocr/JavaOCRParserTest.java	(revision 0)
+++ tika-parsers/src/test/java/org/apache/tika/parser/javaocr/JavaOCRParserTest.java	(working copy)
@@ -0,0 +1,240 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.javaocr;
+
+import net.sourceforge.javaocr.ocrPlugins.mseOCR.CharacterRange;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.IOUtils;
+import org.apache.tika.io.NullInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.pdf.PDFParser;
+import org.apache.tika.parser.pdf.PDFParserTest;
+import org.apache.tika.sax.WriteOutContentHandler;
+import org.junit.Assert;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import javax.imageio.ImageIO;
+import java.awt.*;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.StringWriter;
+
+import static org.junit.Assert.assertEquals;
+
+/**
+ * Tests for {@link JavaOCRParser}: direct image parsing, training from single
+ * images and from a directory, and OCR of images embedded in a PDF.
+ */
+public class JavaOCRParserTest {
+
+    /** Parses test images with a trained, an untrained and a range-restricted parser. */
+    @Test
+    public void testBasics() throws Exception {
+        JavaOCRParser parser = new JavaOCRParser();
+        trainAll(parser);
+
+        assertEquals("Happy New Year 2003!", ocr(parser, "javaocr-test/asciiSentence.png"));
+        assertEquals("!\"#$%&' ()*+,- wxyz{|}~\n" +
+                "./0123456789:;<=>?@\n" +
+                "abcdefghijklmnopqrstuv\n" +
+                "PQRSTUVWXYZ[\\]^_`\n" +
+                "ABCDEFGHIJKLMNO", ocr(parser, "javaocr-test/asciiShuffled.png"));
+
+        // Test that we can't get anything w/o training data
+        parser.clearTraining();
+        assertEquals("", ocr(parser, "javaocr-test/asciiShuffled.png").trim());
+
+        // Bad input: a stream that does not hold an image must be rejected with an IOException
+        try {
+            parser.parse(new NullInputStream(10), new WriteOutContentHandler(new StringWriter()),
+                    new Metadata(), new ParseContext());
+            Assert.fail();
+        } catch (IOException e) {
+            // expected
+        }
+
+        // Restrict the acceptable characters to the range 65-73 (A-I)
+        parser = new JavaOCRParser(new CharacterRange[]{new CharacterRange(65, 73)});
+        trainAll(parser);
+        // not sure why this is, but it is what is returned
+        assertEquals("HBDDF HCA ICBF CDDI", ocr(parser, "javaocr-test/asciiSentence.png"));
+    }
+
+    /** Parses a PDF with OCR enabled and expects the text of its embedded image in the output. */
+    @Test
+    public void testOCR() throws Exception {
+        PDFParser tikaParser = new PDFParser();
+        tikaParser.getPDFParserConfig().setUseOcr(true);
+        JavaOCRParser ocrParser = new JavaOCRParser();
+        trainAll(ocrParser);
+        ParseContext context = new ParseContext();
+        context.setOcrParser(ocrParser);
+
+        StringWriter writer = new StringWriter();
+        ContentHandler handler = new WriteOutContentHandler(writer);
+        InputStream stream = PDFParserTest.class.getResourceAsStream("/test-documents/testOCR.pdf");
+        Assert.assertNotNull(stream);
+        try {
+            tikaParser.parse(stream, handler, new Metadata(), context);
+            assertEquals("Presentation1\n" + "\n" + "Happy New Year 2003!", writer.getBuffer().toString().trim());
+        } finally {
+            stream.close();
+        }
+    }
+
+    /** Trains from null, a plain file, an empty directory, a directory tree and badly named files. */
+    @Test
+    public void testTrainingDir() throws Exception {
+        JavaOCRParser parser = new JavaOCRParser();
+        assertEquals(0, parser.train(null));
+
+        File notADir = File.createTempFile("not_a_dir", ".txt");
+        File tempDir = File.createTempFile("tika-test-javaocr", "");
+        try {
+            writeText(notADir, "foo");
+            assertEquals(0, parser.train(notADir));
+
+            // turn the temp file into an empty directory
+            tempDir.delete();
+            tempDir.mkdirs();
+            assertEquals(0, parser.train(tempDir));
+
+            copyResource(tempDir, "javaocr-train/ascii_33_126.png", "ascii_33_126.png");
+            copyResource(tempDir, "javaocr-train/hpljPica_33_126.jpg", "hpljPica_33_126.jpg");
+            File recurseDir = new File(tempDir, "recurse");
+            recurseDir.mkdirs();
+            copyResource(recurseDir, "javaocr-train/digits_48_57.jpg", "digits_48_57.jpg");
+            assertEquals(3, parser.train(tempDir));
+            assertEquals("Happy New Year 2003!", ocr(parser, "javaocr-test/asciiSentence.png"));
+
+            //Create a bad file
+            checkBadFile(parser, tempDir, "foo_1.png");//no max specified
+            checkBadFile(parser, tempDir, "foo_10_2.png"); //min and max flipped
+            checkBadFile(parser, tempDir, "foo.png");// no spec
+            checkBadFile(parser, tempDir, "foo_-1_-1.png");//less than 0
+            checkBadFile(parser, tempDir, "foo_1_-1.png");//less than 0
+        } finally {
+            notADir.delete();
+            deleteRecursively(tempDir);
+        }
+    }
+
+    /** Expects an IOException when training from a blank 10x10 image. */
+    @Test
+    public void testBadImageTraining() throws Exception {
+        JavaOCRParser parser = new JavaOCRParser();
+        try {
+            parser.train(new java.awt.image.BufferedImage(10, 10, java.awt.image.BufferedImage.TYPE_INT_RGB),
+                    new CharacterRange(0, 10), "bad");
+            Assert.fail();
+        } catch (IOException e) {
+            // expected
+        }
+    }
+
+    /** Drops a junk file with the given name into the directory and expects training to fail. */
+    private void checkBadFile(JavaOCRParser parser, File tempDir, String fileName) throws IOException {
+        File file = new File(tempDir, fileName);
+        writeText(file, "This is junk");
+        parser.clearTraining();
+        try {
+            parser.train(tempDir);
+            Assert.fail();
+        } catch (IOException e) {
+            //expected
+        } finally {
+            file.delete();
+        }
+    }
+
+    /** Writes the text to the file and closes the writer so the content is flushed. */
+    private void writeText(File file, String text) throws IOException {
+        FileWriter out = new FileWriter(file);
+        try {
+            IOUtils.write(text, out);
+        } finally {
+            out.close();
+        }
+    }
+
+    /** Copies a classpath resource into the given directory under the given file name. */
+    private void copyResource(File tempDir, String input, String output) throws IOException {
+        InputStream in = JavaOCRParserTest.class.getClassLoader().getResourceAsStream(input);
+        Assert.assertNotNull(input, in);
+        try {
+            FileOutputStream out = new FileOutputStream(new File(tempDir, output));
+            try {
+                IOUtils.copy(in, out);
+            } finally {
+                out.close();
+            }
+        } finally {
+            in.close();
+        }
+    }
+
+    /** Deletes the file or, for a directory, its whole subtree. */
+    private static void deleteRecursively(File file) {
+        File[] children = file.listFiles();
+        if (children != null) {
+            for (File child : children) {
+                deleteRecursively(child);
+            }
+        }
+        file.delete();
+    }
+
+    /** Trains the parser with the three training images shipped as test resources. */
+    private void trainAll(JavaOCRParser parser) throws IOException {
+        train(parser, "javaocr-train/ascii_33_126.png", new CharacterRange('!', '~'));
+        train(parser, "javaocr-train/hpljPica_33_126.jpg", new CharacterRange('!', '~'));
+        train(parser, "javaocr-train/digits_48_57.jpg", new CharacterRange('0', '9'));
+    }
+
+    /** Trains the parser with a single training image loaded from the classpath. */
+    private void train(JavaOCRParser parser, String name, CharacterRange charRange) throws IOException {
+        InputStream resourceAsStream = JavaOCRParserTest.class.getClassLoader().getResourceAsStream(name);
+        Assert.assertNotNull(name, resourceAsStream);
+        try {
+            Image image = ImageIO.read(resourceAsStream);
+            parser.train(image, charRange, name);
+        } finally {
+            resourceAsStream.close();
+        }
+    }
+
+    /** Parses the given classpath image and returns everything written to the handler. */
+    private String ocr(JavaOCRParser parser, String resource) throws Exception {
+        StringWriter writer = new StringWriter();
+        InputStream stream = JavaOCRParserTest.class.getClassLoader().getResourceAsStream(resource);
+        Assert.assertNotNull(resource, stream);
+        try {
+            parser.parse(stream, new WriteOutContentHandler(writer), new Metadata(), new ParseContext());
+        } finally {
+            stream.close();
+        }
+        return writer.toString();
+    }
+}
Index: tika-parsers/src/test/resources/README
===================================================================
--- tika-parsers/src/test/resources/README	(revision 0)
+++ tika-parsers/src/test/resources/README	(working copy)
@@ -0,0 +1 @@
+The javaocr-train and javaocr-test directories are copied from the training and test data of the JavaOCR project (http://sourceforge.net/projects/javaocr/), which is licensed under the Apache License, Version 2.0.
\ No newline at end of file
Index: tika-parsers/src/test/resources/javaocr-test/asciiSentence.png
===================================================================
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Index: tika-parsers/src/test/resources/javaocr-test/asciiSentence.png
===================================================================
--- tika-parsers/src/test/resources/javaocr-test/asciiSentence.png	(revision 1565689)
+++ tika-parsers/src/test/resources/javaocr-test/asciiSentence.png	(working copy)

Property changes on: tika-parsers/src/test/resources/javaocr-test/asciiSentence.png
___________________________________________________________________
Added: svn:mime-type
## -0,0 +1 ##
+application/octet-stream
\ No newline at end of property
Index: tika-parsers/src/test/resources/javaocr-test/asciiShuffled.png
===================================================================
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Index: tika-parsers/src/test/resources/javaocr-test/asciiShuffled.png
===================================================================
--- tika-parsers/src/test/resources/javaocr-test/asciiShuffled.png	(revision 1565689)
+++ tika-parsers/src/test/resources/javaocr-test/asciiShuffled.png	(working copy)

Property changes on: tika-parsers/src/test/resources/javaocr-test/asciiShuffled.png
___________________________________________________________________
Added: svn:mime-type
## -0,0 +1 ##
+application/octet-stream
\ No newline at end of property
Index: tika-parsers/src/test/resources/javaocr-test/digits.jpg
===================================================================
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Index: tika-parsers/src/test/resources/javaocr-test/digits.jpg
===================================================================
--- tika-parsers/src/test/resources/javaocr-test/digits.jpg	(revision 1565689)
+++ tika-parsers/src/test/resources/javaocr-test/digits.jpg	(working copy)

Property changes on: tika-parsers/src/test/resources/javaocr-test/digits.jpg
___________________________________________________________________
Added: svn:mime-type
## -0,0 +1 ##
+application/octet-stream
\ No newline at end of property
Index: tika-parsers/src/test/resources/javaocr-test/fieldnames.txt
===================================================================
--- tika-parsers/src/test/resources/javaocr-test/fieldnames.txt	(revision 0)
+++ tika-parsers/src/test/resources/javaocr-test/fieldnames.txt	(working copy)
@@ -0,0 +1,57 @@
+Avoid field names which may be used by database engines, or may match SQL types
+or keywords in any given database engine.  Also note that some database engines
+require that field names begin with a letter.  The following are some examples
+of problematic field names and some better suggestions:
+    password      (use passwd instead)
+    length        (use sizeLen instead)
+    type          (use typeCode instead)
+    message       (use messageText instead)
+    year          (use yearCode instead)
+    limit         (use maximum instead)
+    date          (use systemDateStamp instead)
+    dateStamp     (use systemDateStamp instead)
+    time          (use systemTimeStamp instead)
+    timeStamp     (use systemTimeStamp instead)
+    class         (use classCode instead)
+    _1099Number   (use vendor1099Number instead)
+Since each database engine is different, you have to be especially careful when
+naming fields.  Field names which are fine on one database engine could clash
+with names of proprietary extensions in another database engine.
+
+Do not create field names which would match root field names of arrays after the
+removal of numeric subscripts.  Here's an example:
+    contactNote
+    contactNote2
+    contactNote3
+Note that contactNote2 and contactNote3 get converted to a Java array, named
+contactNote.  That name clashes with the existing contactNote field (which is
+actually a separate Java variable) and causes a compiler error when trying to
+compile the entity bean.  The correct way to do this would be:
+    contactNote1
+    contactNote2
+    contactNote3
+Also note that the same rules apply when the subscripts are embedded within the
+names.  The following would cause the same error:
+    contactNote
+    contact1Note
+    contact2Note
+As would the following:
+    contactNote1
+    contact1Note1
+    contact1Note2
+    contact2Note1
+    contact2Note2
+
+Keep tables small, both in record size and in total number of fields.  Most
+database engines can't handle huge records or records with several hundred
+fields.  If you need more fields, create an additional table with the new
+fields and the same primary key fields as in the original table.  You can easily
+look up the record in the new table only when you need it, as opposed to loading
+the new fields every time the original table is accessed.  This will make the
+software run faster in general.
+
+After making even the SLIGHTEST change to ANY table's schema, test the entire
+system again on EVERY supported database engine.  The minimum test should be
+to create an empty set of all tables in each supported database engine and check
+for any errors while creating the tables.
+
Index: tika-parsers/src/test/resources/javaocr-test/hpljPicaSample.jpg
===================================================================
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Index: tika-parsers/src/test/resources/javaocr-test/hpljPicaSample.jpg
===================================================================
--- tika-parsers/src/test/resources/javaocr-test/hpljPicaSample.jpg	(revision 1565689)
+++ tika-parsers/src/test/resources/javaocr-test/hpljPicaSample.jpg	(working copy)

Property changes on: tika-parsers/src/test/resources/javaocr-test/hpljPicaSample.jpg
___________________________________________________________________
Added: svn:mime-type
## -0,0 +1 ##
+application/octet-stream
\ No newline at end of property
Index: tika-parsers/src/test/resources/javaocr-test/shuffledDigits.jpg
===================================================================
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Index: tika-parsers/src/test/resources/javaocr-test/shuffledDigits.jpg
===================================================================
--- tika-parsers/src/test/resources/javaocr-test/shuffledDigits.jpg	(revision 1565689)
+++ tika-parsers/src/test/resources/javaocr-test/shuffledDigits.jpg	(working copy)

Property changes on: tika-parsers/src/test/resources/javaocr-test/shuffledDigits.jpg
___________________________________________________________________
Added: svn:mime-type
## -0,0 +1 ##
+application/octet-stream
\ No newline at end of property
Index: tika-parsers/src/test/resources/javaocr-train/ascii_33_126.png
===================================================================
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Index: tika-parsers/src/test/resources/javaocr-train/ascii_33_126.png
===================================================================
--- tika-parsers/src/test/resources/javaocr-train/ascii_33_126.png	(revision 1565689)
+++ tika-parsers/src/test/resources/javaocr-train/ascii_33_126.png	(working copy)

Property changes on: tika-parsers/src/test/resources/javaocr-train/ascii_33_126.png
___________________________________________________________________
Added: svn:mime-type
## -0,0 +1 ##
+application/octet-stream
\ No newline at end of property
Index: tika-parsers/src/test/resources/javaocr-train/digits_48_57.jpg
===================================================================
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Index: tika-parsers/src/test/resources/javaocr-train/digits_48_57.jpg
===================================================================
--- tika-parsers/src/test/resources/javaocr-train/digits_48_57.jpg	(revision 1565689)
+++ tika-parsers/src/test/resources/javaocr-train/digits_48_57.jpg	(working copy)

Property changes on: tika-parsers/src/test/resources/javaocr-train/digits_48_57.jpg
___________________________________________________________________
Added: svn:mime-type
## -0,0 +1 ##
+application/octet-stream
\ No newline at end of property
Index: tika-parsers/src/test/resources/javaocr-train/hpljPica_33_126.jpg
===================================================================
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Index: tika-parsers/src/test/resources/javaocr-train/hpljPica_33_126.jpg
===================================================================
--- tika-parsers/src/test/resources/javaocr-train/hpljPica_33_126.jpg	(revision 1565689)
+++ tika-parsers/src/test/resources/javaocr-train/hpljPica_33_126.jpg	(working copy)

Property changes on: tika-parsers/src/test/resources/javaocr-train/hpljPica_33_126.jpg
___________________________________________________________________
Added: svn:mime-type
## -0,0 +1 ##
+application/octet-stream
\ No newline at end of property
Index: tika-parsers/src/test/resources/test-documents/testOCR.pdf
===================================================================
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Index: tika-parsers/src/test/resources/test-documents/testOCR.pdf
===================================================================
--- tika-parsers/src/test/resources/test-documents/testOCR.pdf	(revision 1565689)
+++ tika-parsers/src/test/resources/test-documents/testOCR.pdf	(working copy)

Property changes on: tika-parsers/src/test/resources/test-documents/testOCR.pdf
___________________________________________________________________
Added: svn:mime-type
## -0,0 +1 ##
+application/octet-stream
\ No newline at end of property