Index: tika-core/src/main/java/org/apache/tika/parser/OCRParser.java
===================================================================
--- tika-core/src/main/java/org/apache/tika/parser/OCRParser.java	(revision 0)
+++ tika-core/src/main/java/org/apache/tika/parser/OCRParser.java	(working copy)
@@ -0,0 +1,10 @@
+package org.apache.tika.parser;
+
+
+/**
+ * Marker interface for {@link Parser} implementations that extract text from
+ * images via optical character recognition; it declares no methods of its own.
+ **/
+public interface OCRParser extends Parser  {
+
+}
Index: tika-parent/pom.xml
===================================================================
--- tika-parent/pom.xml	(revision 1565689)
+++ tika-parent/pom.xml	(working copy)
@@ -263,6 +263,63 @@
           <target>${maven.compile.target}</target>
         </configuration>
       </plugin>
+      <plugin>
+        <groupId>org.jacoco</groupId>
+        <artifactId>jacoco-maven-plugin</artifactId>
+        <version>0.6.4.201312101107</version>
+        <configuration>
+            <rules>
+              <rule implementation="org.jacoco.maven.RuleConfiguration">
+                <element>BUNDLE</element>
+                <limits>
+                  <limit implementation="org.jacoco.report.check.Limit">
+                    <counter>INSTRUCTION</counter>
+                    <value>COVEREDRATIO</value>
+                    <minimum>0.50</minimum>
+                  </limit>
+                  <!--<limit implementation="org.jacoco.report.check.Limit">
+                    <counter>CLASS</counter>
+                    <value>MISSEDCOUNT</value>
+                    <maximum>0</maximum>
+                  </limit>-->
+                </limits>
+              </rule>
+            </rules>
+            <haltOnFailure>false</haltOnFailure>
+          </configuration>
+        <executions>
+          <execution>
+            <id>default-prepare-agent</id>
+            <goals>
+              <goal>prepare-agent</goal>
+            </goals>
+          </execution>
+          <execution>
+            <id>default-prepare-agent-integration</id>
+            <goals>
+              <goal>prepare-agent-integration</goal>
+            </goals>
+          </execution>
+          <execution>
+            <id>default-report</id>
+            <goals>
+              <goal>report</goal>
+            </goals>
+          </execution>
+          <execution>
+            <id>default-report-integration</id>
+            <goals>
+              <goal>report-integration</goal>
+            </goals>
+          </execution>
+          <execution>
+            <id>default-check</id>
+            <goals>
+              <goal>check</goal>
+            </goals>
+          </execution>
+        </executions>
+      </plugin>
     </plugins>
     <pluginManagement>
       <plugins>
Index: tika-parsers/pom.xml
===================================================================
--- tika-parsers/pom.xml	(revision 1565689)
+++ tika-parsers/pom.xml	(working copy)
@@ -197,6 +197,37 @@
       </exclusions>
     </dependency>
 
+    <!-- JavaOCR deps -->
+    <dependency>
+          <groupId>net.sourceforge.javaocr</groupId>
+          <artifactId>javaocr-core</artifactId>
+          <version>1.0</version>
+    </dependency>
+    <dependency>
+          <groupId>net.sourceforge.javaocr.plugins</groupId>
+          <artifactId>javaocr-plugin-awt</artifactId>
+          <version>1.0</version>
+    </dependency>
+    <dependency>
+          <groupId>net.sourceforge.javaocr.plugins</groupId>
+          <artifactId>javaocr-plugin-morphology</artifactId>
+          <version>1.0</version>
+    </dependency>
+    <dependency>
+          <groupId>net.sourceforge.javaocr.plugins</groupId>
+          <artifactId>javaocr-plugin-fir</artifactId>
+          <version>1.0</version>
+    </dependency>
+    <dependency>
+          <groupId>net.sourceforge.javaocr.plugins</groupId>
+          <artifactId>javaocr-plugin-moment</artifactId>
+          <version>1.0</version>
+    </dependency>
+    <dependency>
+          <groupId>net.sourceforge.javaocr.plugins</groupId>
+          <artifactId>javaocr-plugin-cluster</artifactId>
+          <version>1.0</version>
+    </dependency>
     <!-- Test dependencies -->
     <dependency>
       <groupId>junit</groupId>
Index: tika-parsers/src/main/java/org/apache/tika/parser/javaocr/JavaOCRParser.java
===================================================================
--- tika-parsers/src/main/java/org/apache/tika/parser/javaocr/JavaOCRParser.java	(revision 0)
+++ tika-parsers/src/main/java/org/apache/tika/parser/javaocr/JavaOCRParser.java	(working copy)
@@ -0,0 +1,143 @@
+package org.apache.tika.parser.javaocr;
+
+
+import net.sourceforge.javaocr.ocrPlugins.mseOCR.CharacterRange;
+import net.sourceforge.javaocr.ocrPlugins.mseOCR.OCRScanner;
+import net.sourceforge.javaocr.ocrPlugins.mseOCR.TrainingImage;
+import net.sourceforge.javaocr.ocrPlugins.mseOCR.TrainingImageLoader;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.OCRParser;
+import org.apache.tika.parser.ParseContext;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import javax.imageio.ImageIO;
+import java.awt.*;
+import java.awt.image.BufferedImage;
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * A Parser that uses the JavaOCR project (http://sourceforge.net/projects/javaocr/) to extract text
+ * from JPEG, TIFF, BMP and PNG images.
+ * After construction, you must train the parser (see the train methods) before using it to parse.
+ *
+ * See http://roncemer.com/software-development/java-ocr/
+ */
+public class JavaOCRParser implements OCRParser {
+
+  /** Image media types reported by {@link #getSupportedTypes(ParseContext)}. */
+  private final static Set<MediaType> supportedTypes =
+          Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
+                  MediaType.image("jpeg"),
+                  MediaType.image("tiff"),
+                  MediaType.image("bmp"),
+                  MediaType.image("png")
+          )));
+  private OCRScanner scanner;
+  private CharacterRange[] acceptableChars = null;
+
+  public JavaOCRParser() {
+    scanner = new OCRScanner();//TODO: confirm whether OCRScanner is thread safe before sharing one parser across threads
+  }
+  /** Creates a parser whose given character ranges are passed to the scanner on every call to parse. */
+  public JavaOCRParser(CharacterRange[] acceptableChars) {
+    scanner = new OCRScanner();
+    this.acceptableChars = acceptableChars;
+  }
+
+  @Override
+  public Set<MediaType> getSupportedTypes(ParseContext context) {
+    return supportedTypes;
+  }
+
+  /** Decodes the stream with ImageIO, scans the image and sends the text to handler.characters; throws IOException if no image can be decoded. */
+  @Override
+  public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
+    BufferedImage image = ImageIO.read(stream);
+    if (image != null) {
+      String text = scanner.scan(image, 0, 0, 0, 0, acceptableChars);
+      handler.characters(text.toCharArray(), 0, text.length());
+    } else {
+      throw new IOException("Unable to load the image from the stream");
+    }
+  }
+
+  /**
+   * Clears the training images held by the underlying scanner.
+   */
+  public void clearTraining(){
+    scanner.clearTrainingImages();
+  }
+
+  /**
+   * Trains from a single in-memory image and adds the resulting training images to the scanner.
+   * @param image The image to train from
+   * @param range The {@link CharacterRange} represented in the image
+   * @param imageName The name of the image, passed through to the {@link TrainingImageLoader}
+   * @throws IOException if the image is not loadable
+   */
+  public void train(Image image, CharacterRange range, String imageName) throws IOException {
+    HashMap<Character, ArrayList<TrainingImage>> trainingImageMap = new HashMap<Character, ArrayList<TrainingImage>>();
+    TrainingImageLoader loader = new TrainingImageLoader();
+    loader.load(image, range, trainingImageMap, imageName);
+    scanner.addTrainingImages(trainingImageMap);
+  }
+
+  public static final Pattern CHARACTER_RANGE_PATTERN = Pattern.compile("_(\\d+)_(\\d+)\\.");
+  /**
+   * Load training images.  All files must be named with the form of:  prefix_[MIN_INT]_[MAX_INT].suffix, where
+   * MIN_INT and MAX_INT are integers representing the lower and upper bounds of the {@link CharacterRange}
+   * that the image contains training info for.  For instance, ascii_33_126.png contains
+   * training data for the character range from ! (character 33) to ~ (character 126).
+   *
+   * Subdirectories are searched recursively.  A null or non-directory argument is silently ignored.
+   * See the test resources javaocr-train directory for examples of training data.
+   * @param trainingImageDir The directory from which to load the images.
+   * @throws IOException if a file name lacks a valid range (both bounds greater than 0, max not below min)
+   */
+  public void train(File trainingImageDir) throws IOException {
+    if (trainingImageDir != null && trainingImageDir.isDirectory()){
+        TrainingImageLoader loader = new TrainingImageLoader();
+        HashMap<Character, ArrayList<TrainingImage>> trainingImageMap = new HashMap<Character, ArrayList<TrainingImage>>();
+      File[] files = trainingImageDir.listFiles();
+      if (files != null) {
+        for (File file : files) {
+          if (file.isDirectory()) {//recurse
+            train(file);
+          } else {
+            String name = file.getName();
+            int min = -1;
+            int max = -1;
+            Matcher matcher = CHARACTER_RANGE_PATTERN.matcher(name);
+            if (matcher.find()){
+              min = Integer.parseInt(matcher.group(1));
+              max = Integer.parseInt(matcher.group(2));
+            } else {
+              throw new IOException("Unable to find character range specification in file: " + file);
+            }
+            if (min > 0 && max > 0 && max >= min) {
+              loader.load(file.getAbsolutePath(), new CharacterRange(min, max), trainingImageMap);
+            } else {
+              //TODO: should we just log that we couldn't handle this or throw an exception?
+              throw new IOException("Incorrect character range specification for file: " + file);
+            }
+          }
+        }
+        scanner.addTrainingImages(trainingImageMap);
+      }
+    }
+
+  }
+}
Index: tika-parsers/src/test/java/org/apache/tika/parser/javaocr/JavaOCRParserTest.java
===================================================================
--- tika-parsers/src/test/java/org/apache/tika/parser/javaocr/JavaOCRParserTest.java	(revision 0)
+++ tika-parsers/src/test/java/org/apache/tika/parser/javaocr/JavaOCRParserTest.java	(working copy)
@@ -0,0 +1,47 @@
+package org.apache.tika.parser.javaocr;
+
+
+import net.sourceforge.javaocr.ocrPlugins.mseOCR.CharacterRange;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.WriteOutContentHandler;
+import org.junit.Assert;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+import javax.imageio.ImageIO;
+import java.awt.*;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.StringWriter;
+
+import static org.junit.Assert.assertEquals;
+
+/**
+ * Trains a {@link JavaOCRParser} on the bundled javaocr-train images and checks
+ * the text it extracts from javaocr-test/asciiSentence.png.
+ **/
+public class JavaOCRParserTest {
+  @Test
+  public void testBasics() throws Exception {
+    JavaOCRParser parser = new JavaOCRParser();
+    train(parser, "javaocr-train/ascii_33_126.png", new CharacterRange('!', '~'));
+    train(parser, "javaocr-train/hpljPica_33_126.jpg", new CharacterRange('!', '~'));
+    train(parser, "javaocr-train/digits_48_59.jpg", new CharacterRange('0', '9'));
+
+    // Parse the sample sentence image and capture the characters the parser emits in a StringWriter.
+    StringWriter writer = new StringWriter();
+    ContentHandler handler = new WriteOutContentHandler(writer);
+    Metadata meta = new Metadata();
+    ParseContext context = new ParseContext();
+    parser.parse(JavaOCRParserTest.class.getClassLoader().getResourceAsStream("javaocr-test/asciiSentence.png"), handler, meta, context);
+    assertEquals("Happy New Year 2003!", writer.getBuffer().toString());
+  }
+
+  private void train(JavaOCRParser parser, String name, CharacterRange charRange) throws IOException {
+    InputStream resourceAsStream = JavaOCRParserTest.class.getClassLoader().getResourceAsStream(name);
+    Assert.assertNotNull(resourceAsStream);
+    Image image = ImageIO.read(resourceAsStream);
+    parser.train(image, charRange, name);
+  }
+}
Index: tika-parsers/src/test/resources/README
===================================================================
--- tika-parsers/src/test/resources/README	(revision 0)
+++ tika-parsers/src/test/resources/README	(working copy)
@@ -0,0 +1 @@
+The javaocr-train and javaocr-test directories are copied from the training and test data of the JavaOCR project, which is distributed under the Apache license.
\ No newline at end of file
Index: tika-parsers/src/test/resources/javaocr-test/asciiSentence.png
===================================================================
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Index: tika-parsers/src/test/resources/javaocr-test/asciiSentence.png
===================================================================
--- tika-parsers/src/test/resources/javaocr-test/asciiSentence.png	(revision 1565689)
+++ tika-parsers/src/test/resources/javaocr-test/asciiSentence.png	(working copy)

Property changes on: tika-parsers/src/test/resources/javaocr-test/asciiSentence.png
___________________________________________________________________
Added: svn:mime-type
## -0,0 +1 ##
+application/octet-stream
\ No newline at end of property
Index: tika-parsers/src/test/resources/javaocr-test/asciiShuffled.png
===================================================================
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Index: tika-parsers/src/test/resources/javaocr-test/asciiShuffled.png
===================================================================
--- tika-parsers/src/test/resources/javaocr-test/asciiShuffled.png	(revision 1565689)
+++ tika-parsers/src/test/resources/javaocr-test/asciiShuffled.png	(working copy)

Property changes on: tika-parsers/src/test/resources/javaocr-test/asciiShuffled.png
___________________________________________________________________
Added: svn:mime-type
## -0,0 +1 ##
+application/octet-stream
\ No newline at end of property
Index: tika-parsers/src/test/resources/javaocr-test/digits.jpg
===================================================================
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Index: tika-parsers/src/test/resources/javaocr-test/digits.jpg
===================================================================
--- tika-parsers/src/test/resources/javaocr-test/digits.jpg	(revision 1565689)
+++ tika-parsers/src/test/resources/javaocr-test/digits.jpg	(working copy)

Property changes on: tika-parsers/src/test/resources/javaocr-test/digits.jpg
___________________________________________________________________
Added: svn:mime-type
## -0,0 +1 ##
+application/octet-stream
\ No newline at end of property
Index: tika-parsers/src/test/resources/javaocr-test/fieldnames.txt
===================================================================
--- tika-parsers/src/test/resources/javaocr-test/fieldnames.txt	(revision 0)
+++ tika-parsers/src/test/resources/javaocr-test/fieldnames.txt	(working copy)
@@ -0,0 +1,57 @@
+Avoid field names which may be used by database engines, or may match SQL types
+or keywords in any give database engine.  Also note that some database engines
+require that field names begin with a letter.  The following are some examples
+of problematic field names and some better suggestions:
+    password      (use passwd instead)
+    length        (use sizeLen instead)
+    type          (use typeCode instead)
+    message       (use messageText instead)
+    year          (use yearCode instead)
+    limit         (use maximum instead)
+    date          (use systemDateStamp instead)
+    dateStamp     (use systemDateStamp instead)
+    time          (use systemTimeStamp instead)
+    timeStamp     (use systemTimeStamp instead)
+    class         (use classCode instead)
+    _1099Number   (use vendor1099Number instead)
+Since each database engine is different, you have to be especially careful when
+naming fields.  Field names which are fine on one database engine could clash
+with names of proprietary extensions in another database engine.
+
+Do not create field names which would match root field names of arrays after the
+removal of numeric subscripts.  Here's an example:
+    contactNote
+    contactNote2
+    contactNote3
+Note that contactNote2 and contactNote3 get converted to a Java array, named
+contactNote.  That name clashes with the existing contactNote field (which is
+actually a separate Java variable) and causes a compiler error when trying to
+compile the entity bean.  The correct way to do this would be:
+    contactNote1
+    contactNote2
+    contactNote3
+Also note that the same rules apply when the subscripts are embeddd within the
+names.  The following would cause the same error:
+    contactNote
+    contact1Note
+    contact2Note
+As would the following:
+    contactNote1
+    contact1Note1
+    contact1Note2
+    contact2Note1
+    contact2Note2
+
+Keep tables small, both in record size and in total number of fields.  Most
+database engines can't handle huge records or records with several hundred
+fields.  If you need more fields, create an additional table with the new
+fields and the same primary key fields as in the original table.  You can easily
+look up the record in the new table only when you need it, as opposed to loading
+the new fields every time the original table is accessed.  This will make the
+software run faster in general.
+
+After making even the SLIGHTEST change to ANY table's schema, test the entire
+system again on EVERY supported database engine.  The minimum test should be
+to create an empty set of all tables in each supported database engine and check
+for any errors while creating the tables.
+
Index: tika-parsers/src/test/resources/javaocr-test/hpljPicaSample.jpg
===================================================================
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Index: tika-parsers/src/test/resources/javaocr-test/hpljPicaSample.jpg
===================================================================
--- tika-parsers/src/test/resources/javaocr-test/hpljPicaSample.jpg	(revision 1565689)
+++ tika-parsers/src/test/resources/javaocr-test/hpljPicaSample.jpg	(working copy)

Property changes on: tika-parsers/src/test/resources/javaocr-test/hpljPicaSample.jpg
___________________________________________________________________
Added: svn:mime-type
## -0,0 +1 ##
+application/octet-stream
\ No newline at end of property
Index: tika-parsers/src/test/resources/javaocr-test/shuffledDigits.jpg
===================================================================
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Index: tika-parsers/src/test/resources/javaocr-test/shuffledDigits.jpg
===================================================================
--- tika-parsers/src/test/resources/javaocr-test/shuffledDigits.jpg	(revision 1565689)
+++ tika-parsers/src/test/resources/javaocr-test/shuffledDigits.jpg	(working copy)

Property changes on: tika-parsers/src/test/resources/javaocr-test/shuffledDigits.jpg
___________________________________________________________________
Added: svn:mime-type
## -0,0 +1 ##
+application/octet-stream
\ No newline at end of property
Index: tika-parsers/src/test/resources/javaocr-train/ascii_33_126.png
===================================================================
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Index: tika-parsers/src/test/resources/javaocr-train/ascii_33_126.png
===================================================================
--- tika-parsers/src/test/resources/javaocr-train/ascii_33_126.png	(revision 1565689)
+++ tika-parsers/src/test/resources/javaocr-train/ascii_33_126.png	(working copy)

Property changes on: tika-parsers/src/test/resources/javaocr-train/ascii_33_126.png
___________________________________________________________________
Added: svn:mime-type
## -0,0 +1 ##
+application/octet-stream
\ No newline at end of property
Index: tika-parsers/src/test/resources/javaocr-train/digits_48_59.jpg
===================================================================
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Index: tika-parsers/src/test/resources/javaocr-train/digits_48_59.jpg
===================================================================
--- tika-parsers/src/test/resources/javaocr-train/digits_48_59.jpg	(revision 1565689)
+++ tika-parsers/src/test/resources/javaocr-train/digits_48_59.jpg	(working copy)

Property changes on: tika-parsers/src/test/resources/javaocr-train/digits_48_59.jpg
___________________________________________________________________
Added: svn:mime-type
## -0,0 +1 ##
+application/octet-stream
\ No newline at end of property
Index: tika-parsers/src/test/resources/javaocr-train/hpljPica_33_126.jpg
===================================================================
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Index: tika-parsers/src/test/resources/javaocr-train/hpljPica_33_126.jpg
===================================================================
--- tika-parsers/src/test/resources/javaocr-train/hpljPica_33_126.jpg	(revision 1565689)
+++ tika-parsers/src/test/resources/javaocr-train/hpljPica_33_126.jpg	(working copy)

Property changes on: tika-parsers/src/test/resources/javaocr-train/hpljPica_33_126.jpg
___________________________________________________________________
Added: svn:mime-type
## -0,0 +1 ##
+application/octet-stream
\ No newline at end of property
