Index: CHANGES.txt
===================================================================
--- CHANGES.txt	(revision 1178200)
+++ CHANGES.txt	(working copy)
@@ -5,6 +5,9 @@
  * TIKA-632: Hyperlinks in RTF documents are now extracted as an <a
    href=...>...</a> element.
 
+ * TIKA-721: Tika now tries to detect UTF-16LE and UTF-16BE encoded
+   content that is missing the byte order mark.
+
 Release 0.10 - 09/25/2011
 
 The most notable changes in Tika 0.10 over previous releases are:
Index: tika-parsers/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java
===================================================================
--- tika-parsers/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java	(revision 1178200)
+++ tika-parsers/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java	(working copy)
@@ -215,4 +215,52 @@
         assertNotSame("IBM500", metadata.get(Metadata.CONTENT_ENCODING));
     }
 
+
+    public void testUTF16LENoBOM() throws Exception {
+        Metadata metadata = new Metadata();
+        StringWriter writer = new StringWriter();
+        parser.parse(
+                TXTParserTest.class.getResourceAsStream("/test-documents/chinese.utf16le.txt"),
+                new WriteOutContentHandler(writer),
+                metadata,
+                new ParseContext());
+
+        assertEquals("UTF-16LE", metadata.get(Metadata.CONTENT_ENCODING));
+    }
+
+    public void testUTF16LE() throws Exception {
+        Metadata metadata = new Metadata();
+        StringWriter writer = new StringWriter();
+        parser.parse(
+                TXTParserTest.class.getResourceAsStream("/test-documents/chinese.utf16le.bom.txt"),
+                new WriteOutContentHandler(writer),
+                metadata,
+                new ParseContext());
+
+        assertEquals("UTF-16LE", metadata.get(Metadata.CONTENT_ENCODING));
+    }
+
+    public void testUTF16BENoBOM() throws Exception {
+        Metadata metadata = new Metadata();
+        StringWriter writer = new StringWriter();
+        parser.parse(
+                TXTParserTest.class.getResourceAsStream("/test-documents/chinese.utf16be.txt"),
+                new WriteOutContentHandler(writer),
+                metadata,
+                new ParseContext());
+
+        assertEquals("UTF-16BE", metadata.get(Metadata.CONTENT_ENCODING));
+    }
+
+    public void testUTF16BE() throws Exception {
+        Metadata metadata = new Metadata();
+        StringWriter writer = new StringWriter();
+        parser.parse(
+                TXTParserTest.class.getResourceAsStream("/test-documents/chinese.utf16be.bom.txt"),
+                new WriteOutContentHandler(writer),
+                metadata,
+                new ParseContext());
+
+        assertEquals("UTF-16BE", metadata.get(Metadata.CONTENT_ENCODING));
+    }
 }
Index: tika-parsers/src/test/resources/test-documents/chinese.utf16be.txt
===================================================================
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream

Property changes on: tika-parsers/src/test/resources/test-documents/chinese.utf16be.txt
___________________________________________________________________
Added: svn:mime-type
   + application/octet-stream

Index: tika-parsers/src/test/resources/test-documents/chinese.utf16be.bom.txt
===================================================================
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream

Property changes on: tika-parsers/src/test/resources/test-documents/chinese.utf16be.bom.txt
___________________________________________________________________
Added: svn:mime-type
   + application/octet-stream

Index: tika-parsers/src/test/resources/test-documents/chinese.utf16le.txt
===================================================================
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream

Property changes on: tika-parsers/src/test/resources/test-documents/chinese.utf16le.txt
___________________________________________________________________
Added: svn:mime-type
   + application/octet-stream

Index: tika-parsers/src/test/resources/test-documents/chinese.utf16le.bom.txt
===================================================================
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream

Property changes on: tika-parsers/src/test/resources/test-documents/chinese.utf16le.bom.txt
___________________________________________________________________
Added: svn:mime-type
   + application/octet-stream

Index: tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetRecog_Unicode.java
===================================================================
--- tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetRecog_Unicode.java	(revision 1178200)
+++ tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetRecog_Unicode.java	(working copy)
@@ -7,6 +7,9 @@
  */
 package org.apache.tika.parser.txt;
 
+import java.util.HashMap;
+import java.util.Map;
+
 /**
  * This class matches UTF-16 and UTF-32, both big- and little-endian. The
  * BOM will be used if it is present.
@@ -40,8 +43,7 @@
                 return 100;
             }
             
-            // TODO: Do some statistics to check for unsigned UTF-16BE
-            return 0;
+            return new CharsetRecog_UTF_16_BE_NoBOM().match(det);
         }
     }
     
@@ -64,10 +66,9 @@
                    return 0;
                }
                return 100;
-            }        
-            
-            // TODO: Do some statistics to check for unsigned UTF-16LE
-            return 0;
+            }
+
+            return new CharsetRecog_UTF_16_LE_NoBOM().match(det);
         }
     }
     
@@ -151,4 +152,224 @@
             return "UTF-32LE";
         }
     }
+
+    /** Returns a measure ranging from 0.0 to 1.0.  Values close to 1.0
+     *  mean the bytes most likely use a single-byte encoding. */
+    static double getSingleByteLikelihood(CharsetDetector det) {
+
+        // Separately count up stats (how many times each
+        // byte value occurs) of the even and odd bytes,
+        // then compute dot-product of those two vectors.
+        // Single-byte encodings will tend to have very
+        // similar stats for even and odd bytes so the dot
+        // product will be close to 1.
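+        //
+        // For example, plain ASCII text yields nearly identical even and odd
+        // distributions (dot product near 1.0), while UTF-16LE-encoded ASCII
+        // puts 0x00 in every odd byte, so the two vectors barely overlap.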
+
+        final byte[] bytes = det.fRawInput;
+        if (bytes.length < 2) {
+            // Not enough bytes to compare even/odd distributions (and the
+            // vector lengths below would be zero); assume single-byte.
+            return 1.0;
+        }
+        final int[] counts1 = new int[256];
+        final int[] counts2 = new int[256];
+        // Round the length down to even so the paired loop below never
+        // reads past the end of the buffer.
+        final int end = bytes.length - (bytes.length & 1);
+        int idx = 0;
+        while (idx < end) {
+            counts1[bytes[idx++] & 0xFF]++;
+            counts2[bytes[idx++] & 0xFF]++;
+        }
+
+        // Normalize vectors to unit length
+        double sumSQ1 = 0.0;
+        double sumSQ2 = 0.0;
+
+        for (int byteValue = 0; byteValue < 256; byteValue++) {
+            int count = counts1[byteValue];
+            sumSQ1 += count * count;
+            count = counts2[byteValue];
+            sumSQ2 += count * count;
+        }
+
+        final double len1 = Math.sqrt(sumSQ1);
+        final double len2 = Math.sqrt(sumSQ2);
+        
+        // Dot-product:
+        double sumProd = 0.0;
+        for (int byteValue = 0; byteValue < 256; byteValue++) {
+            sumProd += (counts1[byteValue] / len1) * (counts2[byteValue] / len2);
+        }
+
+        return sumProd;
+    }
+
+    final static int HIGH_SURROGATE_START = 0xD800;
+    final static int HIGH_SURROGATE_END = 0xDBFF;
+    final static int LOW_SURROGATE_START = 0xDC00;
+    final static int LOW_SURROGATE_END = 0xDFFF;
+
+    static boolean isHighSurrogate(int unit) {
+        return unit >= HIGH_SURROGATE_START && unit <= HIGH_SURROGATE_END;
+    }
+
+    static boolean isLowSurrogate(int unit) {
+        return unit >= LOW_SURROGATE_START && unit <= LOW_SURROGATE_END;
+    }
+
+    /*
+     *  Fallback: used to detect UTF-16 LE/BE when there is no BOM.  We use
+     *  three stats:
+     *
+     *    1) Test whether the encoding seems to be double-byte, by comparing
+     *       the distribution of byte value counts for even and odd bytes
+     *       (see getSingleByteLikelihood).
+     *
+     *    2) Decode the Unicode code points and record how many invalid vs.
+     *       valid code units and characters we see.
+     *
+     *    3) Test whether a high percentage (> 60%) of the Unicode chars come
+     *       from the same block.
+     *
+     *  We heuristically combine these three stats into a final confidence.
+     */
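+    // For example, BOM-less UTF-16LE CJK text (such as the chinese.utf16le.txt
+    // test file) produces a low single-byte likelihood for (1), decodes almost
+    // entirely to defined characters for (2), and concentrates in the CJK
+    // Unified Ideographs block for (3), landing at confidence 50.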
+
+    static abstract class CharsetRecog_UTF_16_NoBOM extends CharsetRecog_Unicode {
+
+        // TODO: inefficient that we run this same code twice for CharsetDetector.detectAll
+
+        // NOTE: index is in UTF-16 code units, not bytes!
+        protected abstract int getCodeUnit(byte[] input, int index);
+
+        @Override
+        int match(CharsetDetector det) {
+
+            final byte[] bytes = det.fRawInput;
+            if (bytes.length < 2) {
+                return 0;
+            }
+            
+            //System.out.println("name=" + getName());
+            final double v = getSingleByteLikelihood(det);
+            //System.out.println("  singleByteLikelihood=" + v);
+            if (v > 0.70) {
+                // Short-circuit: input looks likely to be a single-byte encoding
+                return 0;
+            }
+
+            // Most likely this is a double-byte encoding:
+
+            int invalidSurrogateCount = 0;
+            int surrogateCount = 0;
+            int invalidCount = 0;
+            int validCount = 0;
+            
+            final Map<String,Integer> countsByBlock = new HashMap<String,Integer>();
+            final int end = bytes.length/2;
+            int idx = 0;
+            while (idx < end) {
+                final int unit = getCodeUnit(bytes, idx);
+                int code;
+                if (isHighSurrogate(unit)) {
+                    idx++;
+                    if (idx >= end) {
+                        invalidSurrogateCount++;
+                        break;
+                    }
+
+                    final int nextUnit = getCodeUnit(bytes, idx);
+                    if (isLowSurrogate(nextUnit)) {
+                        surrogateCount++;
+                        code = 0x10000 + ((unit - HIGH_SURROGATE_START) << 10) + (nextUnit - LOW_SURROGATE_START);
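+                        // e.g. the pair D83D DE00 decodes to U+1F600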
+                    } else {
+                        invalidSurrogateCount++;
+                        continue;
+                    }
+                } else if (isLowSurrogate(unit)) {
+                    invalidSurrogateCount++;
+                    idx++;
+                    continue;
+                } else {
+                    code = unit;
+                }
+
+                idx++;
+
+                if (Character.isDefined(code)) {
+                    validCount++;
+                    final String block = Character.UnicodeBlock.of(code).toString();
+                    Integer cur = countsByBlock.get(block);
+                    if (cur == null) {
+                        countsByBlock.put(block, 1);
+                    } else {
+                        countsByBlock.put(block, cur+1);
+                    }
+                } else {
+                    invalidCount++;
+                }
+            }
+
+            // Find the most common Unicode block.  countsByBlock is empty
+            // when no unit decoded to a defined character; maxCount then
+            // stays 0 rather than tripping an NPE below.
+            int maxCount = 0;
+            for (int count : countsByBlock.values()) {
+                if (count > maxCount) {
+                    maxCount = count;
+                }
+            }
+
+            final double maxBlockRatio = ((double) maxCount) / end;
+            /*
+            System.out.println("  validCount=" + validCount);
+            System.out.println("  invalidCount=" + invalidCount);
+            System.out.println("  surrogateCount=" + surrogateCount);
+            System.out.println("  invalidSurrogateCount=" + invalidSurrogateCount);
+            System.out.println("  blocks=" + countsByBlock);
+            System.out.println("  maxBlockRatio=" + maxBlockRatio);
+            */
+            final int confidence;
+            final int totalInvalidCount = invalidCount + invalidSurrogateCount;
+
+            // Cook up a confidence:
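+            // Surrogate pairs with no invalid data at all are strong evidence
+            // of UTF-16 (75).  Otherwise valid characters must outnumber
+            // invalid ones 100:1; block concentration >= 60% bumps the
+            // surrogate-free case from 25 to 50.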
+            if (surrogateCount != 0) {
+                if (totalInvalidCount == 0) {
+                    confidence = 75;
+                } else if (validCount > totalInvalidCount * 100) {
+                    confidence = 25;
+                } else {
+                    confidence = 0;
+                }
+            } else if (validCount > totalInvalidCount * 100) {
+                if (maxBlockRatio >= 0.6) {
+                    confidence = 50;
+                } else {
+                    confidence = 25;
+                }
+            } else {
+                confidence = 0;
+            }
+
+            //System.out.println("  confidence=" + confidence);
+            return confidence;
+        }
+    }
+
+    static final class CharsetRecog_UTF_16_LE_NoBOM extends CharsetRecog_UTF_16_NoBOM {
+        @Override
+        protected int getCodeUnit(byte[] input, int index) {
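+            // little-endian: the first byte of each pair is the low-order byte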
+            return (input[2*index] & 0xFF) | (input[2*index + 1] & 0xFF) << 8;
+        }
+
+        @Override
+        String getName() {
+            return "UTF-16LE";
+        }
+    }
+
+    static final class CharsetRecog_UTF_16_BE_NoBOM extends CharsetRecog_UTF_16_NoBOM {
+        @Override
+        protected int getCodeUnit(byte[] input, int index) {
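+            // big-endian: the first byte of each pair is the high-order byte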
+            return (input[2*index] & 0xFF) << 8 | (input[2*index + 1] & 0xFF);
+        }
+
+        @Override
+        String getName() {
+            return "UTF-16BE";
+        }
+    }
 }
