From 2244f29f9cd69e36140802a4c33d110a7e627d90 Mon Sep 17 00:00:00 2001
From: Jukka Zitting <jukka@apache.org>
Date: Wed, 1 Aug 2012 13:04:01 +0200
Subject: [PATCH] TIKA-965: Text Detection Fails on Mostly Non-ASCII UTF-8
 Files

Add simple UTF-8 detection heuristics based on the computed byte histogram
---
 .../java/org/apache/tika/detect/TextDetector.java  |    2 +-
 .../org/apache/tika/detect/TextStatistics.java     |   28 ++++++++++++++++++++
 2 files changed, 29 insertions(+), 1 deletion(-)

diff --git a/tika-core/src/main/java/org/apache/tika/detect/TextDetector.java b/tika-core/src/main/java/org/apache/tika/detect/TextDetector.java
index 09d3af0..50002e0 100644
--- a/tika-core/src/main/java/org/apache/tika/detect/TextDetector.java
+++ b/tika-core/src/main/java/org/apache/tika/detect/TextDetector.java
@@ -127,7 +127,7 @@ public class TextDetector implements Detector {
                 m = input.read(buffer, 0, Math.min(bytesToTest - n, buffer.length));
             }
 
-            if (stats.isMostlyAscii()) {
+            if (stats.isMostlyAscii() || stats.looksLikeUTF8()) {
                 return MediaType.TEXT_PLAIN;
             } else {
                 return MediaType.OCTET_STREAM;
diff --git a/tika-core/src/main/java/org/apache/tika/detect/TextStatistics.java b/tika-core/src/main/java/org/apache/tika/detect/TextStatistics.java
index 581a133..dc084ec 100644
--- a/tika-core/src/main/java/org/apache/tika/detect/TextStatistics.java
+++ b/tika-core/src/main/java/org/apache/tika/detect/TextStatistics.java
@@ -53,6 +53,34 @@ public class TextStatistics {
     }
 
     /**
+     * Checks whether the observed byte stream looks like UTF-8 encoded text.
+     *
+     * @since Apache Tika 1.3
+     * @return <code>true</code> if the seen bytes look like UTF-8,
+     *         <code>false</code> otherwise
+     */
+    public boolean looksLikeUTF8() {
+        int control = count(0, 0x20); // bytes below 0x20 (presumably [from, to) ranges)
+        int utf8 = count(0x20, 0x80); // single-byte (ASCII range) characters
+        int safe = countSafeControl();
+
+        int expectedContinuation = 0;
+        int[] leading = new int[] { // lead bytes of 2-, 3- and 4-byte sequences
+                count(0xc0, 0xe0), count(0xe0, 0xf0), count(0xf0, 0xf8) };
+        for (int i = 0; i < leading.length; i++) {
+            utf8 += leading[i];
+            expectedContinuation += (i + 1) * leading[i]; // i + 1 trailing bytes each
+        }
+
+        int continuation = count(0x80, 0xc0);
+        return utf8 > 0
+                && continuation <= expectedContinuation
+                && continuation >= expectedContinuation - 3 // presumably a cut-off tail
+                && count(0xf8, 0x100) == 0 // 0xf8..0xff never occur in UTF-8
+                && (control - safe) * 100 < utf8 * 2; // non-safe controls < 2% of utf8
+    }
+
+    /**
      * Returns the total number of bytes seen so far.
      *
      * @return count of all bytes
-- 
1.7.10.msysgit.1

