From d9b817cba6daeae7ccbe8f88d14c878d78e8646c Mon Sep 17 00:00:00 2001
From: "k.privezentsev" <k.privezentsev@corp.mail.ru>
Date: Wed, 29 May 2013 15:20:30 +0400
Subject: [PATCH] TIKA-1128 Replace line tabulation with a line break when
 extracting text from a Word document

---
 .../tika/parser/microsoft/WordExtractor.java       |    3 +++
 1 file changed, 3 insertions(+)

diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
index 11ffb7d..4640333 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
@@ -309,6 +309,9 @@ public class WordExtractor extends AbstractPOIFSExtractor {
 
        // Copied from POI's org/apache/poi/hwpf/converter/AbstractWordConverter.processCharacters:
 
+        // Convert the line tabulation character (0x0B, vertical tab) to a line break
+        text = text.replace((char)0x000b,'\n');
+
        // Non-breaking hyphens are returned as char 30
        text = text.replace((char) 30, UNICODECHAR_NONBREAKING_HYPHEN);
 
-- 
1.7.9.5

