Index: tika-parsers/src/test/java/org/apache/tika/TestParsers.java
===================================================================
--- tika-parsers/src/test/java/org/apache/tika/TestParsers.java	(リビジョン 1005709)
+++ tika-parsers/src/test/java/org/apache/tika/TestParsers.java	(作業コピー)
@@ -66,6 +66,15 @@
         assertEquals(s1, s2);
     }
 
+    public void testRTFms932Extraction() throws Exception {
+        File file = getResourceAsFile("/test-documents/testRTF-ms932.rtf");
+        String s1 = ParseUtils.getStringContent(file, tc);
+        String s2 = ParseUtils.getStringContent(file, tc, "application/rtf");
+        assertEquals(s1, s2);
+        // "Hello" in Japanese (hiragana ko-n-ni-chi-ha), stored in the fixture as MS932 hex escapes
+        assertTrue(s1.contains("\u3053\u3093\u306b\u3061\u306f"));
+    }
+
     public void testXMLExtraction() throws Exception {
         File file = getResourceAsFile("/test-documents/testXML.xml");
         String s1 = ParseUtils.getStringContent(file, tc);
Index: tika-parsers/src/test/resources/test-documents/testRTF-ms932.rtf
===================================================================
--- tika-parsers/src/test/resources/test-documents/testRTF-ms932.rtf	(リビジョン 0)
+++ tika-parsers/src/test/resources/test-documents/testRTF-ms932.rtf	(リビジョン 0)
@@ -0,0 +1,30 @@
+{\rtf1\ansi\ansicpg932\uc2 \deff26\deflang1033\deflangfe1041{\fonttbl{\f0\froman\fcharset0\fprq2{\*\panose 02020603050405020304}Times New Roman;}
+{\f23\froman\fcharset128\fprq1{\*\panose 02020609040205080304}\'82\'6c\'82\'72 \'96\'be\'92\'a9{\*\falt MS Mincho};}{\f26\froman\fcharset0\fprq2{\*\panose 02040604050505020304}Century;}
+{\f28\froman\fcharset128\fprq1{\*\panose 02020609040205080304}@\'82\'6c\'82\'72 \'96\'be\'92\'a9;}{\f37\froman\fcharset238\fprq2 Times New Roman CE;}{\f38\froman\fcharset204\fprq2 Times New Roman Cyr;}{\f40\froman\fcharset161\fprq2 Times New Roman Greek;}
+{\f41\froman\fcharset162\fprq2 Times New Roman Tur;}{\f42\froman\fcharset177\fprq2 Times New Roman (Hebrew);}{\f43\froman\fcharset178\fprq2 Times New Roman (Arabic);}{\f44\froman\fcharset186\fprq2 Times New Roman Baltic;}
+{\f223\froman\fcharset0\fprq1 MS Mincho Western{\*\falt MS Mincho};}{\f221\froman\fcharset238\fprq1 MS Mincho CE{\*\falt MS Mincho};}{\f222\froman\fcharset204\fprq1 MS Mincho Cyr{\*\falt MS Mincho};}
+{\f224\froman\fcharset161\fprq1 MS Mincho Greek{\*\falt MS Mincho};}{\f225\froman\fcharset162\fprq1 MS Mincho Tur{\*\falt MS Mincho};}{\f228\froman\fcharset186\fprq1 MS Mincho Baltic{\*\falt MS Mincho};}{\f245\froman\fcharset238\fprq2 Century CE;}
+{\f246\froman\fcharset204\fprq2 Century Cyr;}{\f248\froman\fcharset161\fprq2 Century Greek;}{\f249\froman\fcharset162\fprq2 Century Tur;}{\f252\froman\fcharset186\fprq2 Century Baltic;}
+{\f263\froman\fcharset0\fprq1 @\'82\'6c\'82\'72 \'96\'be\'92\'a9 Western;}{\f261\froman\fcharset238\fprq1 @\'82\'6c\'82\'72 \'96\'be\'92\'a9 CE;}{\f262\froman\fcharset204\fprq1 @\'82\'6c\'82\'72 \'96\'be\'92\'a9 Cyr;}
+{\f264\froman\fcharset161\fprq1 @\'82\'6c\'82\'72 \'96\'be\'92\'a9 Greek;}{\f265\froman\fcharset162\fprq1 @\'82\'6c\'82\'72 \'96\'be\'92\'a9 Tur;}{\f268\froman\fcharset186\fprq1 @\'82\'6c\'82\'72 \'96\'be\'92\'a9 Baltic;}}{\colortbl;\red0\green0\blue0;
+\red0\green0\blue255;\red0\green255\blue255;\red0\green255\blue0;\red255\green0\blue255;\red255\green0\blue0;\red255\green255\blue0;\red255\green255\blue255;\red0\green0\blue128;\red0\green128\blue128;\red0\green128\blue0;\red128\green0\blue128;
+\red128\green0\blue0;\red128\green128\blue0;\red128\green128\blue128;\red192\green192\blue192;}{\stylesheet{\qj \li0\ri0\nowidctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 
+\fs21\lang1033\langfe1041\kerning2\loch\f26\hich\af26\dbch\af23\cgrid\langnp1033\langfenp1041 \snext0 Normal;}{\*\cs10 \additive Default Paragraph Font;}}{\info{\title \'83\'5e\'83\'43\'83\'67\'83\'8b}{\author shinsuke}{\operator shinsuke}
+{\creatim\yr2010\mo10\dy8\hr14\min18}{\revtim\yr2010\mo10\dy10\hr6\min59}{\version4}{\edmins3}{\nofpages1}{\nofwords3}{\nofchars3}{\nofcharsws0}{\vern8249}}\paperw11906\paperh16838\margl1701\margr1701\margt1985\margb1701\gutter0 
+\deftab840\ftnbj\aenddoc\hyphcaps0\formshade\horzdoc\dgmargin\dghspace180\dgvspace180\dghorigin1701\dgvorigin1985\dghshow0\dgvshow2\jcompress\lnongrid
+\viewkind1\viewscale100\splytwnine\ftnlytwnine\htmautsp\useltbaln\alntblind\lytcalctblwd\lyttblrtgr\lnbrkrule {\upr{\*\fchars 
+!%),.:\'3b?]\'7d\'81\'91\'81\'8b\'81\'66\'81\'68\'81\'f1\'81\'8c\'81\'8d\'81\'8e\'81\'41\'81\'42\'81\'58\'81\'72\'81\'74\'81\'76\'81\'78\'81\'7a\'81\'6c\'81\'4a\'81\'4b\'81\'54\'81\'55\'81\'45\'81\'52\'81\'53\'81\'49\'81\'93\'81\'6a\'81\'43\'81\'44
+\'81\'46\'81\'47\'81\'48\'81\'6e\'81\'70\'a1\'a3\'a4\'a5\'de\'df\'81\'91}{\*\ud\uc0{\*\fchars 
+!%),.:\'3b?]\'7d{\uc2\u162 \'81\'91\'81\'8b\'81f\'81h\'81\'f1\'81\'8c\'81\'8d\'81\'8e\'81A\'81B\'81X\'81r\'81t\'81v\'81x\'81z\'81l\'81J\'81K\'81T\'81U\'81E\'81R\'81S\'81I\'81\'93\'81j\'81C\'81D\'81F\'81G\'81H\'81n\'81p\'a1\'a3\'a4\'a5}\'de\'df\'81\'91}}}
+{\upr{\*\lchars $([\'5c\'7b\'81\'92\'5c\'81\'65\'81\'67\'81\'71\'81\'73\'81\'75\'81\'77\'81\'79\'81\'6b\'81\'90\'81\'69\'81\'6d\'81\'6f\'a2\'81\'92\'81\'8f}{\*\ud\uc0{\*\lchars 
+$([\'5c\'7b{\uc2\u163 \'81\'92}{\uc1\u165 \'5c\'81e\'81g\'81q\'81s\'81u\'81w\'81y\'81k\'81\'90\'81i\'81m\'81o\'a2\'81\'92\'81\'8f}}}}\fet0\sectd \linex0\headery851\footery992\colsx425\endnhere\sectlinegrid360\sectspecifyl {\*\pnseclvl1
+\pnucrm\pnstart1\pnindent720\pnhang{\pntxta \dbch .}}{\*\pnseclvl2\pnucltr\pnstart1\pnindent720\pnhang{\pntxta \dbch .}}{\*\pnseclvl3\pndec\pnstart1\pnindent720\pnhang{\pntxta \dbch .}}{\*\pnseclvl4\pnlcltr\pnstart1\pnindent720\pnhang{\pntxta \dbch )}}
+{\*\pnseclvl5\pndec\pnstart1\pnindent720\pnhang{\pntxtb \dbch (}{\pntxta \dbch )}}{\*\pnseclvl6\pnlcltr\pnstart1\pnindent720\pnhang{\pntxtb \dbch (}{\pntxta \dbch )}}{\*\pnseclvl7\pnlcrm\pnstart1\pnindent720\pnhang{\pntxtb \dbch (}{\pntxta \dbch )}}
+{\*\pnseclvl8\pnlcltr\pnstart1\pnindent720\pnhang{\pntxtb \dbch (}{\pntxta \dbch )}}{\*\pnseclvl9\pnlcrm\pnstart1\pnindent720\pnhang{\pntxtb \dbch (}{\pntxta \dbch )}}\pard\plain \qj \li0\ri0\nowidctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 
+\fs21\lang1033\langfe1041\kerning2\loch\af26\hich\af26\dbch\af23\cgrid\langnp1033\langfenp1041 {\hich\af26\dbch\af23\loch\f26 Hello
+\par }{\loch\af26\hich\af26\dbch\f23 \'82\'b1\'82\'f1\'82\'c9\'82\'bf\'82\'cd}{
+\par \hich\af26\dbch\af23\loch\f26 Test
+\par }{\loch\af26\hich\af26\dbch\f23 \'83\'65\'83\'58\'83\'67}{
+\par 
+\par }{
+\par }}
\ ファイル末尾に改行がありません

属性に変更があったパス: tika-parsers/src/test/resources/test-documents/testRTF-ms932.rtf
___________________________________________________________________
追加: svn:executable
   + *

Index: tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFParser.java
===================================================================
--- tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFParser.java	(リビジョン 1005709)
+++ tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFParser.java	(作業コピー)
@@ -16,10 +16,21 @@
  */
 package org.apache.tika.parser.rtf;
 
+import java.io.BufferedOutputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
 import java.io.IOException;
 import java.io.InputStream;
+import java.io.UnsupportedEncodingException;
 import java.util.Collections;
+import java.util.HashMap;
+import java.util.LinkedList;
+import java.util.Map;
 import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
 
 import javax.swing.text.AttributeSet;
 import javax.swing.text.BadLocationException;
@@ -29,6 +40,7 @@
 import javax.swing.text.rtf.RTFEditorKit;
 
 import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.IOUtils;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.ParseContext;
@@ -42,40 +54,302 @@
  */
 public class RTFParser implements Parser {
 
-    private static final Set<MediaType> SUPPORTED_TYPES =
-        Collections.singleton(MediaType.application("rtf"));
+    private static final Set<MediaType> SUPPORTED_TYPES = Collections
+            .singleton(MediaType.application("rtf"));
+
+    private static final Pattern F_PATTERN = Pattern.compile("\\\\f[0-9]+");
+
+    private static final Pattern FCHARSET_PATTERN = Pattern
+            .compile("\\\\fcharset[0-9]+");
+
+    private static final Pattern ANSICPG_PATTERN = Pattern
+            .compile("\\\\ansicpg[0-9]+");
+
+    /** Java charset name for each RTF {@code \fcharsetN} id that can be decoded. */
+    private static final Map<Integer, String> FONTSET_MAP = new HashMap<Integer, String>();
+    static {
+        // The same name is the parser's default charset, and text tagged with it is copied
+        // through unconverted. NOTE(review): ANSI is normally windows-1252 - confirm.
+        FONTSET_MAP.put(0, "windows-1251"); // ANSI
+        // charset 1 is Default, charset 2 is Symbol (neither is mapped)
+
+        FONTSET_MAP.put(77, "MacRoman"); // Mac Roman
+        FONTSET_MAP.put(78, "Shift_JIS"); // Mac Shift Jis
+        FONTSET_MAP.put(79, "ms949"); // Mac Hangul
+        FONTSET_MAP.put(80, "GB2312"); // Mac GB2312
+        FONTSET_MAP.put(81, "Big5"); // Mac Big5
+        FONTSET_MAP.put(82, "johab"); // Mac Johab (old)
+        FONTSET_MAP.put(83, "MacHebrew"); // Mac Hebrew
+        FONTSET_MAP.put(84, "MacArabic"); // Mac Arabic
+        FONTSET_MAP.put(85, "MacGreek"); // Mac Greek
+        FONTSET_MAP.put(86, "MacTurkish"); // Mac Turkish
+        FONTSET_MAP.put(87, "MacThai"); // Mac Thai
+        FONTSET_MAP.put(88, "MacCentralEurope"); // Mac East Europe (a Mac encoding, not cp1250)
+        FONTSET_MAP.put(89, "MacCyrillic"); // Mac Russian (a Mac encoding, not cp1251)
+
+        FONTSET_MAP.put(128, "MS932"); // Shift JIS
+        FONTSET_MAP.put(129, "ms949"); // Hangul
+        FONTSET_MAP.put(130, "ms1361"); // Johab
+        FONTSET_MAP.put(134, "ms936"); // GB2312
+        FONTSET_MAP.put(136, "ms950"); // Big5
+        FONTSET_MAP.put(161, "cp1253"); // Greek
+        FONTSET_MAP.put(162, "cp1254"); // Turkish
+        FONTSET_MAP.put(163, "cp1258"); // Vietnamese
+        FONTSET_MAP.put(177, "cp1255"); // Hebrew
+        FONTSET_MAP.put(178, "cp1256"); // Arabic
+        // 179 (Arabic Traditional), 180 (Arabic user), 181 (Hebrew user): not mapped
+        FONTSET_MAP.put(186, "cp1257"); // Baltic
+
+        FONTSET_MAP.put(204, "cp1251"); // Russian
+        FONTSET_MAP.put(222, "ms874"); // Thai
+        FONTSET_MAP.put(238, "cp1250"); // Eastern European
+        FONTSET_MAP.put(254, "cp437"); // PC 437
+        FONTSET_MAP.put(255, "cp850"); // OEM
+    }
 
     public Set<MediaType> getSupportedTypes(ParseContext context) {
         return SUPPORTED_TYPES;
     }
 
-    public void parse(
-            InputStream stream, ContentHandler handler,
-            Metadata metadata, ParseContext context)
-            throws IOException, SAXException, TikaException {
+    public void parse(InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context) throws IOException,
+            SAXException, TikaException {
+        File tempFile = null;
+        InputStream in = null;
         try {
+            tempFile = createUnicodeRtfTempFile(stream);
+            in = new FileInputStream(tempFile);
+
             Document sd = new CustomStyledDocument();
-            new RTFEditorKit().read(stream, sd, 0);
+            new RTFEditorKit().read(in, sd, 0);
 
-            XHTMLContentHandler xhtml =
-                new XHTMLContentHandler(handler, metadata);
+            XHTMLContentHandler xhtml = new XHTMLContentHandler(handler,
+                    metadata);
             xhtml.startDocument();
             xhtml.element("p", sd.getText(0, sd.getLength()));
             xhtml.endDocument();
         } catch (BadLocationException e) {
             throw new TikaException("Error parsing an RTF document", e);
+        } finally {
+            IOUtils.closeQuietly(in);
+            if (tempFile != null) {
+                tempFile.delete();
+            }
         }
     }
 
     /**
      * @deprecated This method will be removed in Apache Tika 1.0.
      */
-    public void parse(
-            InputStream stream, ContentHandler handler, Metadata metadata)
-            throws IOException, SAXException, TikaException {
+    public void parse(InputStream stream, ContentHandler handler,
+            Metadata metadata) throws IOException, SAXException, TikaException {
         parse(stream, handler, metadata, new ParseContext());
     }
 
+    /**
+     * Replaces each run of {@code \'hh} hex escapes in {@code data} with the text
+     * those bytes decode to in {@code enc}, written via {@link #appendUnicodeStr};
+     * all other input is copied unchanged. An unsupported {@code enc} keeps the
+     * escapes, and an escape cut short by the end of {@code data} is copied as-is.
+     */
+    private String escapeByUnicode(String data, String enc) {
+        StringBuilder dataBuf = new StringBuilder(data.length() + 16);
+        StringBuilder origDataBuf = new StringBuilder();
+        ByteArrayOutputStream baos = new ByteArrayOutputStream();
+        int len = data.length();
+        int i = 0;
+        while (i < len) {
+            // [i, end) is one unit: a plain char, "\x", or (part of) "\'hh". Look-ahead
+            // is bounds-checked: data may end right after '\' (e.g. "\{" split at '{').
+            int end = i + 1;
+            boolean hexEscape = false;
+            if (data.charAt(i) == '\\' && end < len) {
+                end++;
+                if (data.charAt(end - 1) == '\'' && end < len) {
+                    end++;
+                    if (isHexDigit(data.charAt(end - 1)) && end < len) {
+                        end++;
+                        hexEscape = isHexDigit(data.charAt(end - 1));
+                    }
+                }
+            }
+            if (hexEscape) {
+                baos.write(Integer.parseInt(data.substring(i + 2, end), 16));
+                origDataBuf.append(data, i, end); // raw form, for the fallback
+            } else {
+                flushPendingBytes(dataBuf, origDataBuf, baos, enc);
+                dataBuf.append(data, i, end);
+            }
+            i = end;
+        }
+        flushPendingBytes(dataBuf, origDataBuf, baos, enc);
+        return dataBuf.toString();
+    }
+
+    /** Returns true if {@code c} is an ASCII hexadecimal digit. */
+    private static boolean isHexDigit(char c) {
+        return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F');
+    }
+
+    /** Decodes the collected bytes with {@code enc} into {@code dataBuf}, then clears them. */
+    private void flushPendingBytes(StringBuilder dataBuf, StringBuilder origDataBuf,
+            ByteArrayOutputStream baos, String enc) {
+        if (baos.size() != 0) {
+            try {
+                appendUnicodeStr(dataBuf, new String(baos.toByteArray(), enc));
+            } catch (UnsupportedEncodingException e) {
+                dataBuf.append(origDataBuf.toString()); // unknown charset: keep escapes
+            }
+        }
+        origDataBuf.setLength(0);
+        baos.reset();
+    }
+
+    /**
+     * Appends value to dataBuf: chars 20..79 (decimal) verbatim, all others as a braced Unicode escape.
+     */
+    private void appendUnicodeStr(StringBuilder dataBuf, String value) {
+        for (char c : value.toCharArray()) {
+            if (c < 20 || c >= 80) { // NOTE(review): decimal bounds; hex 0x20/0x80 may be intended
+                dataBuf.append("{\\u").append((int) c).append('}');
+            } else {
+                dataBuf.append(c);
+            }
+        }
+    }
+
+    /**
+     * Copies the RTF read from {@code in} to a new temporary file. The input is
+     * split into tokens at '{', '}' and ' '; a token whose charset is not
+     * "windows-1251" is rewritten by {@link #escapeByUnicode}. The caller must
+     * delete the returned file; if copying fails it is deleted here.
+     */
+    private File createUnicodeRtfTempFile(InputStream in) throws IOException {
+        boolean completed = false;
+        File tempFile = null;
+        BufferedOutputStream out = null;
+        try {
+            tempFile = File.createTempFile("temp", ".rtf");
+            out = new BufferedOutputStream(new FileOutputStream(tempFile));
+
+            String defaultCharset = "windows-1251"; // ansi
+            Map<String, String> fontTableMap = new HashMap<String, String>();
+            StringBuilder dataBuf = new StringBuilder(255);
+            int ch;
+            LinkedList<String> charsetQueue = new LinkedList<String>();
+            int depth = 0;
+            String prevFt = null;
+            int prevCh = -1;
+            while ((ch = in.read()) != -1) {
+                if (ch == '{' || ch == '}' || ch == ' ') {
+                    if (charsetQueue.size() > depth + 1) {
+                        charsetQueue.removeLast();
+                    }
+                    String data = dataBuf.toString();
+                    if (depth == 1) {
+                        // check control words for a default charset
+                        String cset = loadAnsiCpg(data);
+                        if (cset != null) {
+                            defaultCharset = cset;
+                        }
+                    }
+
+                    String ft = loadFontTable(data);
+                    String charset = loadCharset(data);
+                    if (ft != null && charset != null) {
+                        fontTableMap.put(ft, charset);
+                    }
+                    if (ft == null && prevCh == ' ') {
+                        ft = prevFt;
+                    } else if (ft != null) {
+                        prevFt = ft;
+                    }
+
+                    // set a current charset
+                    if (charset == null && ft != null) {
+                        charset = fontTableMap.get(ft);
+                    }
+                    if (charset == null && charsetQueue.size() > 0) {
+                        charset = charsetQueue.getLast();
+                    }
+                    if (charset == null) {
+                        charset = defaultCharset;
+                    }
+
+                    // add the current charset to a queue
+                    if (charsetQueue.size() < depth + 1) {
+                        charsetQueue.add(charset);
+                    }
+                    // Token chars are the input bytes 0-255 plus ASCII escapes, so
+                    // ISO-8859-1 writes them back unchanged; UTF-8 alters bytes >= 0x80.
+                    String escapedStr = "windows-1251".equals(charset) ? data
+                            : escapeByUnicode(data, charset);
+                    out.write(escapedStr.getBytes("ISO-8859-1"));
+                    out.write(ch);
+                    dataBuf.delete(0, dataBuf.length());
+                    prevCh = ch;
+                    // update a depth
+                    if (ch == '{') {
+                        depth++;
+                    } else if (ch == '}') {
+                        depth--;
+                    }
+                } else {
+                    dataBuf.append((char) ch);
+                }
+            }
+            out.flush();
+            completed = true;
+        } finally {
+            IOUtils.closeQuietly(out);
+            // delete on any failure, including unchecked exceptions
+            if (!completed && tempFile != null) {
+                tempFile.delete();
+            }
+        }
+        return tempFile;
+    }
+
+    /** Returns the digits N of the first {@code \fN} control word found in
+     *  {@code line}, or null when there is none. */
+    private String loadFontTable(String line) {
+        Matcher fontWord = F_PATTERN.matcher(line);
+        // a match always starts with the two chars "\f"; keep only what follows
+        return fontWord.find() ? fontWord.group().substring(2) : null;
+    }
+
+    /** Maps the first {@code \ansicpgN} in {@code line} to a Java charset name, or null. */
+    private String loadAnsiCpg(String line) {
+        Matcher m = ANSICPG_PATTERN.matcher(line);
+        String cp = m.find() ? m.group().substring(8) : "";
+        if ("1252".equals(cp)) {
+            return FONTSET_MAP.get(0); // ANSI: same charset as \fcharset0
+        }
+        // N is a Windows code page (e.g. 932), not a FONTSET_MAP key, so probe the JVM.
+        for (String name : new String[] { "ms" + cp, "cp" + cp, "windows-" + cp }) {
+            if (cp.length() > 0 && java.nio.charset.Charset.isSupported(name)) {
+                return name;
+            }
+        }
+        return null;
+    }
+
+    /**
+     * Returns the FONTSET_MAP entry for the first {@code \fcharsetN} in {@code line}; null if absent or unmapped.
+     */
+    private String loadCharset(String line) {
+        Matcher fcharset = FCHARSET_PATTERN.matcher(line);
+        if (!fcharset.find()) {
+            return null;
+        }
+        int id = 0; // value kept when N cannot be parsed as an int
+        try {
+            id = Integer.parseInt(fcharset.group().substring(9)); // skip "\fcharset"
+        } catch (NumberFormatException ignored) { // keep id == 0
+        }
+        return FONTSET_MAP.get(id);
+    }
+
     /**
      * Customized version of {@link DefaultStyledDocument}. Adds whitespace
      * to places where words otherwise could have run together (see
@@ -84,21 +358,25 @@
      * <a href="https://issues.apache.org/jira/browse/TIKA-282">TIKA-282</a>).
      */
     private static class CustomStyledDocument extends DefaultStyledDocument {
+        private boolean isPrevUnicode = false;
 
         public CustomStyledDocument() {
             super(new NoReclaimStyleContext());
         }
 
         @Override
-        public void insertString(
-                int offs, String str, AttributeSet a)
-        throws BadLocationException {
-            if (offs > 0 && offs == getLength()) {
+        public void insertString(int offs, String str, AttributeSet a)
+                throws BadLocationException {
+            boolean isUnicode = str.length() == 1 && str.charAt(0) > 127;
+
+            if (offs > 0 && offs == getLength() && !isPrevUnicode && !isUnicode) {
                 super.insertString(offs, " ", a);
                 super.insertString(getLength(), str, a);
             } else {
                 super.insertString(offs, str, a);
             }
+
+            isPrevUnicode = isUnicode;
         }
 
     }
