### Eclipse Workspace Patch 1.0
#P tika-core
Index: src/test/java/org/apache/tika/sax/XHTMLContentHandlerTest.java
===================================================================
--- src/test/java/org/apache/tika/sax/XHTMLContentHandlerTest.java (revision 1027629)
+++ src/test/java/org/apache/tika/sax/XHTMLContentHandlerTest.java (working copy)
@@ -16,6 +16,9 @@
*/
package org.apache.tika.sax;
+import java.util.ArrayList;
+import java.util.List;
+
import org.apache.tika.metadata.Metadata;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
@@ -73,5 +76,60 @@
assertEquals("a", words[4]);
assertEquals("b", words[5]);
}
+
+ /**
+ * Test that the contents of option elements are properly separated in the
+ * text output.
+ *
+ * @see "TIKA-394"
+ */
+ public void testWhitespaceWithOptions() throws Exception {
+ xhtml.startDocument();
+ xhtml.startElement("form");
+ xhtml.startElement("select");
+ xhtml.element("option", "opt1");
+ xhtml.element("option", "opt2");
+ xhtml.endElement("select");
+ xhtml.endElement("form");
+ xhtml.endDocument();
+
+ String[] words = output.toString().split("\\s+");
+ assertEquals(2, words.length);
+ assertEquals("opt1", words[0]);
+ assertEquals("opt2", words[1]);
+ }
+
+ public void testWhitespaceWithMenus() throws Exception {
+ xhtml.startDocument();
+ xhtml.startElement("menu");
+ xhtml.element("li", "one");
+ xhtml.element("li", "two");
+ xhtml.endElement("menu");
+ xhtml.endDocument();
+ // The two li items must come out as separate words; getRealWords skips zero-length tokens.
+ String[] words = getRealWords(output.toString());
+ assertEquals(2, words.length);
+ assertEquals("one", words[0]);
+ assertEquals("two", words[1]);
+ }
+
+ /**
+ * Splits the given text on runs of whitespace and returns only the words
+ * of non-zero length; split() yields an empty token for leading whitespace.
+ *
+ * @param string some mix of whitespace (such as newlines) and real words
+ * @return array of the non-zero-length words, in their original order
+ */
+ private static String[] getRealWords(String string) {
+ String[] possibleWords = string.split("\\s+");
+ List<String> words = new ArrayList<String>(possibleWords.length);
+ for (String word : possibleWords) {
+ if (word.length() > 0) {
+ words.add(word);
+ }
+ }
+
+ return words.toArray(new String[words.size()]);
+ }
}
Index: src/main/java/org/apache/tika/sax/XHTMLContentHandler.java
===================================================================
--- src/main/java/org/apache/tika/sax/XHTMLContentHandler.java (revision 1027629)
+++ src/main/java/org/apache/tika/sax/XHTMLContentHandler.java (working copy)
@@ -73,7 +73,7 @@
public static final Set ENDLINE = unmodifiableSet(
"p", "h1", "h2", "h3", "h4", "h5", "h6", "div", "ul", "ol", "dl",
"pre", "hr", "blockquote", "address", "fieldset", "table", "form",
- "noscript", "li", "dt", "dd", "noframes", "br", "tr");
+ "noscript", "li", "dt", "dd", "noframes", "br", "tr", "select", "option");
private static final Attributes EMPTY_ATTRIBUTES = new AttributesImpl();