### Eclipse Workspace Patch 1.0
#P tika-parsers
Index: src/test/resources/test-documents/boilerplate.html
===================================================================
--- src/test/resources/test-documents/boilerplate.html	(revision 1040836)
+++ src/test/resources/test-documents/boilerplate.html	(working copy)
@@ -35,7 +35,7 @@
 This is the end of the text.
 </p>
 
-<p><a href="footer.html">footer</a></p>
+<p><a href="Footer.html">footer</a></p>
 
 </body>
 </html>
Index: src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java
===================================================================
--- src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java	(revision 1040836)
+++ src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java	(working copy)
@@ -17,6 +17,9 @@
 package org.apache.tika.parser.html;
 
 import java.io.Writer;
+import java.util.ArrayList;
+import java.util.BitSet;
+import java.util.List;
 
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.sax.WriteOutContentHandler;
@@ -43,6 +46,71 @@
  */
 public class BoilerpipeContentHandler extends BoilerpipeHTMLContentHandler {
 
+    private static class RecordedElement {
+        public enum ElementType {
+            START,
+            END,
+            CONTINUE
+        }
+        
+        private String uri;
+        private String localName;
+        private String qName;
+        private Attributes attrs;
+        private List<char[]> characters;
+        private ElementType elementType;
+
+        public RecordedElement(String uri, String localName, String qName, Attributes attrs) {
+            this(uri, localName, qName, attrs, ElementType.START);
+        }
+        
+        public RecordedElement(String uri, String localName, String qName) {
+            this(uri, localName, qName, null, ElementType.END);
+        }
+        
+        public RecordedElement() {
+            this(null, null, null, null, ElementType.CONTINUE);
+        }
+        
+        protected RecordedElement(String uri, String localName, String qName, Attributes attrs, RecordedElement.ElementType elementType) {
+            this.uri = uri;
+            this.localName = localName;
+            this.qName = qName;
+            this.attrs = attrs;
+            this.elementType = elementType;
+            this.characters = new ArrayList<char[]>();
+        }
+
+        @Override
+        public String toString() {
+            return String.format("<%s> of type %s", localName, elementType);
+        };
+        
+        public String getUri() {
+            return uri;
+        }
+
+        public String getLocalName() {
+            return localName;
+        }
+
+        public String getQName() {
+            return qName;
+        }
+
+        public Attributes getAttrs() {
+            return attrs;
+        }
+
+        public List<char[]> getCharacters() {
+            return characters;
+        }
+        
+        public RecordedElement.ElementType getElementType() {
+            return elementType;
+        }
+    }
+    
     /**
      * The newline character that gets inserted after block elements.
      */
@@ -50,7 +118,13 @@
 
     private ContentHandler delegate;
     private BoilerpipeExtractor extractor;
-
+    
+    private boolean includeMarkup;
+    private boolean inHeader;
+    private boolean inFooter;
+    private int headerCharOffset;
+    private List<RecordedElement> elements;
+    
     /**
      * Creates a new boilerpipe-based content extractor, using the
      * {@link DefaultExtractor} extraction rules and "delegate" as the content handler.
@@ -87,6 +161,87 @@
         this.extractor = extractor;
     }
 
+    public void setIncludeMarkup(boolean includeMarkup) {
+        this.includeMarkup = includeMarkup;
+    }
+    
+    public boolean isIncludeMarkup() {
+        return includeMarkup;
+    }
+    
+    @Override
+    public void startDocument() throws SAXException {
+        super.startDocument();
+        
+        delegate.startDocument();
+        
+        inHeader = true;
+        inFooter = false;
+        headerCharOffset = 0;
+        
+        if (includeMarkup) {
+            elements = new ArrayList<RecordedElement>();
+        }
+    };
+    
+    @Override
+    public void startPrefixMapping(String prefix, String uri) throws SAXException {
+        super.startPrefixMapping(prefix, uri);
+        delegate.startPrefixMapping(prefix, uri);
+    };
+    
+    @Override
+    public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException {
+        super.startElement(uri, localName, qName, atts);
+        
+        if (inHeader) {
+            delegate.startElement(uri, localName, qName, atts);
+        } else if (inFooter) {
+            // Do nothing
+        } else if (includeMarkup) {
+            elements.add(new RecordedElement(uri, localName, qName, atts));
+        } else {
+            // This happens for the <body> element, if we're not doing markup.
+            delegate.startElement(uri, localName, qName, atts);
+        }
+    };
+    
+    @Override
+    public void characters(char[] chars, int offset, int length) throws SAXException {
+        super.characters(chars, offset, length);
+        
+        if (inHeader) {
+            delegate.characters(chars, offset, length);
+            headerCharOffset++;
+        } else if (inFooter) {
+            // Do nothing
+        } else if (includeMarkup) {
+            RecordedElement element = elements.get(elements.size() - 1);
+            
+            char[] characters = new char[length];
+            System.arraycopy(chars, offset, characters, 0, length);
+            element.getCharacters().add(characters);
+        }
+    };
+    
+    @Override
+    public void endElement(String uri, String localName, String qName) throws SAXException {
+        super.endElement(uri, localName, qName);
+        
+        if (inHeader) {
+            delegate.endElement(uri, localName, qName);
+            inHeader = !localName.equals("head");
+        } else if (inFooter) {
+            // Do nothing
+        } else if (localName.equals("body")) {
+            inFooter = true;
+        } else if (includeMarkup) {
+            // Add the end element, and the continuation from the previous element
+            elements.add(new RecordedElement(uri, localName, qName));
+            elements.add(new RecordedElement());
+        }
+    };
+    
     @Override
     public void endDocument() throws SAXException {
         super.endDocument();
@@ -100,37 +255,68 @@
         
         Attributes emptyAttrs = new AttributesImpl();
 
-        delegate.startDocument();
-        delegate.startPrefixMapping("", XHTMLContentHandler.XHTML);
-
-        delegate.startElement(XHTMLContentHandler.XHTML, "html", "html", emptyAttrs);
-        delegate.startElement(XHTMLContentHandler.XHTML, "head", "head", emptyAttrs);
-        delegate.startElement(XHTMLContentHandler.XHTML, "title", "title", emptyAttrs);
-        
-        if (td.getTitle() != null) {
-            char[] titleChars = td.getTitle().toCharArray();
-            delegate.characters(titleChars, 0, titleChars.length);
-            delegate.ignorableWhitespace(NL, 0, NL.length);
-        }
-        
-        delegate.endElement(XHTMLContentHandler.XHTML, "title", "title");
-        delegate.endElement(XHTMLContentHandler.XHTML, "head", "head");
-        
-        delegate.startElement(XHTMLContentHandler.XHTML, "body", "body", emptyAttrs);
-
-        for (TextBlock block : td.getTextBlocks()) {
-            if (block.isContent()) {
-                delegate.startElement(XHTMLContentHandler.XHTML, "p", "p", emptyAttrs);
-                char[] chars = block.getText().toCharArray();
-                delegate.characters(chars, 0, chars.length);
-                delegate.endElement(XHTMLContentHandler.XHTML, "p", "p");
-                delegate.ignorableWhitespace(NL, 0, NL.length);
+        // At this point we have all the information we need to either emit N paragraphs
+        // of plain text (if not including markup), or we have to replay our recorded elements
+        // and only emit character runs that passed the boilerpipe filters.
+        if (includeMarkup) {
+            BitSet validCharacterRuns = new BitSet();
+            for (TextBlock block : td.getTextBlocks()) {
+                if (block.isContent()) {
+                    BitSet bs = block.getContainedTextElements();
+                    if (bs != null) {
+                        validCharacterRuns.or(bs);
+                    }
+                }
+            }
+            
+            // Now have bits set for all valid character runs. Replay our recorded elements,
+            // but only emit character runs flagged as valid.
+            int curCharsIndex = headerCharOffset;
+            for (RecordedElement element : elements) {
+                switch (element.getElementType()) {
+                    case START:
+                        delegate.startElement(element.getUri(), element.getLocalName(), element.getQName(), element.getAttrs());
+                        // Fall through
+                        
+                    case CONTINUE:
+                        // Now emit characters that are valid. Note that boilerpipe pre-increments the character index, so
+                        // we have to follow suit.
+                        for (char[] chars : element.getCharacters()) {
+                            curCharsIndex++;
+                            
+                            if (validCharacterRuns.get(curCharsIndex)) {
+                                delegate.characters(chars, 0, chars.length);
+                            }
+                        }
+                        break;
+                        
+                    case END:
+                        delegate.endElement(element.getUri(), element.getLocalName(), element.getQName());
+                        break;
+                        
+                    default:
+                        throw new RuntimeException("Unhandled element type: " + element.getElementType());
+                }
+                
+                
+            }
+        } else {
+            for (TextBlock block : td.getTextBlocks()) {
+                if (block.isContent()) {
+                    delegate.startElement(XHTMLContentHandler.XHTML, "p", "p", emptyAttrs);
+                    char[] chars = block.getText().toCharArray();
+                    delegate.characters(chars, 0, chars.length);
+                    delegate.endElement(XHTMLContentHandler.XHTML, "p", "p");
+                    delegate.ignorableWhitespace(NL, 0, NL.length);
+                }
             }
         }
         
         delegate.endElement(XHTMLContentHandler.XHTML, "body", "body");
         delegate.endElement(XHTMLContentHandler.XHTML, "html", "html");
         
+        // We defer ending any prefix mapping until here, which is why we don't pass this
+        // through to the delegate in an overridden method.
         delegate.endPrefixMapping("");
 
         delegate.endDocument();
Index: src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
===================================================================
--- src/test/java/org/apache/tika/parser/html/HtmlParserTest.java	(revision 1040836)
+++ src/test/java/org/apache/tika/parser/html/HtmlParserTest.java	(working copy)
@@ -39,7 +39,6 @@
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.sax.BodyContentHandler;
 import org.apache.tika.sax.TeeContentHandler;
-import org.apache.tika.sax.WriteOutContentHandler;
 import org.xml.sax.Attributes;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
@@ -394,13 +393,13 @@
         String path = "/test-documents/boilerplate.html";
         
         Metadata metadata = new Metadata();
-        WriteOutContentHandler handler = new WriteOutContentHandler();
+        BodyContentHandler handler = new BodyContentHandler();
         new HtmlParser().parse(
                 HtmlParserTest.class.getResourceAsStream(path),
                 new BoilerpipeContentHandler(handler),  metadata, new ParseContext());
         
         String content = handler.toString();
-        assertTrue(content.startsWith("Title\nThis is the real meat"));
+        assertTrue(content.startsWith("This is the real meat"));
         assertTrue(content.endsWith("This is the end of the text.\n"));
         assertFalse(content.contains("boilerplate"));
         assertFalse(content.contains("footer"));
@@ -681,4 +680,33 @@
         handler.setResult(new StreamResult(writer));
         return handler;
     }
+    
+    /**
+     * Test case for TIKA-564. Support returning markup from BoilerpipeContentHandler.
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-564">TIKA-564</a>
+     */
+    public void testBoilerplateWithMarkup() throws Exception {
+        String path = "/test-documents/boilerplate.html";
+        
+        Metadata metadata = new Metadata();
+        StringWriter sw = new StringWriter();
+        ContentHandler ch = makeHtmlTransformer(sw);
+        BoilerpipeContentHandler bpch = new BoilerpipeContentHandler(ch);
+        bpch.setIncludeMarkup(true);
+        
+        new HtmlParser().parse(
+                HtmlParserTest.class.getResourceAsStream(path),
+                bpch,  metadata, new ParseContext());
+        
+        String content = sw.toString();
+        assertTrue("Has empty table elements", content.contains("<body><table><tr><td><table><tr><td>"));
+        assertTrue("Has empty a element", content.contains("<a shape=\"rect\" href=\"Main.php\"/>"));
+        assertTrue("Has real content", content.contains("<p>This is the real meat"));
+        assertTrue("Ends with appropriate HTML", content.endsWith("</p></body></html>"));
+        assertFalse(content.contains("boilerplate"));
+        assertFalse(content.contains("footer"));
+    }
+    
+    
+
 }
