Index: build.xml
===================================================================
--- build.xml	(revision 1567824)
+++ build.xml	(working copy)
@@ -919,6 +919,7 @@
         <source path="${basedir}/src/plugin/feed/src/java/" />
         <source path="${basedir}/src/plugin/feed/src/test/" />
         <source path="${basedir}/src/plugin/headings/src/java/" />
+        <source path="${basedir}/src/plugin/headings/src/test/" />
         <source path="${basedir}/src/plugin/index-anchor/src/java/" />
         <source path="${basedir}/src/plugin/index-anchor/src/test/" />
         <source path="${basedir}/src/plugin/index-basic/src/java/" />
Index: src/plugin/headings/sample/testHeaders.html
===================================================================
--- src/plugin/headings/sample/testHeaders.html	(revision 0)
+++ src/plugin/headings/sample/testHeaders.html	(revision 0)
@@ -0,0 +1,8 @@
+<html>
+<head>
+</head>
+<body>
+<h1>This is a test head h1</h1>
+<h2><span>This is a test head h2</span></h2>
+</body>
+</html>
Index: src/plugin/headings/sample/testMultivalueHeaders.html
===================================================================
--- src/plugin/headings/sample/testMultivalueHeaders.html	(revision 0)
+++ src/plugin/headings/sample/testMultivalueHeaders.html	(revision 0)
@@ -0,0 +1,11 @@
+<html>
+<head>
+</head>
+<body>
+<h1>Test header h1</h1>
+<h1>Test header h1 too</h1>
+<h2>Test header h2</h2>
+<h2>Test header h2 too</h2>
+<h2><p><p>Test header h2 with span</p></p></h2>
+</body>
+</html>
Index: src/plugin/headings/src/test/org/apache/nutch/parse/headings/TestHeadingsParseFilter.java
===================================================================
--- src/plugin/headings/src/test/org/apache/nutch/parse/headings/TestHeadingsParseFilter.java	(revision 0)
+++ src/plugin/headings/src/test/org/apache/nutch/parse/headings/TestHeadingsParseFilter.java	(revision 0)
@@ -0,0 +1,117 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.parse.headings;
+
+import java.util.Set;
+import java.util.TreeSet;
+
+import junit.framework.TestCase;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.ProtocolFactory;
+import org.apache.nutch.util.NutchConfiguration;
+
+/**
+ * Unit tests for the headings parse filter, run against the sample HTML
+ * pages found in the directory named by the "test.data" system property.
+ */
+public class TestHeadingsParseFilter extends TestCase {
+  private final String fileSeparator = System.getProperty("file.separator");
+  private final String sampleDir = System.getProperty("test.data", ".");
+  private final String sampleFile = "testHeaders.html";
+  private final String sampleFileMultival = "testMultivalueHeaders.html";
+
+  public TestHeadingsParseFilter(String name) {
+    super(name);
+  }
+
+  /**
+   * Fetches a sample file via its file: URL, parses it and returns the parse
+   * metadata. Any exception raised on the way fails the calling test.
+   *
+   * @param fileName name of the sample file inside the test data directory
+   * @param conf configuration used to look up the protocol and the parsers
+   * @return the parse metadata produced for the sample file
+   */
+  public Metadata parseMeta(String fileName, Configuration conf) {
+    Metadata metadata = null;
+    try {
+      String urlString = "file:" + sampleDir + fileSeparator + fileName;
+      Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
+      Content content = protocol.getProtocolOutput(new Text(urlString),
+              new CrawlDatum()).getContent();
+      Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl());
+      metadata = parse.getData().getParseMeta();
+    } catch (Exception e) {
+      // fail() only carries the message, so keep the stack trace in the output.
+      e.printStackTrace();
+      fail(e.toString());
+    }
+    return metadata;
+  }
+
+  /**
+   * Checks the h1 and h2 values extracted from the simple sample page; its h2
+   * text is wrapped in a span element.
+   */
+  public void testIt() {
+    Configuration conf = NutchConfiguration.create();
+    Metadata parseMeta = parseMeta(sampleFile, conf);
+
+    assertEquals("This is a test head h1", parseMeta.get("h1"));
+    assertEquals("This is a test head h2", parseMeta.get("h2"));
+  }
+
+  /**
+   * Checks that every h1 and h2 value of the multi-heading sample page is
+   * collected once "headings.multivalued" is enabled.
+   */
+  public void testMultiValueMetatags() {
+    Configuration conf = NutchConfiguration.create();
+    conf.setBoolean("headings.multivalued", true);
+
+    Metadata parseMeta = parseMeta(sampleFileMultival, conf);
+
+    assertContainsAll(parseMeta, "h1", "Test header h1", "Test header h1 too");
+    assertContainsAll(parseMeta, "h2", "Test header h2", "Test header h2 too",
+        "Test header h2 with span");
+  }
+
+  /**
+   * Asserts that the metadata holds each expected value under the given
+   * heading name; the order of the values is not checked.
+   */
+  private static void assertContainsAll(Metadata parseMeta, String heading,
+      String... expectedValues) {
+    Set<String> actualValues = new TreeSet<String>();
+    for (String val : parseMeta.getValues(heading)) {
+      actualValues.add(val);
+    }
+    for (String val : expectedValues) {
+      assertTrue("One value of heading " + heading
+          + " with multiple values is missing: " + val,
+          actualValues.contains(val));
+    }
+  }
+}
Index: src/plugin/headings/src/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java
===================================================================
--- src/plugin/headings/src/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java	(revision 1567824)
+++ src/plugin/headings/src/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java	(working copy)
@@ -19,7 +19,8 @@
 
 import java.util.ArrayList;
 import java.util.List;
-import java.util.regex.*;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.nutch.parse.HTMLMetaTags;
 import org.apache.nutch.parse.Parse;
@@ -27,20 +28,28 @@
 import org.apache.nutch.parse.ParseResult;
 import org.apache.nutch.protocol.Content;
 import org.apache.nutch.util.NodeWalker;
-import org.w3c.dom.*;
-
+import org.w3c.dom.DocumentFragment;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
 /**
  * HtmlParseFilter to retrieve h1 and h2 values from the DOM.
  */
 public class HeadingsParseFilter implements HtmlParseFilter {
-
+  private Configuration conf;
+
   /**
-   * Pattern used to strip surpluss whitespace
+   * Pattern used to strip surplus whitespace
    */
   protected static Pattern whitespacePattern = Pattern.compile("\\s+");
-    
-  private Configuration conf;
+
+  /**
+   * List of headings to collect
+   */
   private String[] headings;
+
+  /**
+   * Whether we are multi valued, i.e. collect multiple instances of the same heading element
+   */
   private boolean multiValued = false;
 
   public ParseResult filter(Content content, ParseResult parseResult, HTMLMetaTags metaTags, DocumentFragment doc) {
@@ -85,11 +94,9 @@
 
     while (walker.hasNext()) {
       Node currentNode = walker.nextNode();
-
       if (currentNode.getNodeType() == Node.ELEMENT_NODE) {
         if (element.equalsIgnoreCase(currentNode.getNodeName())) {
           headings.add(getNodeValue(currentNode));
-          
           // Check for multiValued here, if disabled we don't need
           // to discover more headings.
           if (!multiValued) {
@@ -106,13 +113,15 @@
    * Returns the text value of the specified Node and child nodes
    */
   protected static String getNodeValue(Node node) {
+    NodeWalker walker = new NodeWalker(node);
     StringBuilder buffer = new StringBuilder();
 
-    NodeList children = node.getChildNodes();
-
-    for (int i = 0; i < children.getLength(); i++) {
-      if (children.item(i).getNodeType() == Node.TEXT_NODE) {
-        buffer.append(children.item(i).getNodeValue());
+    // Test each visited node itself rather than its children, so text nested
+    // at any depth is appended at the point where the walker reaches it.
+    while (walker.hasNext()) {
+      Node currentNode = walker.nextNode();
+      if (currentNode.getNodeType() == Node.TEXT_NODE) {
+        buffer.append(currentNode.getNodeValue());
       }
     }
 
Index: src/plugin/headings/build.xml
===================================================================
--- src/plugin/headings/build.xml	(revision 1567824)
+++ src/plugin/headings/build.xml	(working copy)
@@ -19,4 +19,17 @@
 
   <import file="../build-plugin.xml"/>
 
+  <!-- Deploy the plugins the unit tests depend on: extension points and the file protocol -->
+  <target name="deps-test">
+      <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints" />
+      <ant target="deploy" inheritall="false" dir="../protocol-file" />
+  </target>
+
+  <!-- Copy the sample HTML pages into the test data directory for the JUnit tests -->
+  <mkdir dir="${build.test}/data" />
+  <copy todir="${build.test}/data">
+      <fileset dir="sample">
+          <include name="*.html" />
+      </fileset>
+  </copy>
 </project>
Index: src/plugin/build.xml
===================================================================
--- src/plugin/build.xml	(revision 1567824)
+++ src/plugin/build.xml	(working copy)
@@ -91,6 +91,7 @@
      <ant dir="protocol-httpclient" target="test"/>
      <!--ant dir="parse-ext" target="test"/-->
      <ant dir="feed" target="test"/>
+     <ant dir="headings" target="test"/>
      <ant dir="parse-html" target="test"/>
      <ant dir="parse-metatags" target="test"/>
      <ant dir="parse-swf" target="test"/>
