From 43074f082c36ad20bc93d49922823afed51e0a26 Mon Sep 17 00:00:00 2001
From: Cihad Guzel <cihad.guzel@partner.turktelekom.com.tr>
Date: Fri, 20 Sep 2013 17:27:14 +0300
Subject: [PATCH] xpath-plugin-v6.patch

---
 conf/parse-plugins.xml                             |   2 +
 src/plugin/build.xml                               |   2 +
 src/plugin/parse-xpath/build.xml                   |  36 +++
 src/plugin/parse-xpath/ivy.xml                     |  42 +++
 src/plugin/parse-xpath/plugin.xml                  |  46 +++
 src/plugin/parse-xpath/sample/README               |  59 ++++
 .../parse-xpath/sample/xpath-query.xml.template    |  44 +++
 src/plugin/parse-xpath/sample/xpath-query.xsd      |  38 +++
 .../org/apache/nutch/parse/xpath/XPathParser.java  | 322 +++++++++++++++++++++
 .../apache/nutch/parse/xpath/entry/Entries.java    |  69 +++++
 .../org/apache/nutch/parse/xpath/entry/Entry.java  | 181 ++++++++++++
 .../nutch/parse/xpath/entry/ObjectFactory.java     |  69 +++++
 .../org/apache/nutch/parse/xpath/entry/Query.java  | 118 ++++++++
 .../nutch/parse/xpath/entry/package-info.java      |   3 +
 .../java/org/apache/nutch/parse/xpath/package.html |   5 +
 15 files changed, 1036 insertions(+)
 create mode 100644 src/plugin/parse-xpath/build.xml
 create mode 100644 src/plugin/parse-xpath/ivy.xml
 create mode 100644 src/plugin/parse-xpath/plugin.xml
 create mode 100644 src/plugin/parse-xpath/sample/README
 create mode 100644 src/plugin/parse-xpath/sample/xpath-query.xml.template
 create mode 100644 src/plugin/parse-xpath/sample/xpath-query.xsd
 create mode 100644 src/plugin/parse-xpath/src/java/org/apache/nutch/parse/xpath/XPathParser.java
 create mode 100644 src/plugin/parse-xpath/src/java/org/apache/nutch/parse/xpath/entry/Entries.java
 create mode 100644 src/plugin/parse-xpath/src/java/org/apache/nutch/parse/xpath/entry/Entry.java
 create mode 100644 src/plugin/parse-xpath/src/java/org/apache/nutch/parse/xpath/entry/ObjectFactory.java
 create mode 100644 src/plugin/parse-xpath/src/java/org/apache/nutch/parse/xpath/entry/Query.java
 create mode 100644 src/plugin/parse-xpath/src/java/org/apache/nutch/parse/xpath/entry/package-info.java
 create mode 100644 src/plugin/parse-xpath/src/java/org/apache/nutch/parse/xpath/package.html

diff --git a/conf/parse-plugins.xml b/conf/parse-plugins.xml
index 5b20be6..9f7845c 100644
--- a/conf/parse-plugins.xml
+++ b/conf/parse-plugins.xml
@@ -85,6 +85,8 @@
 			extension-id="org.apache.nutch.parse.html.HtmlParser" />
 		<alias name="parse-tika" 
 			extension-id="org.apache.nutch.parse.tika.TikaParser" />
+        <alias name="parse-xpath"
+            extension-id="org.apache.nutch.parse.xpath.XPathParser" />
 		<alias name="parse-ext" extension-id="ExtParser" />
 		<alias name="parse-js" extension-id="JSParser" />
 		<alias name="feed"
diff --git a/src/plugin/build.xml b/src/plugin/build.xml
index 7c60a63..fd21b58 100755
--- a/src/plugin/build.xml
+++ b/src/plugin/build.xml
@@ -45,6 +45,7 @@
      <ant dir="parse-js" target="deploy"/>
      <ant dir="parse-html" target="deploy"/>
      <ant dir="parse-tika" target="deploy"/>
+     <ant dir="parse-xpath" target="deploy"/>
      <ant dir="scoring-link" target="deploy"/>
      <ant dir="scoring-opic" target="deploy"/>
      <ant dir="subcollection" target="deploy"/>
@@ -118,6 +119,7 @@
     <ant dir="parse-ext" target="clean"/>
     <ant dir="parse-js" target="clean"/>
     <ant dir="parse-html" target="clean"/>
+    <ant dir="parse-xpath" target="clean"/>
     <ant dir="parse-swf" target="clean"/>
     <ant dir="parse-tika" target="clean"/>
     <ant dir="parse-zip" target="clean"/>
diff --git a/src/plugin/parse-xpath/build.xml b/src/plugin/parse-xpath/build.xml
new file mode 100644
index 0000000..9f55dfd
--- /dev/null
+++ b/src/plugin/parse-xpath/build.xml
@@ -0,0 +1,36 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+
+ written by cihad guzel -  email: c.guzel.src@gmail.com
+
+-->
+<project name="parse-xpath" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+
+  <!-- Deploy Unit test dependencies -->
+  <target name="deps-test">
+    <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/>
+  </target>
+
+  <mkdir dir="${build.dir}/data"/>
+  <copy todir="${build.dir}/data">
+    <fileset dir="sample" includes="**/*.rules, **/*.urls"/>
+  </copy>
+
+</project>
diff --git a/src/plugin/parse-xpath/ivy.xml b/src/plugin/parse-xpath/ivy.xml
new file mode 100644
index 0000000..678ff30
--- /dev/null
+++ b/src/plugin/parse-xpath/ivy.xml
@@ -0,0 +1,42 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../../ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+    <dependency org="net.sourceforge.htmlcleaner" name="htmlcleaner" rev="2.5"/>
+  </dependencies>
+  
+</ivy-module>
diff --git a/src/plugin/parse-xpath/plugin.xml b/src/plugin/parse-xpath/plugin.xml
new file mode 100644
index 0000000..22608c1
--- /dev/null
+++ b/src/plugin/parse-xpath/plugin.xml
@@ -0,0 +1,46 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="parse-xpath"
+   name="XPath Parse Plug-in"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+   <runtime>
+      <library name="parse-xpath.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+
+   <requires>
+      <import plugin="nutch-extensionpoints"/>      
+   </requires>
+
+   <extension id="org.apache.nutch.parse.xpath"
+              name="XPathParser"
+              point="org.apache.nutch.parse.Parser">
+
+
+      <implementation id="org.apache.nutch.parse.xpath.XPathParser"
+                      class="org.apache.nutch.parse.xpath.XPathParser">
+                      <parameter name="contentType" value="text/html|application/xhtml+xml"/>
+      </implementation>
+
+   </extension>
+
+</plugin>
diff --git a/src/plugin/parse-xpath/sample/README b/src/plugin/parse-xpath/sample/README
new file mode 100644
index 0000000..5edbc24
--- /dev/null
+++ b/src/plugin/parse-xpath/sample/README
@@ -0,0 +1,59 @@
+
+ written by cihad guzel -  email: c.guzel.src@gmail.com
+
+
+1- You must add "conf/xpath-query.xml". See src/plugin/parse-xpath/sample/xpath-query.xml.template for an example.
+
+2- The fields "content", "title" and "url" can be parsed. Custom fields (e.g. 'date') can also be added and parsed via xpath in "conf/xpath-query.xml".
+You can add more than one "content" field in "conf/xpath-query.xml".
+
+3- Must add "parse-xpath" property in "plugin.includes" property at "conf/nutch-site.xml" as follows:
+	<property>
+        <name>plugin.includes</name>
+        <value>....|parse-(...|xpath)|....</value>
+    </property>
+    
+4- Must add "parse.xpath.plugin.file" property in "conf/nutch-site.xml" as follows:
+	<property>
+  		<name>parse.xpath.plugin.file</name>
+  		<value>xpath-query.xml</value>  
+	</property>    
+
+5- Must add  '<plugin id="parse-xpath" />' in '<mimetype>' tag at "conf/parse-plugins.xml" as follows:
+
+	<mimeType name="text/html">
+		<plugin id="parse-xpath" />
+        <plugin id="parse-html" />
+	</mimeType>
+	
+	<mimeType name="application/xhtml+xml">
+        <plugin id="parse-xpath" />
+		<plugin id="parse-html" />
+	</mimeType>
+	
+	*** '<plugin id="parse-xpath" />' must be the first plugin listed for the mimeType. Otherwise the parse-html plugin runs and the xpath parser does not.
+
+
+6- For a usage example, see src/plugin/parse-xpath/sample/xpath-query.xml.template.
+
+7- =========How to use xpath plugin?===========
+
+If you want to parse via xpath, you must add xpath queries to xpath-query.xml. See "xpath-query.xml.template" for an example.
+
+===Meanings of tags:
+name: Definition (name) of the entry.
+url: URL of the pages to parse (it is matched against the page URL as a regular expression).
+row:    If this tag is true and you add a query for the "url" field (<field>url</field>), each xpath result is added as a new row in the DB.
+    The default value is "true". This tag is optional.
+    If this tag is false, all the xpath results are added to the metadata (mtdt) field.
+query:  Holds an xpath query and its description. You can add more than one.
+field:  Names the field the xpath query result is written to. The fields "content", "title" and "url" are recognized. These fields are required unless the row tag is false.
+    If one of these fields is defined, the xpath query result is written to the corresponding place (title, content, url (base url)) in the DB.
+    You can also add custom fields (e.g. 'date') other than these.
+    If you add a custom field, the xpath query result is written to the metadata (mtdt) field in the DB.
+xpath:  The xpath expression is written in this tag.
+description:    Information about the entry and query tags.
+
+* If you want, you can run a query from your own Java class via the "getXpathResponse(String xpath, String url)" method of the XPathParser class.
+You must pass an xpath query and the URL of a parsed page to the method. It returns an ArrayList<String> (or null when nothing matches).
+
diff --git a/src/plugin/parse-xpath/sample/xpath-query.xml.template b/src/plugin/parse-xpath/sample/xpath-query.xml.template
new file mode 100644
index 0000000..faca788
--- /dev/null
+++ b/src/plugin/parse-xpath/sample/xpath-query.xml.template
@@ -0,0 +1,44 @@
+<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
+<entryList xmlns="http://lucene.apache.org/nutch"
+  xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+  xsi:schemaLocation="http://lucene.apache.org/nutch xpath-query.xsd">
+
+  <entry>
+    <name>webrazzi</name>
+    <url>http://www.webrazzi.com/</url>
+    <row>true</row>
+    <description>Blog Site</description>
+
+    <query>
+      <field>title</field>
+      <xpath>//div[@class='entry']/div/h2/a</xpath>
+      <description></description>
+    </query>
+    <query>
+      <field>url</field>
+      <xpath>//div[@class='entry']/div/h2/a/attribute::href</xpath>
+      <description>outlink</description>
+    </query>
+    <query>
+      <field>content</field>
+      <xpath>//div[@class='post']/p[2]</xpath>
+      <description>first content</description>
+    </query>
+    <query>
+      <field>content</field>
+      <xpath>//div[@class='post']/p[3]</xpath>
+      <description>another content</description>
+    </query>
+    <query>
+      <field>date</field>
+      <xpath>//p[@class='postinfo']/text()[2]</xpath>
+      <description></description>
+    </query>
+  </entry>
+
+ <!--
+  <entry>
+    another entry
+  </entry>
+  -->
+</entryList>
\ No newline at end of file
diff --git a/src/plugin/parse-xpath/sample/xpath-query.xsd b/src/plugin/parse-xpath/sample/xpath-query.xsd
new file mode 100644
index 0000000..6895721
--- /dev/null
+++ b/src/plugin/parse-xpath/sample/xpath-query.xsd
@@ -0,0 +1,38 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<xs:schema xmlns:xs="http://www.w3.org/2001/XMLSchema"
+           targetNamespace="http://lucene.apache.org/nutch"
+           xmlns="http://lucene.apache.org/nutch"
+           elementFormDefault="qualified">
+
+  <xs:element name="entryList" type="Entries"/>
+  <xs:complexType name="Entries">
+    <xs:sequence>
+      <xs:element name="entry" type="Entry" maxOccurs="unbounded" nillable="false"/>
+    </xs:sequence>
+  </xs:complexType>
+
+  <xs:complexType name="Entry">
+    <xs:sequence>
+      <xs:element name="name" type="str-min-1" maxOccurs="1" minOccurs="1" nillable="false"></xs:element>
+      <xs:element name="url" type="xs:string" maxOccurs="1" minOccurs="1" nillable="false"/>
+      <xs:element name="row" type="xs:boolean" default="true" maxOccurs="1" minOccurs="0" nillable="false"/>
+      <xs:element name="description" type="xs:string" maxOccurs="1" minOccurs="1"/>
+      <xs:element name="query" type="Query" maxOccurs="unbounded" nillable="false"/>
+    </xs:sequence>
+  </xs:complexType>
+
+  <xs:complexType name="Query">
+    <xs:sequence>
+      <xs:element name="field" type="str-min-1" maxOccurs="1" minOccurs="1" nillable="false"/>
+      <xs:element name="xpath" type="str-min-1" maxOccurs="1" minOccurs="1" nillable="false"/>
+      <xs:element name="description" type="xs:string" maxOccurs="1" minOccurs="1"/>
+    </xs:sequence>
+  </xs:complexType>
+
+  <xs:simpleType name="str-min-1">
+     <xs:restriction base="xs:string">
+       <xs:minLength value="1"/>
+     </xs:restriction>
+  </xs:simpleType>
+
+</xs:schema>
\ No newline at end of file
diff --git a/src/plugin/parse-xpath/src/java/org/apache/nutch/parse/xpath/XPathParser.java b/src/plugin/parse-xpath/src/java/org/apache/nutch/parse/xpath/XPathParser.java
new file mode 100644
index 0000000..bc49717
--- /dev/null
+++ b/src/plugin/parse-xpath/src/java/org/apache/nutch/parse/xpath/XPathParser.java
@@ -0,0 +1,322 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * @author cihad.guzel 
+ * @email: c.guzel.src@gmail.com
+ *
+ */
+
+
+package org.apache.nutch.parse.xpath;//Java
+
+import javax.xml.bind.JAXBContext;
+import javax.xml.bind.JAXBElement;
+import javax.xml.bind.JAXBException;
+import javax.xml.bind.Unmarshaller;
+import javax.xml.xpath.XPath;
+import javax.xml.xpath.XPathConstants;
+import javax.xml.xpath.XPathExpressionException;
+import javax.xml.xpath.XPathFactory;
+
+
+import org.apache.avro.util.Utf8;
+import org.apache.gora.store.DataStore;
+import org.apache.gora.util.GoraException;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseStatusCodes;
+import org.apache.nutch.parse.Parser;
+import org.apache.nutch.parse.xpath.entry.Entries;
+import org.apache.nutch.parse.xpath.entry.Entry;
+import org.apache.nutch.parse.xpath.entry.ObjectFactory;
+import org.apache.nutch.parse.xpath.entry.Query;
+import org.apache.nutch.storage.Mark;
+import org.apache.nutch.storage.ParseStatus;
+import org.apache.nutch.storage.StorageUtils;
+import org.apache.nutch.storage.WebPage;
+import org.apache.nutch.storage.WebPage.Field;
+import org.apache.nutch.util.TableUtil;
+import org.htmlcleaner.CleanerProperties;
+import org.htmlcleaner.DomSerializer;
+import org.htmlcleaner.HtmlCleaner;
+import org.htmlcleaner.TagNode;
+import org.w3c.dom.Document;
+import org.w3c.dom.NodeList;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Date;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+public class XPathParser implements Parser {
+
+  public static final Logger LOG = LoggerFactory
+      .getLogger("org.apache.nutch.parse.xpath");
+  private Configuration conf;
+  private static Collection<WebPage.Field> FIELDS = new HashSet<Field>();
+  private DataStore<String, WebPage> store = null;
+
+  static {
+    FIELDS.add(WebPage.Field.BASE_URL);
+    FIELDS.add(WebPage.Field.CONTENT_TYPE);
+  }
+
+  private static final String QUERY_FILE = "parse.xpath.plugin.file";
+  private static String QUERY_FILE_NAME;
+
+  private HashMap<String, Entry> entryMap = new HashMap<String, Entry>();
+
+  public XPathParser() {
+
+  }
+
+  /** Opens the web store and loads the entry definitions from the xpath query file. */
+  private void xpathInit() {
+    try {
+      store = StorageUtils.createWebStore(conf, String.class, WebPage.class);
+    } catch (Exception e) {
+      // GoraException is an Exception too, so one handler replaces the two identical ones.
+      LOG.error("Webpage is not read from Hbase. Error Message: " + e.getMessage());
+    }
+
+    JAXBElement<Entries> unmarshalledObject = null;
+    try {
+      JAXBContext jaxbContext = JAXBContext.newInstance(ObjectFactory.class);
+      Unmarshaller unmarshaller = jaxbContext.createUnmarshaller();
+
+      unmarshalledObject = (JAXBElement<Entries>) unmarshaller
+          .unmarshal(conf.getConfResourceAsInputStream(QUERY_FILE_NAME));
+    } catch (JAXBException e) {
+      LOG.error("Xpath parser have some error, ERROR Message: " + e.getMessage());
+    }
+    if (unmarshalledObject == null || unmarshalledObject.getValue() == null) {
+      // Unmarshalling failed (already logged); the old code dereferenced null here.
+      return;
+    }
+
+    for (Entry e : unmarshalledObject.getValue().getEntry()) {
+      if (e.getName() == null || e.getUrl() == null || e.getQuery() == null)
+        LOG.error("Unexpected format. Please complete for entry name: '"+ e.getName() +"' in "+ QUERY_FILE_NAME);
+      else
+        entryMap.put(e.getUrl(), e);
+    }
+  }
+
+  /** Evaluates {@code xpathString} against {@code doc} and returns the matching node set. */
+  public NodeList getNodes(String xpathString, Document doc) throws XPathExpressionException {
+    XPath evaluator = XPathFactory.newInstance().newXPath();
+    Object nodeSet = evaluator.compile(xpathString).evaluate(doc, XPathConstants.NODESET);
+
+    return (NodeList) nodeSet;
+  }
+
+  /**
+   * Cleans the raw page content with HtmlCleaner and converts it to a W3C DOM.
+   * Returns null (after logging) when there is no content or the conversion fails.
+   */
+  public Document getDoc(WebPage webpage, String url) {
+    ByteBuffer bb = (webpage == null) ? null : webpage.getContent();
+    if (bb == null) {
+      LOG.error("No content to parse for this url: " + url);
+      return null;
+    }
+    HtmlCleaner cleaner = new HtmlCleaner();
+    CleanerProperties props = cleaner.getProperties();
+    props.setAllowHtmlInsideAttributes(true);
+    props.setAllowMultiWordAttributes(true);
+    props.setRecognizeUnicodeChars(true);
+    props.setOmitComments(true);
+    Document doc = null;
+    try {
+      // Read only the buffer's valid window; array() alone ignores offset, position and limit.
+      TagNode node = cleaner.clean(new ByteArrayInputStream(bb.array(),
+          bb.arrayOffset() + bb.position(), bb.remaining()));
+      doc = (Document) new DomSerializer(new CleanerProperties()).createDOM(node);
+    } catch (Exception e) {
+      LOG.error("The page could not be converted to a DOM for this url: " + url
+          + " . Error Message: " + e.getMessage(), e);
+    }
+    return doc;
+  }
+
+  /** Runs the queries of the entry matching {@code url} and stores one WebPage per result row. */
+  public void parse(String url, WebPage webpage) {
+    // entryMap is keyed by URL pattern, so fall back to a regex match when the page
+    // URL is not literally a key (a plain get(url) returned null and crashed here).
+    Entry matched = entryMap.get(url);
+    if (matched == null) {
+      for (Map.Entry<String, Entry> candidate : entryMap.entrySet()) {
+        if (Pattern.compile(candidate.getKey()).matcher(url).find()) {
+          matched = candidate.getValue();
+          break;
+        }
+      }
+    }
+    // Build the DOM once instead of re-cleaning the page for every query.
+    Document doc = (matched == null) ? null : getDoc(webpage, url);
+    if (doc == null || store == null) {
+      LOG.error("Xpath parsing is skipped for this url: " + url);
+      return;
+    }
+    // README documents 'row' as defaulting to true, so a missing tag counts as true.
+    boolean asRow = !Boolean.FALSE.equals(matched.isRow());
+
+    Map<Query, NodeList> responseList = new HashMap<Query, NodeList>();
+    for (Query que : matched.getQuery()) {
+      try {
+        responseList.put(que, getNodes(que.getXpath(), doc));
+      } catch (XPathExpressionException e) {
+        LOG.error("The Xpath gives incorrect results for this url: "
+            + url + " . Please you check Xpath at "
+            + QUERY_FILE_NAME + "Error Message: " + e.getMessage());
+      }
+    }
+    int length = 0;
+    List<WebPage> pageList = new ArrayList<WebPage>();
+    for (Map.Entry<Query, NodeList> entry : responseList.entrySet()) {
+      NodeList nList = entry.getValue();
+      if (length == 0)
+        length = nList.getLength();
+      else if (length != nList.getLength()) {
+        LOG.error("The Xpath gives incorrect results for this url: "
+            + url + " . Please you check Xpath at "
+            + QUERY_FILE_NAME);
+        length = -1;
+        break;
+      }
+      if (pageList.isEmpty()) {
+        for (int i = 0; i < nList.getLength(); i++) {
+          pageList.add(new WebPage());
+        }
+      }
+      String field = entry.getKey().getField();
+      for (int i = 0; i < nList.getLength(); i++) {
+        WebPage p = pageList.get(i);
+        String value = nList.item(i).getTextContent();
+        if (asRow && "content".equals(field)) {
+          // results of several 'content' queries are joined with " - "
+          p.setText(new Utf8(p.getText() == null ? value : p.getText() + " - " + value));
+        } else if (asRow && "title".equals(field)) {
+          p.setTitle(new Utf8(value));
+        } else if (asRow && "url".equals(field)) {
+          p.setBaseUrl(new Utf8(value));
+        } else {
+          // custom fields (and every field when row is false) go to metadata as UTF-8
+          p.putToMetadata(new Utf8(field),
+              ByteBuffer.wrap(value.getBytes(java.nio.charset.Charset.forName("UTF-8"))));
+        }
+      }
+    }
+    if (length > 0) {
+      for (WebPage page : pageList) {
+        if (page.getBaseUrl() == null) {
+          // without a 'url' query result there is no row key to store the page under
+          LOG.error("No url field is extracted for a result row of this url: " + url);
+          continue;
+        }
+        try {
+          Mark.PARSE_MARK.putMark(page, "y");
+          page.setFetchTime(new Date().getTime());
+          ParseStatus status = new ParseStatus();
+          status.setMajorCode(ParseStatusCodes.SUCCESS);
+          page.setParseStatus(status);
+          //TODO check this line (Is signature field important? )
+          page.setSignature(ByteBuffer.wrap("0".getBytes()));
+          store.put(TableUtil.reverseUrl(page.getBaseUrl().toString()), page);
+        } catch (IOException e) {
+          LOG.error("Parse response not write to Webpage on Hbase. Error Message: " + e.getMessage());
+        }
+      }
+      store.flush();
+    }
+  }
+
+  /** Parses the page with the configured entry whose URL pattern matches {@code url}, if any. */
+  @Override
+  public Parse getParse(String url, WebPage webpage) {
+    LOG.info("Xpath plugin is started for "+ url);
+    // Initialise once: re-running xpathInit() per page opened a new web store each time.
+    if (store == null || entryMap.isEmpty()) {
+      xpathInit();
+    }
+    for (String urlPattern : entryMap.keySet()) {
+      if (Pattern.compile(urlPattern).matcher(url).find()) {
+        // parse() takes only the page URL, so repeating it per matching pattern adds nothing
+        parse(url, webpage);
+        break;
+      }
+    }
+
+    LOG.info("Xpath plugin is ended for "+ url);
+    return new Parse();
+  }
+
+
+  @Override
+  public Collection<Field> getFields() {
+    return FIELDS; // the set built in the static initializer; this used to return null
+  }
+
+  /** Evaluates {@code xpath} on the stored page for {@code url}; returns null when nothing matches. */
+  public ArrayList<String> getXpathResponse(String xpath, String url) throws IOException, ClassNotFoundException {
+    if (store == null)
+      xpathInit(); // the store is otherwise only opened from getParse()
+    WebPage webpage = (store == null) ? null : store.get(TableUtil.reverseUrl(url));
+    if (webpage == null)
+      return null; // page is not stored, or the store could not be opened
+    NodeList nodeList = null;
+    try {
+      nodeList = getNodes(xpath, getDoc(webpage, url));
+    } catch (XPathExpressionException e) {
+      LOG.error("The Xpath gives incorrect results for this url: "
+          + url + " . Please you check Xpath at "
+          + QUERY_FILE_NAME + "Error Message: " + e.getMessage());
+    }
+    if (nodeList == null || nodeList.getLength() == 0)
+      return null; // unchanged contract: no match yields null, not an empty list
+    ArrayList<String> response = new ArrayList<String>();
+    for (int i = 0; i < nodeList.getLength(); i++)
+      response.add(nodeList.item(i).getTextContent());
+    return response;
+  }
+
+  @Override
+  public Configuration getConf() {
+    return conf;
+  }
+
+  @Override
+  public void setConf(Configuration conf) {
+    // Remember the configuration and the query file name read from "parse.xpath.plugin.file".
+    this.conf = conf;
+    QUERY_FILE_NAME = conf.get(QUERY_FILE);
+  }
+
+}
\ No newline at end of file
diff --git a/src/plugin/parse-xpath/src/java/org/apache/nutch/parse/xpath/entry/Entries.java b/src/plugin/parse-xpath/src/java/org/apache/nutch/parse/xpath/entry/Entries.java
new file mode 100644
index 0000000..3a1c3ea
--- /dev/null
+++ b/src/plugin/parse-xpath/src/java/org/apache/nutch/parse/xpath/entry/Entries.java
@@ -0,0 +1,69 @@
+
+package org.apache.nutch.parse.xpath.entry;
+
+import java.util.ArrayList;
+import java.util.List;
+import javax.xml.bind.annotation.XmlAccessType;
+import javax.xml.bind.annotation.XmlAccessorType;
+import javax.xml.bind.annotation.XmlElement;
+import javax.xml.bind.annotation.XmlType;
+
+
+/**
+ * <p>Java class for Entries complex type.
+ * 
+ * <p>The following schema fragment specifies the expected content contained within this class.
+ * 
+ * <pre>
+ * &lt;complexType name="Entries">
+ *   &lt;complexContent>
+ *     &lt;restriction base="{http://www.w3.org/2001/XMLSchema}anyType">
+ *       &lt;sequence>
+ *         &lt;element name="entry" type="{http://lucene.apache.org/nutch}Entry" maxOccurs="unbounded"/>
+ *       &lt;/sequence>
+ *     &lt;/restriction>
+ *   &lt;/complexContent>
+ * &lt;/complexType>
+ * </pre>
+ * 
+ * 
+ */
+@XmlAccessorType(XmlAccessType.FIELD)
+@XmlType(name = "Entries", propOrder = {
+    "entry"
+})
+public class Entries {
+
+    @XmlElement(required = true)
+    protected List<Entry> entry;
+
+    /**
+     * Gets the value of the entry property.
+     * 
+     * <p>
+     * This accessor method returns a reference to the live list,
+     * not a snapshot. Therefore any modification you make to the
+     * returned list will be present inside the JAXB object.
+     * This is why there is not a <CODE>set</CODE> method for the entry property.
+     * 
+     * <p>
+     * For example, to add a new item, do as follows:
+     * <pre>
+     *    getEntry().add(newItem);
+     * </pre>
+     * 
+     * 
+     * <p>
+     * Objects of the following type(s) are allowed in the list
+     * {@link Entry }
+     * 
+     * 
+     */
+    public List<Entry> getEntry() {
+        if (entry == null) {
+            entry = new ArrayList<Entry>();
+        }
+        return this.entry;
+    }
+
+}
diff --git a/src/plugin/parse-xpath/src/java/org/apache/nutch/parse/xpath/entry/Entry.java b/src/plugin/parse-xpath/src/java/org/apache/nutch/parse/xpath/entry/Entry.java
new file mode 100644
index 0000000..b8eddcb
--- /dev/null
+++ b/src/plugin/parse-xpath/src/java/org/apache/nutch/parse/xpath/entry/Entry.java
@@ -0,0 +1,181 @@
+
+package org.apache.nutch.parse.xpath.entry;
+
+import java.util.ArrayList;
+import java.util.List;
+import javax.xml.bind.annotation.XmlAccessType;
+import javax.xml.bind.annotation.XmlAccessorType;
+import javax.xml.bind.annotation.XmlElement;
+import javax.xml.bind.annotation.XmlType;
+
+
+/**
+ * <p>Java class for Entry complex type.
+ * 
+ * <p>The following schema fragment specifies the expected content contained within this class.
+ * 
+ * <pre>
+ * &lt;complexType name="Entry">
+ *   &lt;complexContent>
+ *     &lt;restriction base="{http://www.w3.org/2001/XMLSchema}anyType">
+ *       &lt;sequence>
+ *         &lt;element name="name" type="{http://lucene.apache.org/nutch}str-min-1"/>
+ *         &lt;element name="url" type="{http://www.w3.org/2001/XMLSchema}string"/>
+ *         &lt;element name="row" type="{http://www.w3.org/2001/XMLSchema}boolean" minOccurs="0"/>
+ *         &lt;element name="description" type="{http://www.w3.org/2001/XMLSchema}string"/>
+ *         &lt;element name="query" type="{http://lucene.apache.org/nutch}Query" maxOccurs="unbounded"/>
+ *       &lt;/sequence>
+ *     &lt;/restriction>
+ *   &lt;/complexContent>
+ * &lt;/complexType>
+ * </pre>
+ * 
+ * 
+ */
+@XmlAccessorType(XmlAccessType.FIELD)
+@XmlType(name = "Entry", propOrder = {
+    "name",
+    "url",
+    "row",
+    "description",
+    "query"
+})
+public class Entry {
+
+    @XmlElement(required = true)
+    protected String name;
+    @XmlElement(required = true)
+    protected String url;
+    @XmlElement(defaultValue = "true")
+    protected Boolean row;
+    @XmlElement(required = true)
+    protected String description;
+    @XmlElement(required = true)
+    protected List<Query> query;
+
+    /**
+     * Gets the value of the name property.
+     * 
+     * @return
+     *     possible object is
+     *     {@link String }
+     *     
+     */
+    public String getName() {
+        return name;
+    }
+
+    /**
+     * Sets the value of the name property.
+     * 
+     * @param value
+     *     allowed object is
+     *     {@link String }
+     *     
+     */
+    public void setName(String value) {
+        this.name = value;
+    }
+
+    /**
+     * Gets the value of the url property.
+     * 
+     * @return
+     *     possible object is
+     *     {@link String }
+     *     
+     */
+    public String getUrl() {
+        return url;
+    }
+
+    /**
+     * Sets the value of the url property.
+     * 
+     * @param value
+     *     allowed object is
+     *     {@link String }
+     *     
+     */
+    public void setUrl(String value) {
+        this.url = value;
+    }
+
+    /**
+     * Gets the value of the row property.
+     * 
+     * @return
+     *     {@link Boolean#TRUE } when no row value was set (the documented
+     *     default), otherwise the configured {@link Boolean } value
+     *     
+     */
+    public Boolean isRow() {
+        return row == null ? Boolean.TRUE : row;
+    }
+
+    /**
+     * Sets the value of the row property.
+     * 
+     * @param value
+     *     allowed object is
+     *     {@link Boolean }
+     *     
+     */
+    public void setRow(Boolean value) {
+        this.row = value;
+    }
+
+    /**
+     * Gets the value of the description property.
+     * 
+     * @return
+     *     possible object is
+     *     {@link String }
+     *     
+     */
+    public String getDescription() {
+        return description;
+    }
+
+    /**
+     * Sets the value of the description property.
+     * 
+     * @param value
+     *     allowed object is
+     *     {@link String }
+     *     
+     */
+    public void setDescription(String value) {
+        this.description = value;
+    }
+
+    /**
+     * Gets the value of the query property.
+     * 
+     * <p>
+     * This accessor method returns a reference to the live list,
+     * not a snapshot. Therefore any modification you make to the
+     * returned list will be present inside the JAXB object.
+     * This is why there is not a <CODE>set</CODE> method for the query property.
+     * 
+     * <p>
+     * For example, to add a new item, do as follows:
+     * <pre>
+     *    getQuery().add(newItem);
+     * </pre>
+     * 
+     * 
+     * <p>
+     * Objects of the following type(s) are allowed in the list
+     * {@link Query }
+     * 
+     * 
+     */
+    public List<Query> getQuery() {
+        if (query == null) {
+            query = new ArrayList<Query>();
+        }
+        return this.query;
+    }
+
+}
diff --git a/src/plugin/parse-xpath/src/java/org/apache/nutch/parse/xpath/entry/ObjectFactory.java b/src/plugin/parse-xpath/src/java/org/apache/nutch/parse/xpath/entry/ObjectFactory.java
new file mode 100644
index 0000000..468c746
--- /dev/null
+++ b/src/plugin/parse-xpath/src/java/org/apache/nutch/parse/xpath/entry/ObjectFactory.java
@@ -0,0 +1,69 @@
+
+package org.apache.nutch.parse.xpath.entry;
+
+import javax.xml.bind.JAXBElement;
+import javax.xml.bind.annotation.XmlElementDecl;
+import javax.xml.bind.annotation.XmlRegistry;
+import javax.xml.namespace.QName;
+
+
+/**
+ * JAXB registry for the {@code org.apache.nutch.parse.xpath.entry}
+ * package.
+ * <p>It offers one factory method per class created here
+ * ({@link Entries}, {@link Entry} and {@link Query}), so that the
+ * Java representation of the XML content can be constructed
+ * programmatically, plus a factory method that wraps an
+ * {@link Entries} value in the {@code entryList} element of the
+ * {@code http://lucene.apache.org/nutch} namespace.
+ *
+ */
+@XmlRegistry
+public class ObjectFactory {
+
+    /** Namespace URI used for the {@code entryList} element. */
+    private static final String NAMESPACE_URI = "http://lucene.apache.org/nutch";
+    /** Qualified name of the {@code entryList} element. */
+    private static final QName ENTRY_LIST_QNAME = new QName(NAMESPACE_URI, "entryList");
+
+    /**
+     * Creates a factory for the classes of the
+     * {@code org.apache.nutch.parse.xpath.entry} package.
+     */
+    public ObjectFactory() {
+    }
+
+    /**
+     * Builds a new {@link Entries} object.
+     * @return a freshly constructed instance
+     */
+    public Entries createEntries() {
+        return new Entries();
+    }
+
+    /**
+     * Builds a new {@link Entry} object.
+     * @return a freshly constructed instance
+     */
+    public Entry createEntry() {
+        return new Entry();
+    }
+
+    /**
+     * Builds a new {@link Query} object.
+     * @return a freshly constructed instance
+     */
+    public Query createQuery() {
+        return new Query();
+    }
+
+    /**
+     * Wraps {@code value} in a {@link JAXBElement}{@code <}{@link Entries}{@code >}
+     * named {@code entryList}; the element scope is passed as {@code null}.
+     */
+    @XmlElementDecl(namespace = NAMESPACE_URI, name = "entryList")
+    public JAXBElement<Entries> createEntryList(Entries value) {
+        return new JAXBElement<Entries>(ENTRY_LIST_QNAME, Entries.class, null, value);
+    }
+
+}
diff --git a/src/plugin/parse-xpath/src/java/org/apache/nutch/parse/xpath/entry/Query.java b/src/plugin/parse-xpath/src/java/org/apache/nutch/parse/xpath/entry/Query.java
new file mode 100644
index 0000000..6d8b675
--- /dev/null
+++ b/src/plugin/parse-xpath/src/java/org/apache/nutch/parse/xpath/entry/Query.java
@@ -0,0 +1,118 @@
+
+package org.apache.nutch.parse.xpath.entry;
+
+import javax.xml.bind.annotation.XmlAccessType;
+import javax.xml.bind.annotation.XmlAccessorType;
+import javax.xml.bind.annotation.XmlElement;
+import javax.xml.bind.annotation.XmlType;
+
+
+/**
+ * <p>Java class for Query complex type.
+ * 
+ * <p>The following schema fragment specifies the expected content contained within this class.
+ * 
+ * <pre>
+ * &lt;complexType name="Query">
+ *   &lt;complexContent>
+ *     &lt;restriction base="{http://www.w3.org/2001/XMLSchema}anyType">
+ *       &lt;sequence>
+ *         &lt;element name="field" type="{http://lucene.apache.org/nutch}str-min-1"/>
+ *         &lt;element name="xpath" type="{http://lucene.apache.org/nutch}str-min-1"/>
+ *         &lt;element name="description" type="{http://www.w3.org/2001/XMLSchema}string"/>
+ *       &lt;/sequence>
+ *     &lt;/restriction>
+ *   &lt;/complexContent>
+ * &lt;/complexType>
+ * </pre>
+ * 
+ * 
+ */
+@XmlAccessorType(XmlAccessType.FIELD)
+@XmlType(name = "Query", propOrder = {"field", "xpath", "description"})
+public class Query {
+    /** Content of the required {@code field} element. */
+    @XmlElement(required = true)
+    protected String field;
+
+    /** Content of the required {@code xpath} element. */
+    @XmlElement(required = true)
+    protected String xpath;
+
+    /** Content of the required {@code description} element. */
+    @XmlElement(required = true)
+    protected String description;
+
+    /**
+     * Returns the value of the field property.
+     *
+     * <p>No copy or conversion is made.
+     *
+     * @return
+     *     the current {@link String} value
+     */
+    public String getField() {
+        return this.field;
+    }
+
+    /**
+     * Replaces the value of the field property.
+     *
+     * <p>The argument is stored without validation.
+     *
+     * @param value
+     *     the {@link String} to store
+     */
+    public void setField(String value) {
+        this.field = value;
+    }
+
+    /**
+     * Returns the value of the xpath property.
+     *
+     * <p>No copy or conversion is made.
+     *
+     * @return
+     *     the current {@link String} value
+     */
+    public String getXpath() {
+        return this.xpath;
+    }
+
+    /**
+     * Replaces the value of the xpath property.
+     *
+     * <p>The argument is stored without validation.
+     *
+     * @param value
+     *     the {@link String} to store
+     */
+    public void setXpath(String value) {
+        this.xpath = value;
+    }
+
+    /**
+     * Returns the value of the description property.
+     *
+     * <p>No copy or conversion is made.
+     *
+     * @return
+     *     the current {@link String} value
+     */
+    public String getDescription() {
+        return this.description;
+    }
+
+    /**
+     * Replaces the value of the description property.
+     *
+     * <p>The argument is stored without validation.
+     *
+     * @param value
+     *     the {@link String} to store
+     */
+    public void setDescription(String value) {
+        this.description = value;
+    }
+
+}
diff --git a/src/plugin/parse-xpath/src/java/org/apache/nutch/parse/xpath/entry/package-info.java b/src/plugin/parse-xpath/src/java/org/apache/nutch/parse/xpath/entry/package-info.java
new file mode 100644
index 0000000..4b4238d
--- /dev/null
+++ b/src/plugin/parse-xpath/src/java/org/apache/nutch/parse/xpath/entry/package-info.java
@@ -0,0 +1,3 @@
+/** JAXB-annotated classes; the schema annotation below binds this package to the http://lucene.apache.org/nutch namespace with qualified elements. */
+@javax.xml.bind.annotation.XmlSchema(namespace = "http://lucene.apache.org/nutch", elementFormDefault = javax.xml.bind.annotation.XmlNsForm.QUALIFIED)
+package org.apache.nutch.parse.xpath.entry;
diff --git a/src/plugin/parse-xpath/src/java/org/apache/nutch/parse/xpath/package.html b/src/plugin/parse-xpath/src/java/org/apache/nutch/parse/xpath/package.html
new file mode 100644
index 0000000..84f53ee
--- /dev/null
+++ b/src/plugin/parse-xpath/src/java/org/apache/nutch/parse/xpath/package.html
@@ -0,0 +1,5 @@
+<html>
+<body>
+<p>A plugin for parsing HTML documents using XPath expressions.</p>
+</body>
+</html>
-- 
1.8.1.4

