Index: conf/nutch-default.xml
===================================================================
--- conf/nutch-default.xml (revision 1682122)
+++ conf/nutch-default.xml (working copy)
@@ -1661,4 +1661,24 @@
+
+
+<property>
+  <name>selenium.take.screenshot</name>
+  <value>false</value>
+  <description>
+    Boolean property determining whether the protocol-selenium
+    WebDriver should capture a screenshot of the URL.
+  </description>
+</property>
+
+<property>
+  <name>selenium.screenshot.location</name>
+  <value></value>
+  <description>
+    The location on disk where a URL screenshot should be saved
+    to if the selenium.take.screenshot property is set to true.
+  </description>
+</property>
+
Index: src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java
===================================================================
--- src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java (revision 1682122)
+++ src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java (working copy)
@@ -16,15 +16,19 @@
*/
package org.apache.nutch.protocol.selenium;
+import org.apache.commons.io.FileUtils;
import org.apache.hadoop.conf.Configuration;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.openqa.selenium.By;
+import org.openqa.selenium.OutputType;
+import org.openqa.selenium.TakesScreenshot;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.firefox.FirefoxDriver;
import org.openqa.selenium.firefox.FirefoxProfile;
import org.openqa.selenium.support.ui.WebDriverWait;
+import java.io.File;
import java.lang.String;
public class HttpWebClient {
@@ -45,6 +49,17 @@
};
};
+ /**
+ * Function for obtaining the HTML BODY using the selected
+ * {@link org.openqa.selenium.WebDriver}.
+ * There are a number of configuration properties within
+ * {@code nutch-site.xml} which determine whether to
+ * take screenshots of the rendered pages and persist them
+ * as timestamped .png to local disk.
+ * @param url the URL to fetch and render
+ * @param conf the {@link org.apache.hadoop.conf.Configuration}
+ * @return the rendered inner HTML page
+ */
public static String getHtmlPage(String url, Configuration conf) {
WebDriver driver = null;
@@ -59,6 +74,15 @@
// Wait for the page to load, timeout after 3 seconds
new WebDriverWait(driver, 3);
+
+ if (conf.getBoolean("selenium.take.screenshot", false)) { // screenshots are opt-in
+ File scrFile = ((TakesScreenshot)driver).getScreenshotAs(OutputType.FILE);
+ LOG.debug("Screenshot taken of: {}", url);
+ if (conf.get("selenium.screenshot.location") != null) { // persist only when a target directory is configured
+ FileUtils.copyFile(scrFile, new File(conf.get("selenium.screenshot.location"), url.replaceAll("[^A-Za-z0-9._-]", "_") + System.currentTimeMillis() + ".png")); // File(parent, child) joins with the directory separator; URL characters unsafe in a file name become '_'
+ LOG.debug("Screenshot saved to: {}", conf.get("selenium.screenshot.location"));
+ }
+ }
String innerHtml = driver.findElement(By.tagName("body")).getAttribute("innerHTML");