Index: conf/nutch-default.xml
===================================================================
--- conf/nutch-default.xml (revision 1682122)
+++ conf/nutch-default.xml (working copy)
@@ -1661,4 +1661,24 @@
+
+
+
+ selenium.take.screenshot
+ false
+
+ Boolean property determining whether the protocol-selenium
+ WebDriver should capture a screenshot of the URL.
+
+
+
+
+ selenium.screenshot.location
+
+
+ The location on disk where a URL screenshot should be saved
+ if the selenium.take.screenshot property is set to true.
+
+
+
Index: src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java
===================================================================
--- src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java (revision 1682122)
+++ src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java (working copy)
@@ -16,15 +16,19 @@
*/
package org.apache.nutch.protocol.selenium;
+import org.apache.commons.io.FileUtils;
import org.apache.hadoop.conf.Configuration;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.openqa.selenium.By;
+import org.openqa.selenium.OutputType;
+import org.openqa.selenium.TakesScreenshot;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.firefox.FirefoxDriver;
import org.openqa.selenium.firefox.FirefoxProfile;
import org.openqa.selenium.support.ui.WebDriverWait;
+import java.io.File;
import java.lang.String;
public class HttpWebClient {
@@ -59,6 +63,14 @@
// Wait for the page to load, timeout after 3 seconds
new WebDriverWait(driver, 3);
+
+ if (conf.getBoolean("selenium.take.screenshot", false)) { // screenshots are opt-in via configuration
+ File scrFile = ((TakesScreenshot)driver).getScreenshotAs(OutputType.FILE);
+ LOG.debug("Screenshot taken of: {}", url); // the {} placeholder is required for SLF4J to actually print the URL
+ if (!conf.get("selenium.screenshot.location", "").isEmpty()) { // unset or empty location: do not persist the screenshot
+ FileUtils.copyFile(scrFile, new File(conf.get("selenium.screenshot.location"), url.replaceAll("[^A-Za-z0-9._-]", "_") + ".png")); // flatten the URL into one safe file name inside the configured directory
+ }
+ }
String innerHtml = driver.findElement(By.tagName("body")).getAttribute("innerHTML");