### Eclipse Workspace Patch 1.0
#P nutch1.7
Index: src/java/org/apache/nutch/parse/ParseUtil.java
===================================================================
--- src/java/org/apache/nutch/parse/ParseUtil.java (revision 1573324)
+++ src/java/org/apache/nutch/parse/ParseUtil.java (working copy)
@@ -47,7 +47,10 @@
private ParserFactory parserFactory;
/** Parser timeout set to 30 sec by default. Set -1 to deactivate **/
private int maxParseTime = 30;
- private ExecutorService executorService;
+ // Number of parsing threads used within one map task
+ private int nThreads = 10;
+ // Static so that the ExecutorService is reused within the same map task
+ private static ExecutorService executorService;
/**
*
@@ -56,8 +59,14 @@
public ParseUtil(Configuration conf) {
this.parserFactory = new ParserFactory(conf);
maxParseTime=conf.getInt("parser.timeout", 30);
- executorService = Executors.newCachedThreadPool(new ThreadFactoryBuilder()
- .setNameFormat("parse-%d").setDaemon(true).build());
+ nThreads=conf.getInt("parse.threads.per.map", 10);
+ // No synchronization is needed for this lazy initialization:
+ // this constructor is called from the map method,
+ // and map processes records one at a time.
+ if(executorService==null){
+ executorService = Executors.newFixedThreadPool(nThreads, new ThreadFactoryBuilder()
+ .setNameFormat("parse-%d").setDaemon(true).build());
+ }
}
/**
Index: conf/nutch-default.xml
===================================================================
--- conf/nutch-default.xml (revision 1573324)
+++ conf/nutch-default.xml (working copy)
@@ -1005,8 +1005,17 @@
-
+ parse.threads.per.map
+ 10
+ The number of parsing threads that the parsing utilities should use
+ in one map task. This number determines the maximum size of the thread pool that is
+ created when content is first parsed (each map task uses its own independent thread pool).
+ The total number of threads running in distributed mode will be
+ the number of parsing threads * number of map tasks per node * number of nodes.
+
+
+
parse.plugin.file
parse-plugins.xml
The name of the file that defines the associations between