diff --git build.xml build.xml
index c36d7ac..c29ba7c 100644
--- build.xml
+++ build.xml
@@ -933,6 +933,7 @@
-->
+
diff --git conf/elasticsearch.conf conf/elasticsearch.conf
new file mode 100644
index 0000000..c28c50b
--- /dev/null
+++ conf/elasticsearch.conf
@@ -0,0 +1,2 @@
+# Settings for Elasticsearch indexer plugin
+# Format: key=value\n
\ No newline at end of file
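
Note: the new conf/elasticsearch.conf is read as plain key=value pairs, one per line, by the Elasticsearch indexer plugin. The sketch below is purely illustrative of that format; the key names are assumptions and are not taken from this patch, so use whichever settings your Elasticsearch client actually expects.

  # hypothetical example entries (key names are assumptions, not part of this patch)
  cluster.name=nutch-crawl
  node.name=nutch-indexer
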
diff --git conf/nutch-default.xml conf/nutch-default.xml
index 39c0930..271194d 100644
--- conf/nutch-default.xml
+++ conf/nutch-default.xml
@@ -1,1262 +1,1428 @@
-
-
+
+
-
+
-
-
-
- store.ip.address
- false
- Enables us to capture the specific IP address
- (InetSocketAddress) of the host which we connect to via
- the given protocol.
-
-
-
-
-
-
- file.content.limit
- 65536
- The length limit for downloaded content using the file
- protocol, in bytes. If this value is nonnegative (>=0), content longer
- than it will be truncated; otherwise, no truncation at all. Do not
- confuse this setting with the http.content.limit setting.
-
-
-
-
- file.content.ignored
- true
- If true, no file content will be saved during fetch.
- And it is probably what we want to set most of time, since file:// URLs
- are meant to be local and we can always use them directly at parsing
- and indexing stages. Otherwise file contents will be saved.
- !! NO IMPLEMENTED YET !!
-
-
-
-
- file.crawl.parent
- true
- The crawler is not restricted to the directories that you specified in the
- urls file but it is jumping into the parent directories as well. For your own crawling you can
- change this behavior (set to false) the way that only directories beneath the directories that you specify get
- crawled.
-
-
-
-
-
-
-
- http.agent.name
-
- HTTP 'User-Agent' request header. MUST NOT be empty -
- please set this to a single word uniquely related to your organization.
-
- NOTE: You should also check other related properties:
-
- http.robots.agents
- http.agent.description
- http.agent.url
- http.agent.email
- http.agent.version
-
- and set their values appropriately.
-
-
-
-
-
- http.robots.agents
- *
- The agent strings we'll look for in robots.txt files,
- comma-separated, in decreasing order of precedence. You should
- put the value of http.agent.name as the first agent name, and keep the
- default * at the end of the list. E.g.: BlurflDev,Blurfl,*
-
-
-
-
- http.robots.403.allow
- true
- Some servers return HTTP status 403 (Forbidden) if
- /robots.txt doesn't exist. This should probably mean that we are
- allowed to crawl the site nonetheless. If this is set to false,
- then such sites will be treated as forbidden.
-
-
-
- http.agent.description
-
- Further description of our bot- this text is used in
- the User-Agent header. It appears in parenthesis after the agent name.
-
-
-
-
- http.agent.url
-
- A URL to advertise in the User-Agent header. This will
- appear in parenthesis after the agent name. Custom dictates that this
- should be a URL of a page explaining the purpose and behavior of this
- crawler.
-
-
-
-
- http.agent.email
-
- An email address to advertise in the HTTP 'From' request
- header and User-Agent header. A good practice is to mangle this
- address (e.g. 'info at example dot com') to avoid spamming.
-
-
-
-
- http.agent.version
- Nutch-2.3-SNAPSHOT
- A version string to advertise in the User-Agent
- header.
-
-
-
- http.agent.host
-
- Name or IP address of the host on which the Nutch crawler
- would be running. Currently this is used by 'protocol-httpclient'
- plugin.
-
-
-
-
- http.timeout
- 10000
- The default network timeout, in milliseconds.
-
-
-
- http.max.delays
- 100
- The number of times a thread will delay when trying to
- fetch a page. Each time it finds that a host is busy, it will wait
- fetcher.server.delay. After http.max.delays attepts, it will give
- up on the page for now.
-
-
-
- http.content.limit
- 65536
- The length limit for downloaded content using the http
- protocol, in bytes. If this value is nonnegative (>=0), content longer
- than it will be truncated; otherwise, no truncation at all. Do not
- confuse this setting with the file.content.limit setting.
-
-
-
-
- http.proxy.host
-
- The proxy hostname. If empty, no proxy is used.
-
-
-
- http.proxy.port
-
- The proxy port.
-
-
-
- http.proxy.username
-
- Username for proxy. This will be used by
- 'protocol-httpclient', if the proxy server requests basic, digest
- and/or NTLM authentication. To use this, 'protocol-httpclient' must
- be present in the value of 'plugin.includes' property.
- NOTE: For NTLM authentication, do not prefix the username with the
- domain, i.e. 'susam' is correct whereas 'DOMAIN\susam' is incorrect.
-
-
-
-
- http.proxy.password
-
- Password for proxy. This will be used by
- 'protocol-httpclient', if the proxy server requests basic, digest
- and/or NTLM authentication. To use this, 'protocol-httpclient' must
- be present in the value of 'plugin.includes' property.
-
-
-
-
- http.proxy.realm
-
- Authentication realm for proxy. Do not define a value
- if realm is not required or authentication should take place for any
- realm. NTLM does not use the notion of realms. Specify the domain name
- of NTLM authentication as the value for this property. To use this,
- 'protocol-httpclient' must be present in the value of
- 'plugin.includes' property.
-
-
-
-
- http.auth.file
- httpclient-auth.xml
- Authentication configuration file for
- 'protocol-httpclient' plugin.
-
-
-
-
- http.verbose
- false
- If true, HTTP will log more verbosely.
-
-
-
- http.useHttp11
- false
- NOTE: at the moment this works only for protocol-httpclient.
- If true, use HTTP 1.1, if false use HTTP 1.0 .
-
-
-
-
- http.accept.language
- en-us,en-gb,en;q=0.7,*;q=0.3
- Value of the "Accept-Language" request header field.
- This allows selecting non-English language as default one to retrieve.
- It is a useful setting for search engines build for certain national group.
-
-
-
-
- http.accept
- text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8
- Value of the "Accept" request header field.
-
-
-
-
- http.store.responsetime
- true
- Enables us to record the response time of the
- host which is the time period between start connection to end
- connection of a pages host.
-
-
-
-
-
- ftp.username
- anonymous
- ftp login username.
-
-
-
- ftp.password
- anonymous@example.com
- ftp login password.
-
-
-
- ftp.content.limit
- 65536
- The length limit for downloaded content, in bytes.
- If this value is nonnegative (>=0), content longer than it will be truncated;
- otherwise, no truncation at all.
- Caution: classical ftp RFCs never defines partial transfer and, in fact,
- some ftp servers out there do not handle client side forced close-down very
- well. Our implementation tries its best to handle such situations smoothly.
-
-
-
-
- ftp.timeout
- 60000
- Default timeout for ftp client socket, in millisec.
- Please also see ftp.keep.connection below.
-
-
-
- ftp.server.timeout
- 100000
- An estimation of ftp server idle time, in millisec.
- Typically it is 120000 millisec for many ftp servers out there.
- Better be conservative here. Together with ftp.timeout, it is used to
- decide if we need to delete (annihilate) current ftp.client instance and
- force to start another ftp.client instance anew. This is necessary because
- a fetcher thread may not be able to obtain next request from queue in time
- (due to idleness) before our ftp client times out or remote server
- disconnects. Used only when ftp.keep.connection is true (please see below).
-
-
-
-
- ftp.keep.connection
- false
- Whether to keep ftp connection. Useful if crawling same host
- again and again. When set to true, it avoids connection, login and dir list
- parser setup for subsequent urls. If it is set to true, however, you must
- make sure (roughly):
- (1) ftp.timeout is less than ftp.server.timeout
- (2) ftp.timeout is larger than (fetcher.threads.fetch * fetcher.server.delay)
- Otherwise there will be too many "delete client because idled too long"
- messages in thread logs.
-
-
-
- ftp.follow.talk
- false
- Whether to log dialogue between our client and remote
- server. Useful for debugging.
-
-
-
-
-
- db.fetch.interval.default
- 2592000
- The default number of seconds between re-fetches of a page (30 days).
-
-
-
-
- db.fetch.interval.max
- 7776000
- The maximum number of seconds between re-fetches of a page
- (90 days). After this period every page in the db will be re-tried, no
- matter what is its status.
-
-
-
-
- db.fetch.schedule.class
- org.apache.nutch.crawl.DefaultFetchSchedule
- The implementation of fetch schedule. DefaultFetchSchedule simply
- adds the original fetchInterval to the last fetch time, regardless of
- page changes.
-
-
-
- db.fetch.schedule.adaptive.inc_rate
- 0.4
- If a page is unmodified, its fetchInterval will be
- increased by this rate. This value should not
- exceed 0.5, otherwise the algorithm becomes unstable.
-
-
-
- db.fetch.schedule.adaptive.dec_rate
- 0.2
- If a page is modified, its fetchInterval will be
- decreased by this rate. This value should not
- exceed 0.5, otherwise the algorithm becomes unstable.
-
-
-
- db.fetch.schedule.adaptive.min_interval
- 60.0
- Minimum fetchInterval, in seconds.
-
-
-
- db.fetch.schedule.adaptive.max_interval
- 31536000.0
- Maximum fetchInterval, in seconds (365 days).
- NOTE: this is limited by db.fetch.interval.max. Pages with
- fetchInterval larger than db.fetch.interval.max
- will be fetched anyway.
-
-
-
- db.fetch.schedule.adaptive.sync_delta
- true
- If true, try to synchronize with the time of page change.
- by shifting the next fetchTime by a fraction (sync_rate) of the difference
- between the last modification time, and the last fetch time.
-
-
-
- db.fetch.schedule.adaptive.sync_delta_rate
- 0.3
- See sync_delta for description. This value should not
- exceed 0.5, otherwise the algorithm becomes unstable.
-
-
-
- db.update.additions.allowed
- true
- If true, updatedb will add newly discovered URLs, if false
- only already existing URLs in the CrawlDb will be updated and no new
- URLs will be added.
-
-
-
-
- db.update.max.inlinks
- 10000
- Maximum number of inlinks to take into account when updating
- a URL score in the crawlDB. Only the best scoring inlinks are kept.
-
-
-
-
- db.ignore.internal.links
- true
- If true, when adding new links to a page, links from
- the same host are ignored. This is an effective way to limit the
- size of the link database, keeping only the highest quality
- links.
-
-
-
-
- db.ignore.external.links
- false
- If true, outlinks leading from a page to external hosts
- will be ignored. This is an effective way to limit the crawl to include
- only initially injected hosts, without creating complex URLFilters.
-
-
-
-
- db.score.injected
- 1.0
- The score of new pages added by the injector.
-
-
-
-
- db.score.link.external
- 1.0
- The score factor for new pages added due to a link from
- another host relative to the referencing page's score. Scoring plugins
- may use this value to affect initial scores of external links.
-
-
-
-
- db.score.link.internal
- 1.0
- The score factor for pages added due to a link from the
- same host, relative to the referencing page's score. Scoring plugins
- may use this value to affect initial scores of internal links.
-
-
-
-
- db.score.count.filtered
- false
- The score value passed to newly discovered pages is
- calculated as a fraction of the original page score divided by the
- number of outlinks. If this option is false, only the outlinks that passed
- URLFilters will count, if it's true then all outlinks will count.
-
-
-
-
- db.max.outlinks.per.page
- 100
- The maximum number of outlinks that we'll process for a page.
- If this value is nonnegative (>=0), at most db.max.outlinks.per.page outlinks
- will be processed for a page; otherwise, all outlinks will be processed.
-
-
-
-
- db.max.anchor.length
- 100
- The maximum number of characters permitted in an anchor.
-
-
-
-
- db.parsemeta.to.crawldb
-
- Comma-separated list of parse metadata keys to transfer to the crawldb (NUTCH-779).
- Assuming for instance that the languageidentifier plugin is enabled, setting the value to 'lang'
- will copy both the key 'lang' and its value to the corresponding entry in the crawldb.
-
-
-
-
- db.fetch.retry.max
- 3
- The maximum number of times a url that has encountered
- recoverable errors is generated for fetch.
-
-
-
- db.signature.class
- org.apache.nutch.crawl.MD5Signature
- The default implementation of a page signature. Signatures
- created with this implementation will be used for duplicate detection
- and removal.
-
-
-
- db.signature.text_profile.min_token_len
- 2
- Minimum token length to be included in the signature.
-
-
-
-
- db.signature.text_profile.quant_rate
- 0.01
- Profile frequencies will be rounded down to a multiple of
- QUANT = (int)(QUANT_RATE * maxFreq), where maxFreq is a maximum token
- frequency. If maxFreq > 1 then QUANT will be at least 2, which means that
- for longer texts tokens with frequency 1 will always be discarded.
-
-
-
-
-
-
- generate.max.count
- -1
- The maximum number of urls in a single
- fetchlist. -1 if unlimited. The urls are counted according
- to the value of the parameter generator.count.mode.
-
-
-
-
- generate.max.distance
- -1
- The maximum distance of an URL that the generator is allowed
- to select for fetch. The distance is the smallest number of nodes (shortest path)
- of an URL from the original injected URL. (Injected URLs have distance 0).
-
-
-
-
- generate.count.mode
- host
- Determines how the URLs are counted for generator.max.count.
- Default value is 'host' but can be 'domain'. Note that we do not count
- per IP in the new version of the Generator.
-
-
-
-
- generate.update.crawldb
- false
- For highly-concurrent environments, where several
- generate/fetch/update cycles may overlap, setting this to true ensures
- that generate will create different fetchlists even without intervening
- updatedb-s, at the cost of running an additional job to update CrawlDB.
- If false, running generate twice without intervening
- updatedb will generate identical fetchlists.
-
-
-
-
- partition.url.mode
- byHost
- Determines how to partition URLs. Default value is 'byHost',
- also takes 'byDomain' or 'byIP'.
-
-
-
-
- crawl.gen.delay
- 604800000
-
- This value, expressed in days, defines how long we should keep the lock on records
- in CrawlDb that were just selected for fetching. If these records are not updated
- in the meantime, the lock is canceled, i.e. the become eligible for selecting.
- Default value of this is 7 days.
-
-
-
-
-
-
- fetcher.server.delay
- 5.0
- The number of seconds the fetcher will delay between
- successive requests to the same server.
-
-
-
- fetcher.server.min.delay
- 0.0
- The minimum number of seconds the fetcher will delay between
- successive requests to the same server. This value is applicable ONLY
- if fetcher.threads.per.host is greater than 1 (i.e. the host blocking
- is turned off).
-
-
-
- fetcher.max.crawl.delay
- 30
-
- If the Crawl-Delay in robots.txt is set to greater than this value (in
- seconds) then the fetcher will skip this page, generating an error report.
- If set to -1 the fetcher will never skip such pages and will wait the
- amount of time retrieved from robots.txt Crawl-Delay, however long that
- might be.
-
-
-
-
- fetcher.threads.fetch
- 10
- The number of FetcherThreads the fetcher should use.
- This is also determines the maximum number of requests that are
- made at once (each FetcherThread handles one connection). The total
- number of threads running in distributed mode will be the number of
- fetcher threads * number of nodes as fetcher has one map task per node.
-
-
-
-
- fetcher.threads.per.queue
- 1
- This number is the maximum number of threads that
- should be allowed to access a queue at one time.
-
-
-
- fetcher.queue.mode
- byHost
- Determines how the URLs are placed into queues.
- Allowed values are 'byHost', 'byDomain' and 'byIP'.
- The value would usually correspond to that of 'partition.url.mode'.
-
-
-
-
- fetcher.queue.use.host.settings
- false
- Allows us to optionally enable host specific queue behavior if present.
-
-
-
-
- fetcher.verbose
- false
- If true, fetcher will log more verbosely.
-
-
-
- fetcher.parse
- false
- If true, fetcher will parse content. NOTE: previous releases would
- default to true. Since 2.0 this is set to false as a safer default.
-
-
-
- fetcher.store.content
- true
- If true, fetcher will store content.
-
-
-
- fetcher.timelimit.mins
- -1
- This is the number of minutes allocated to the fetching.
- Once this value is reached, any remaining entry from the input URL list is skipped
- and all active queues are emptied. The default value of -1 deactivates the time limit.
-
-
-
-
- fetcher.max.exceptions.per.queue
- -1
- The maximum number of protocol-level exceptions (e.g. timeouts) per
- host (or IP) queue. Once this value is reached, any remaining entries from this
- queue are purged, effectively stopping the fetching from this host/IP. The default
- value of -1 deactivates this limit.
-
-
-
-
- fetcher.throughput.threshold.pages
- -1
- The threshold of minimum pages per second. If the fetcher downloads less
- pages per second than the configured threshold, the fetcher stops, preventing slow queue's
- from stalling the throughput. This threshold must be an integer. This can be useful when
- fetcher.timelimit.mins is hard to determine. The default value of -1 disables this check.
-
-
-
-
- fetcher.throughput.threshold.sequence
- 5
- The number of times the fetcher.throughput.threshold is allowed to be exceeded,
- in a row. This setting prevents accidental slow downs from stopping the fetcher.
-
-
-
-
- fetcher.throughput.threshold.check.after
- 5
- The number of minutes after which the throughput check is enabled.
-
-
-
- fetcher.queue.depth.multiplier
- 50
- (EXPERT)The fetcher buffers the incoming URLs into queues based on the [host|domain|IP]
- (see param fetcher.queue.mode). The depth of the queue is the number of threads times the value of this parameter.
- A large value requires more memory but can improve the performance of the fetch when the order of the URLS in the fetch list
- is not optimal.
-
-
-
-
-
-
- indexingfilter.order
-
- The order by which index filters are applied.
- If empty, all available index filters (as dictated by properties
- plugin-includes and plugin-excludes above) are loaded and applied in system
- defined order. If not empty, only named filters are loaded and applied
- in given order. For example, if this property has value:
- org.apache.nutch.indexer.basic.BasicIndexingFilter org.apache.nutch.indexer.more.MoreIndexingFilter
- then BasicIndexingFilter is applied first, and MoreIndexingFilter second.
-
- Filter ordering might have impact on result if one filter depends on output of
- another filter.
-
-
-
-
- indexer.score.power
- 0.5
- Used by the OPIC plugin. Determines the power of link analyis scores.
- Each pages's boost is set to scorescorePower where
- score is its link analysis score and scorePower is the
- value of this parameter. This is compiled into indexes, so, when
- this is changed, pages must be re-indexed for it to take
- effect.
-
-
-
-
-
- indexer.max.title.length
- 100
- The maximum number of characters of a title that are indexed. A value of -1 disables this check.
- Used by index-basic.
-
-
-
-
-
-
- moreIndexingFilter.indexMimeTypeParts
- true
- Determines whether the index-more plugin will split the mime-type
- in sub parts, this requires the type field to be multi valued. Set to true for backward
- compatibility. False will not split the mime-type.
-
-
-
-
-
-
- anchorIndexingFilter.deduplicate
- false
- With this enabled the indexer will case-insensitive deduplicate hanchors
- before indexing. This prevents possible hundreds or thousands of identical anchors for
- a given page to be indexed but will affect the search scoring (i.e. tf=1.0f).
-
-
-
-
-
-
- urlnormalizer.order
- org.apache.nutch.net.urlnormalizer.basic.BasicURLNormalizer org.apache.nutch.net.urlnormalizer.regex.RegexURLNormalizer
- Order in which normalizers will run. If any of these isn't
- activated it will be silently skipped. If other normalizers not on the
- list are activated, they will run in random order after the ones
- specified here are run.
-
-
-
-
- urlnormalizer.regex.file
- regex-normalize.xml
- Name of the config file used by the RegexUrlNormalizer class.
-
-
-
-
- urlnormalizer.loop.count
- 1
- Optionally loop through normalizers several times, to make
- sure that all transformations have been performed.
-
-
-
-
-
-
-
-
- mime.type.magic
- true
- Defines if the mime content type detector uses magic resolution.
-
-
-
-
-
-
- plugin.folders
- plugins
- Directories where nutch plugins are located. Each
- element may be a relative or absolute path. If absolute, it is used
- as is. If relative, it is searched for on the classpath.
-
-
-
- plugin.auto-activation
- true
- Defines if some plugins that are not activated regarding
- the plugin.includes and plugin.excludes properties must be automaticaly
- activated if they are needed by some actived plugins.
-
-
-
-
- plugin.includes
- protocol-http|urlfilter-regex|parse-(html|tika)|index-(basic|anchor)|indexer-solr|urlnormalizer-(pass|regex|basic)|scoring-opic
- Regular expression naming plugin directory names to
- include. Any plugin not matching this expression is excluded.
- In any case you need at least include the nutch-extensionpoints plugin. By
- default Nutch includes crawling just HTML and plain text via HTTP,
- and basic indexing and search plugins. In order to use HTTPS please enable
- protocol-httpclient, but be aware of possible intermittent problems with the
- underlying commons-httpclient library.
-
-
-
-
- plugin.excludes
-
- Regular expression naming plugin directory names to exclude.
-
-
-
-
-
-
- parse.plugin.file
- parse-plugins.xml
- The name of the file that defines the associations between
- content-types and parsers.
-
-
-
- parser.character.encoding.default
- windows-1252
- The character encoding to fall back to when no other information
- is available
-
-
-
- encodingdetector.charset.min.confidence
- -1
- A integer between 0-100 indicating minimum confidence value
- for charset auto-detection. Any negative value disables auto-detection.
-
-
-
-
- parser.caching.forbidden.policy
- content
- If a site (or a page) requests through its robot metatags
- that it should not be shown as cached content, apply this policy. Currently
- three keywords are recognized: "none" ignores any "noarchive" directives.
- "content" doesn't show the content, but shows summaries (snippets).
- "all" doesn't show either content or summaries.
-
-
-
-
- parser.html.impl
- neko
- HTML Parser implementation. Currently the following keywords
- are recognized: "neko" uses NekoHTML, "tagsoup" uses TagSoup.
-
-
-
-
- parser.html.form.use_action
- false
- If true, HTML parser will collect URLs from form action
- attributes. This may lead to undesirable behavior (submitting empty
- forms during next fetch cycle). If false, form action attribute will
- be ignored.
-
-
-
- parser.html.outlinks.ignore_tags
-
- Comma separated list of HTML tags, from which outlinks
- shouldn't be extracted. Nutch takes links from: a, area, form, frame,
- iframe, script, link, img. If you add any of those tags here, it
- won't be taken. Default is empty list. Probably reasonable value
- for most people would be "img,script,link".
-
-
-
- htmlparsefilter.order
-
- The order by which HTMLParse filters are applied.
- If empty, all available HTMLParse filters (as dictated by properties
- plugin-includes and plugin-excludes above) are loaded and applied in system
- defined order. If not empty, only named filters are loaded and applied
- in given order.
- HTMLParse filter ordering MAY have an impact
- on end result, as some filters could rely on the metadata generated by a previous filter.
-
-
-
-
- parser.timeout
- 30
- Timeout in seconds for the parsing of a document, otherwise treats it as an exception and
- moves on the the following documents. This parameter is applied to any Parser implementation.
- Set to -1 to deactivate, bearing in mind that this could cause
- the parsing to crash because of a very long or corrupted document.
-
-
-
-
- parser.skip.truncated
- true
- Boolean value for whether we should skip parsing for truncated documents. By default this
- property is activated due to extremely high levels of CPU which parsing can sometimes take.
-
-
-
-
-
-
- urlfilter.domain.file
- domain-urlfilter.txt
- Name of file on CLASSPATH containing either top level domains or
- hostnames used by urlfilter-domain (DomainURLFilter) plugin.
-
-
-
- urlfilter.regex.file
- regex-urlfilter.txt
- Name of file on CLASSPATH containing regular expressions
- used by urlfilter-regex (RegexURLFilter) plugin.
-
-
-
- urlfilter.automaton.file
- automaton-urlfilter.txt
- Name of file on CLASSPATH containing regular expressions
- used by urlfilter-automaton (AutomatonURLFilter) plugin.
-
-
-
- urlfilter.prefix.file
- prefix-urlfilter.txt
- Name of file on CLASSPATH containing url prefixes
- used by urlfilter-prefix (PrefixURLFilter) plugin.
-
-
-
- urlfilter.suffix.file
- suffix-urlfilter.txt
- Name of file on CLASSPATH containing url suffixes
- used by urlfilter-suffix (SuffixURLFilter) plugin.
-
-
-
- urlfilter.order
-
- The order by which url filters are applied.
- If empty, all available url filters (as dictated by properties
- plugin-includes and plugin-excludes above) are loaded and applied in system
- defined order. If not empty, only named filters are loaded and applied
- in given order. For example, if this property has value:
- org.apache.nutch.urlfilter.regex.RegexURLFilter org.apache.nutch.urlfilter.prefix.PrefixURLFilter
- then RegexURLFilter is applied first, and PrefixURLFilter second.
- Since all filters are AND'ed, filter ordering does not have impact
- on end result, but it may have performance implication, depending
- on relative expensiveness of filters.
-
-
-
-
-
-
- scoring.filter.order
-
- The order in which scoring filters are applied.
- This may be left empty (in which case all available scoring
- filters will be applied in the order defined in plugin-includes
- and plugin-excludes), or a space separated list of implementation
- classes.
-
-
-
-
-
-
- lang.ngram.min.length
- 1
- The minimum size of ngrams to uses to identify
- language (must be between 1 and lang.ngram.max.length).
- The larger is the range between lang.ngram.min.length and
- lang.ngram.max.length, the better is the identification, but
- the slowest it is.
-
-
-
-
- lang.ngram.max.length
- 4
- The maximum size of ngrams to uses to identify
- language (must be between lang.ngram.min.length and 4).
- The larger is the range between lang.ngram.min.length and
- lang.ngram.max.length, the better is the identification, but
- the slowest it is.
-
-
-
-
- lang.analyze.max.length
- 2048
- The maximum bytes of data to uses to indentify
- the language (0 means full content analysis).
- The larger is this value, the better is the analysis, but the
- slowest it is.
-
-
-
-
- lang.extraction.policy
- detect,identify
- This determines when the plugin uses detection and
- statistical identification mechanisms. The order in which the
- detect and identify are written will determine the extraction
- policy. Default case (detect,identify) means the plugin will
- first try to extract language info from page headers and metadata,
- if this is not successful it will try using tika language
- identification. Possible values are:
- detect
- identify
- detect,identify
- identify,detect
-
-
-
-
- lang.identification.only.certain
- false
- If set to true with lang.extraction.policy containing identify,
- the language code returned by Tika will be assigned to the document ONLY
- if it is deemed certain by Tika.
-
-
-
-
-
-
- hadoop.job.history.user.location
- ${hadoop.log.dir}/history/user
- Hadoop 0.17.x comes with a default setting to create
- user logs inside the output path of the job. This breaks some
- Hadoop classes, which expect the output to contain only
- part-XXXXX files. This setting changes the output to a
- subdirectory of the regular log directory.
-
-
-
-
-
-
- solr.mapping.file
- solrindex-mapping.xml
-
- Defines the name of the file that will be used in the mapping of internal
- nutch field names to solr index fields as specified in the target Solr schema.
-
-
-
-
- solr.commit.size
- 250
-
- Defines the number of documents to send to Solr in a single update batch.
- Decrease when handling very large documents to prevent Nutch from running
- out of memory. NOTE: It does not explicitly trigger a server side commit.
-
-
-
-
- solr.commit.index
- true
-
- When closing the indexer, trigger a commit to the Solr server.
-
-
-
-
- solr.auth
- false
-
- Whether to enable HTTP basic authentication for communicating with Solr.
- Use the solr.auth.username and solr.auth.password properties to configure
- your credentials.
-
-
-
-
-
-
- elastic.index
- index
-
- The name of the elasticsearch index. Will normally be autocreated if it
- doesn't exist.
-
-
-
-
- elastic.max.bulk.docs
- 500
-
- The number of docs in the batch that will trigger a flush to elasticsearch.
-
-
-
-
- elastic.max.bulk.size
- 5001001
-
- The total length of all indexed text in a batch that will trigger a flush to
- elasticsearch, by checking after every document for excess of this amount.
-
-
-
-
-
-
- storage.data.store.class
- org.apache.gora.memory.store.MemStore
- The Gora DataStore class for storing and retrieving data.
- Currently the following stores are available:
-
- org.apache.gora.sql.store.SqlStore
- Default store. A DataStore implementation for RDBMS with a SQL interface.
- SqlStore uses JDBC drivers to communicate with the DB. As explained in
- ivy.xml, currently >= gora-core 0.3 is not backwards compatable with
- SqlStore.
-
- org.apache.gora.cassandra.store.CassandraStore
- Gora class for storing data in Apache Cassandra.
-
- org.apache.gora.hbase.store.HBaseStore
- Gora class for storing data in Apache HBase.
-
- org.apache.gora.accumulo.store.AccumuloStore
- Gora class for storing data in Apache Accumulo.
-
- org.apache.gora.avro.store.AvroStore
- Gora class for storing data in Apache Avro.
-
- org.apache.gora.avro.store.DataFileAvroStore
- Gora class for storing data in Apache Avro. DataFileAvroStore is
- a file based store which uses Avro's DataFile{Writer,Reader}'s as a backend.
- This datastore supports mapreduce.
-
- org.apache.gora.memory.store.MemStore
- Gora class for storing data in a Memory based implementation for tests.
-
-
-
-
- storage.schema.webpage
- webpage
- This value holds the schema name used for Nutch web db.
- Note that Nutch ignores the value in the gora mapping files, and uses
- this as the webpage schema name.
-
-
-
-
- storage.schema.host
- host
- This value holds the schema name used for Nutch host db.
- Note that Nutch ignores the value in the gora mapping files, and uses
- this as the host schema name.
-
-
-
-
- storage.crawl.id
-
- This value helps differentiate between the datasets that
- the jobs in the crawl cycle generate and operate on. The value will
- be input to all the jobs which then will use it as a prefix when
- accessing to the schemas. The default configuration uses no id to prefix
- the schemas. The value could also be given as a command line argument
- to each job.
-
-
-
-
- gora.buffer.read.limit
- 10000
- The maximum number of buffered Records we wish to
- read in one batch. @see org.apache.gora.mapreduce.GoraRecordReader
-
-
-
-
- gora.buffer.write.limit
- 10000
- Configures (for the Hadoop record writer) the maximum number of
- buffered Records we wish to regularly flush to the Gora datastore.
- @see org.apache.gora.mapreduce.GoraRecordWriter.
-
-
+
+
+
+ store.ip.address
+ false
+ Enables us to capture the specific IP address
+ (InetSocketAddress) of the host which we connect to via
+ the given protocol.
+
+
+
+
+
+
+ file.content.limit
+ 65536
+ The length limit for downloaded content using the file
+ protocol, in bytes. If this value is nonnegative (>=0), content
+ longer
+ than it will be truncated; otherwise, no truncation at all. Do not
+ confuse this setting with the http.content.limit setting.
+
+
+
+
+ file.content.ignored
+ true
+ If true, no file content will be saved during fetch.
+ And it is probably what we want to set most of the time, since file:// URLs
+ are meant to be local and we can always use them directly at parsing
+ and indexing stages. Otherwise file contents will be saved.
+ !! NOT IMPLEMENTED YET !!
+
+
+
+
+ file.crawl.parent
+ true
+ The crawler is not restricted to the directories that you
+ specified in the
+ urls file; it also crawls the parent directories. For your
+ own crawling you can
+ change this behavior (set to false) so that only directories beneath
+ the directories that you specify get
+ crawled.
+
+
+
+
+
+
+
+ http.agent.name
+
+ HTTP 'User-Agent' request header. MUST NOT be empty -
+ please set this to a single word uniquely related to your
+ organization.
+
+ NOTE: You should also check other related properties:
+
+ http.robots.agents
+ http.agent.description
+ http.agent.url
+ http.agent.email
+ http.agent.version
+
+ and set their values appropriately.
+
+
+
+
+
+ http.robots.agents
+ *
+ The agent strings we'll look for in robots.txt files,
+ comma-separated, in decreasing order of precedence. You should
+ put the value of http.agent.name as the first agent name, and keep the
+ default * at the end of the list. E.g.: BlurflDev,Blurfl,*
+
+
+
+
+ http.robots.403.allow
+ true
+ Some servers return HTTP status 403 (Forbidden) if
+ /robots.txt doesn't exist. This should probably mean that we are
+ allowed to crawl the site nonetheless. If this is set to false,
+ then such sites will be treated as forbidden.
+
+
+
+
+ http.agent.description
+
+ Further description of our bot- this text is used in
+ the User-Agent header. It appears in parenthesis after the agent name.
+
+
+
+
+ http.agent.url
+
+ A URL to advertise in the User-Agent header. This will
+ appear in parenthesis after the agent name. Custom dictates that this
+ should be a URL of a page explaining the purpose and behavior of this
+ crawler.
+
+
+
+
+ http.agent.email
+
+ An email address to advertise in the HTTP 'From' request
+ header and User-Agent header. A good practice is to mangle this
+ address (e.g. 'info at example dot com') to avoid spamming.
+
+
+
+
+ http.agent.version
+ Nutch-2.3-SNAPSHOT
+ A version string to advertise in the User-Agent
+ header.
+
+
+
+
+ http.agent.host
+
+ Name or IP address of the host on which the Nutch crawler
+ would be running. Currently this is used by 'protocol-httpclient'
+ plugin.
+
+
+
+
+ http.timeout
+ 10000
+ The default network timeout, in milliseconds.
+
+
+
+
+ http.max.delays
+ 100
+ The number of times a thread will delay when trying to
+ fetch a page. Each time it finds that a host is busy, it will wait
+ fetcher.server.delay. After http.max.delays attempts, it will give
+ up on the page for now.
+
+
+
+
+ http.content.limit
+ 65536
+ The length limit for downloaded content using the http
+ protocol, in bytes. If this value is nonnegative (>=0), content
+ longer
+ than it will be truncated; otherwise, no truncation at all. Do not
+ confuse this setting with the file.content.limit setting.
+
+
+
+
+ http.proxy.host
+
+ The proxy hostname. If empty, no proxy is used.
+
+
+
+
+ http.proxy.port
+
+ The proxy port.
+
+
+
+ http.proxy.username
+
+ Username for proxy. This will be used by
+ 'protocol-httpclient', if the proxy server requests basic, digest
+ and/or NTLM authentication. To use this, 'protocol-httpclient' must
+ be present in the value of 'plugin.includes' property.
+ NOTE: For NTLM authentication, do not prefix the username with the
+ domain, i.e. 'susam' is correct whereas 'DOMAIN\susam' is incorrect.
+
+
+
+
+ http.proxy.password
+
+ Password for proxy. This will be used by
+ 'protocol-httpclient', if the proxy server requests basic, digest
+ and/or NTLM authentication. To use this, 'protocol-httpclient' must
+ be present in the value of 'plugin.includes' property.
+
+
+
+
+ http.proxy.realm
+
+ Authentication realm for proxy. Do not define a value
+ if realm is not required or authentication should take place for any
+ realm. NTLM does not use the notion of realms. Specify the domain
+ name
+ of NTLM authentication as the value for this property. To use this,
+ 'protocol-httpclient' must be present in the value of
+ 'plugin.includes' property.
+
+
+
+
+ http.auth.file
+ httpclient-auth.xml
+ Authentication configuration file for
+ 'protocol-httpclient' plugin.
+
+
+
+
+ http.verbose
+ false
+ If true, HTTP will log more verbosely.
+
+
+
+ http.useHttp11
+ false
+ NOTE: at the moment this works only for
+ protocol-httpclient.
+ If true, use HTTP 1.1, if false use HTTP 1.0 .
+
+
+
+
+ http.accept.language
+ en-us,en-gb,en;q=0.7,*;q=0.3
+ Value of the "Accept-Language" request header field.
+ This allows selecting a non-English language as the default one to retrieve.
+ It is a useful setting for search engines built for a certain national
+ group.
+
+
+
+
+ http.accept
+ text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8
+
+ Value of the "Accept" request header field.
+
+
+
+
+ http.store.responsetime
+ true
+ Enables us to record the response time of the
+ host, which is the time period between the start and the end of the
+ connection to a page's host.
+
+
+
+
+
+
+ ftp.username
+ anonymous
+ ftp login username.
+
+
+
+ ftp.password
+ anonymous@example.com
+ ftp login password.
+
+
+
+ ftp.content.limit
+ 65536
+ The length limit for downloaded content, in bytes.
+ If this value is nonnegative (>=0), content longer than it will be
+ truncated;
+ otherwise, no truncation at all.
+ Caution: classical ftp RFCs never define partial transfer and, in fact,
+ some ftp servers out there do not handle client side forced close-down
+ very
+ well. Our implementation tries its best to handle such situations
+ smoothly.
+
+
+
+
+ ftp.timeout
+ 60000
+ Default timeout for ftp client socket, in millisec.
+ Please also see ftp.keep.connection below.
+
+
+
+
+ ftp.server.timeout
+ 100000
+ An estimation of ftp server idle time, in millisec.
+ Typically it is 120000 millisec for many ftp servers out there.
+ Better be conservative here. Together with ftp.timeout, it is used to
+ decide if we need to delete (annihilate) current ftp.client instance
+ and
+ force to start another ftp.client instance anew. This is necessary because
+ a fetcher thread may not be able to obtain next request from queue in
+ time
+ (due to idleness) before our ftp client times out or remote server
+ disconnects. Used only when ftp.keep.connection is true (please see
+ below).
+
+
+
+
+ ftp.keep.connection
+ false
+ Whether to keep ftp connection. Useful if crawling same
+ host
+ again and again. When set to true, it avoids connection, login and dir
+ list
+ parser setup for subsequent urls. If it is set to true, however, you must
+ make sure (roughly):
+ (1) ftp.timeout is less than ftp.server.timeout
+ (2) ftp.timeout is larger than (fetcher.threads.fetch *
+ fetcher.server.delay)
+ Otherwise there will be too many "delete client because idled too long"
+ messages in thread logs.
+
+
+
+
+ ftp.follow.talk
+ false
+ Whether to log dialogue between our client and remote
+ server. Useful for debugging.
+
+
+
+
+
+
+ db.fetch.interval.default
+ 2592000
+ The default number of seconds between re-fetches of a
+ page (30 days).
+
+
+
+
+ db.fetch.interval.max
+ 7776000
+ The maximum number of seconds between re-fetches of a
+ page
+ (90 days). After this period every page in the db will be re-tried, no
+ matter what is its status.
+
+
+
+
+ db.fetch.schedule.class
+ org.apache.nutch.crawl.DefaultFetchSchedule
+ The implementation of fetch schedule.
+ DefaultFetchSchedule simply
+ adds the original fetchInterval to the last fetch time, regardless of
+ page changes.
+
+
+
+
+ db.fetch.schedule.adaptive.inc_rate
+ 0.4
+ If a page is unmodified, its fetchInterval will be
+ increased by this rate. This value should not
+ exceed 0.5, otherwise the algorithm becomes unstable.
+
+
+
+
+ db.fetch.schedule.adaptive.dec_rate
+ 0.2
+ If a page is modified, its fetchInterval will be
+ decreased by this rate. This value should not
+ exceed 0.5, otherwise the algorithm becomes unstable.
+
+
+
+
+ db.fetch.schedule.adaptive.min_interval
+ 60.0
+ Minimum fetchInterval, in seconds.
+
+
+
+ db.fetch.schedule.adaptive.max_interval
+ 31536000.0
+ Maximum fetchInterval, in seconds (365 days).
+ NOTE: this is limited by db.fetch.interval.max. Pages with
+ fetchInterval larger than db.fetch.interval.max
+ will be fetched anyway.
+
+
+
+
+ db.fetch.schedule.adaptive.sync_delta
+ true
+ If true, try to synchronize with the time of page change
+ by shifting the next fetchTime by a fraction (sync_rate) of the
+ difference
+ between the last modification time, and the last fetch time.
+
+
+
+
+ db.fetch.schedule.adaptive.sync_delta_rate
+ 0.3
+ See sync_delta for description. This value should not
+ exceed 0.5, otherwise the algorithm becomes unstable.
+
+
+
+
+ db.update.additions.allowed
+ true
+ If true, updatedb will add newly discovered URLs, if
+ false
+ only already existing URLs in the CrawlDb will be updated and no new
+ URLs will be added.
+
+
+
+
+ db.update.max.inlinks
+ 10000
+ Maximum number of inlinks to take into account when
+ updating
+ a URL score in the crawlDB. Only the best scoring inlinks are kept.
+
+
+
+
+ db.ignore.internal.links
+ true
+ If true, when adding new links to a page, links from
+ the same host are ignored. This is an effective way to limit the
+ size of the link database, keeping only the highest quality
+ links.
+
+
+
+
+ db.ignore.external.links
+ false
+ If true, outlinks leading from a page to external hosts
+ will be ignored. This is an effective way to limit the crawl to
+ include
+ only initially injected hosts, without creating complex URLFilters.
+
+
+
+
+ db.score.injected
+ 1.0
+ The score of new pages added by the injector.
+
+
+
+
+ db.score.link.external
+ 1.0
+ The score factor for new pages added due to a link from
+ another host relative to the referencing page's score. Scoring
+ plugins
+ may use this value to affect initial scores of external links.
+
+
+
+
+ db.score.link.internal
+ 1.0
+ The score factor for pages added due to a link from the
+ same host, relative to the referencing page's score. Scoring plugins
+ may use this value to affect initial scores of internal links.
+
+
+
+
+ db.score.count.filtered
+ false
+ The score value passed to newly discovered pages is
+ calculated as a fraction of the original page score divided by the
+ number of outlinks. If this option is false, only the outlinks that
+ passed
+ URLFilters will count, if it's true then all outlinks will count.
+
+
+
+
+ db.max.outlinks.per.page
+ 100
+ The maximum number of outlinks that we'll process for a
+ page.
+ If this value is nonnegative (>=0), at most db.max.outlinks.per.page
+ outlinks
+ will be processed for a page; otherwise, all outlinks will be processed.
+
+
+
+
+ db.max.anchor.length
+ 100
+ The maximum number of characters permitted in an anchor.
+
+
+
+
+ db.parsemeta.to.crawldb
+
+ Comma-separated list of parse metadata keys to transfer
+ to the crawldb (NUTCH-779).
+ Assuming for instance that the languageidentifier plugin is enabled, setting
+ the value to 'lang'
+ will copy both the key 'lang' and its value to the corresponding entry in
+ the crawldb.
+
+
+
+
+ db.fetch.retry.max
+ 3
+ The maximum number of times a url that has encountered
+ recoverable errors is generated for fetch.
+
+
+
+
+ db.signature.class
+ org.apache.nutch.crawl.MD5Signature
+ The default implementation of a page signature.
+ Signatures
+ created with this implementation will be used for duplicate detection
+ and removal.
+
+
+
+
+ db.signature.text_profile.min_token_len
+ 2
+ Minimum token length to be included in the signature.
+
+
+
+
+ db.signature.text_profile.quant_rate
+ 0.01
+ Profile frequencies will be rounded down to a multiple of
+ QUANT = (int)(QUANT_RATE * maxFreq), where maxFreq is a maximum token
+ frequency. If maxFreq > 1 then QUANT will be at least 2, which means
+ that
+ for longer texts tokens with frequency 1 will always be discarded.
+
+
+
+
+
+
+ generate.max.count
+ -1
+ The maximum number of urls in a single
+ fetchlist. -1 if unlimited. The urls are counted according
+ to the value of the parameter generator.count.mode.
+
+
+
+
+ generate.max.distance
+ -1
+ The maximum distance of an URL that the generator is
+ allowed
+ to select for fetch. The distance is the smallest number of nodes
+ (shortest path)
+ of an URL from the original injected URL. (Injected URLs have distance
+ 0).
+
+
+
+
+ generate.count.mode
+ host
+ Determines how the URLs are counted for
+ generator.max.count.
+ Default value is 'host' but can be 'domain'. Note that we do not count
+ per IP in the new version of the Generator.
+
+
+
+
+ generate.update.crawldb
+ false
+ For highly-concurrent environments, where several
+ generate/fetch/update cycles may overlap, setting this to true
+ ensures
+ that generate will create different fetchlists even without intervening
+ updatedb-s, at the cost of running an additional job to update
+ CrawlDB.
+ If false, running generate twice without intervening
+ updatedb will generate identical fetchlists.
+
+
+
+
+
+ partition.url.mode
+ byHost
+ Determines how to partition URLs. Default value is
+ 'byHost',
+ also takes 'byDomain' or 'byIP'.
+
+
+
+
+ crawl.gen.delay
+ 604800000
+
+ This value, expressed in milliseconds, defines how long we should keep the lock
+ on records
+ in CrawlDb that were just selected for fetching. If these records are
+ not updated
+ in the meantime, the lock is canceled, i.e. they become eligible for
+ selecting.
+ Default value of this is 7 days.
+
+
+
+
+
+
+ fetcher.server.delay
+ 5.0
+ The number of seconds the fetcher will delay between
+ successive requests to the same server.
+
+
+
+
+ fetcher.server.min.delay
+ 0.0
+ The minimum number of seconds the fetcher will delay
+ between
+ successive requests to the same server. This value is applicable ONLY
+ if fetcher.threads.per.host is greater than 1 (i.e. the host blocking
+ is turned off).
+
+
+
+
+ fetcher.max.crawl.delay
+ 30
+
+ If the Crawl-Delay in robots.txt is set to greater than this value (in
+ seconds) then the fetcher will skip this page, generating an error
+ report.
+ If set to -1 the fetcher will never skip such pages and will wait the
+ amount of time retrieved from robots.txt Crawl-Delay, however long
+ that
+ might be.
+
+
+
+
+ fetcher.threads.fetch
+ 10
+ The number of FetcherThreads the fetcher should use.
+ This also determines the maximum number of requests that are
+ made at once (each FetcherThread handles one connection). The total
+ number of threads running in distributed mode will be the number of
+ fetcher threads * number of nodes as fetcher has one map task per
+ node.
+
+
+
+
+ fetcher.threads.per.queue
+ 1
+ This number is the maximum number of threads that
+ should be allowed to access a queue at one time.
+
+
+
+
+ fetcher.queue.mode
+ byHost
+ Determines how the URLs are placed into queues.
+ Allowed values are 'byHost', 'byDomain' and 'byIP'.
+ The value would usually correspond to that of 'partition.url.mode'.
+
+
+
+
+ fetcher.queue.use.host.settings
+ false
+ Allows us to optionally enable host specific queue
+ behavior if present.
+
+
+
+
+ fetcher.verbose
+ false
+ If true, fetcher will log more verbosely.
+
+
+
+ fetcher.parse
+ false
+ If true, fetcher will parse content. NOTE: previous
+ releases would
+ default to true. Since 2.0 this is set to false as a safer default.
+
+
+
+
+ fetcher.store.content
+ true
+ If true, fetcher will store content.
+
+
+
+ fetcher.timelimit.mins
+ -1
+ This is the number of minutes allocated to the fetching.
+ Once this value is reached, any remaining entry from the input URL
+ list is skipped
+ and all active queues are emptied. The default value of -1 deactivates
+ the time limit.
+
+
+
+
+ fetcher.max.exceptions.per.queue
+ -1
+ The maximum number of protocol-level exceptions (e.g.
+ timeouts) per
+ host (or IP) queue. Once this value is reached, any remaining entries
+ from this
+ queue are purged, effectively stopping the fetching from this host/IP. The
+ default
+ value of -1 deactivates this limit.
+
+
+
+
+ fetcher.throughput.threshold.pages
+ -1
+ The threshold of minimum pages per second. If the fetcher
+ downloads fewer
+ pages per second than the configured threshold, the fetcher stops,
+ preventing slow queues
+ from stalling the throughput. This threshold must be an integer. This can
+ be useful when
+ fetcher.timelimit.mins is hard to determine. The default value of -1 disables this check.
+
+
+
+
+ fetcher.throughput.threshold.sequence
+ 5
+ The number of times the fetcher.throughput.threshold is
+ allowed to be exceeded,
+ in a row. This setting prevents accidental slow downs from stopping the
+ fetcher.
+
+
+
+
+ fetcher.throughput.threshold.check.after
+ 5
+ The number of minutes after which the throughput check is
+ enabled.
+
+
+
+ fetcher.queue.depth.multiplier
+ 50
+ (EXPERT) The fetcher buffers the incoming URLs into queues
+ based on the [host|domain|IP]
+ (see param fetcher.queue.mode). The depth of the queue is the number of
+ threads times the value of this parameter.
+ A large value requires more memory but can improve the performance of
+ the fetch when the order of the URLS in the fetch list
+ is not optimal.
+
+
+
+
+
+
+ indexingfilter.order
+
+ The order by which index filters are applied.
+ If empty, all available index filters (as dictated by properties
+ plugin-includes and plugin-excludes above) are loaded and applied in
+ system
+ defined order. If not empty, only named filters are loaded and applied
+ in given order. For example, if this property has value:
+ org.apache.nutch.indexer.basic.BasicIndexingFilter
+ org.apache.nutch.indexer.more.MoreIndexingFilter
+ then BasicIndexingFilter is applied first, and MoreIndexingFilter second.
+
+ Filter ordering might have impact on result if one filter depends on
+ output of
+ another filter.
+
+
+
+
+ indexer.score.power
+ 0.5
+
+ Used by the OPIC plugin. Determines the power of link analysis scores.
+ Each page's boost is set to
+ score^scorePower, where score is its link analysis score and scorePower is the
+ value of this parameter. This is compiled into indexes, so, when
+ this is changed, pages must be re-indexed for it to take
+ effect.
+
+
+
+
+
+
+ indexer.max.title.length
+ 100
+ The maximum number of characters of a title that are
+ indexed. A value of -1 disables this check.
+ Used by index-basic.
+
+
+
+
+
+
+ moreIndexingFilter.indexMimeTypeParts
+ true
+ Determines whether the index-more plugin will split the
+ mime-type
+ in sub parts, this requires the type field to be multi valued. Set to
+ true for backward
+ compatibility. False will not split the mime-type.
+
+
+
+
+
+
+ anchorIndexingFilter.deduplicate
+ false
+ With this enabled the indexer will case-insensitively
+ deduplicate anchors
+ before indexing. This prevents possible hundreds or thousands of identical
+ anchors for
+ a given page to be indexed but will affect the search scoring (i.e.
+ tf=1.0f).
+
+
+
+
+
+
+ urlnormalizer.order
+ org.apache.nutch.net.urlnormalizer.basic.BasicURLNormalizer
+ org.apache.nutch.net.urlnormalizer.regex.RegexURLNormalizer
+ Order in which normalizers will run. If any of these
+ isn't
+ activated it will be silently skipped. If other normalizers not on the
+ list are activated, they will run in random order after the ones
+ specified here are run.
+
+
+
+
+ urlnormalizer.regex.file
+ regex-normalize.xml
+ Name of the config file used by the RegexUrlNormalizer
+ class.
+
+
+
+
+ urlnormalizer.loop.count
+ 1
+ Optionally loop through normalizers several times, to
+ make
+ sure that all transformations have been performed.
+
+
+
+
+
+
+
+
+ mime.type.magic
+ true
+ Defines if the mime content type detector uses magic
+ resolution.
+
+
+
+
+
+
+ plugin.folders
+ plugins
+ Directories where nutch plugins are located. Each
+ element may be a relative or absolute path. If absolute, it is used
+ as is. If relative, it is searched for on the classpath.
+
+
+
+
+ plugin.auto-activation
+ true
+ Defines if some plugins that are not activated regarding
+ the plugin.includes and plugin.excludes properties must be
+ automatically
+ activated if they are needed by some active plugins.
+
+
+
+
+ plugin.includes
+ protocol-http|urlfilter-regex|parse-(html|tika)|index-(basic|anchor)|indexer-solr|urlnormalizer-(pass|regex|basic)|scoring-opic
+
+ Regular expression naming plugin directory names to
+ include. Any plugin not matching this expression is excluded.
+ In any case you need at least include the nutch-extensionpoints plugin.
+ By
+ default Nutch includes crawling just HTML and plain text via HTTP,
+ and basic indexing and search plugins. In order to use HTTPS please
+ enable
+ protocol-httpclient, but be aware of possible intermittent problems with the
+ underlying commons-httpclient library.
+
+
+
+
+ plugin.excludes
+
+ Regular expression naming plugin directory names to
+ exclude.
+
+
+
+
+
+
+ parse.plugin.file
+ parse-plugins.xml
+ The name of the file that defines the associations
+ between
+ content-types and parsers.
+
+
+
+
+ parser.character.encoding.default
+ windows-1252
+ The character encoding to fall back to when no other
+ information
+ is available.
+
+
+
+
+ encodingdetector.charset.min.confidence
+ -1
+ An integer between 0 and 100 indicating the minimum confidence
+ value
+ for charset auto-detection. Any negative value disables auto-detection.
+
+
+
+
+ parser.caching.forbidden.policy
+ content
+ If a site (or a page) requests through its robot metatags
+ that it should not be shown as cached content, apply this policy.
+ Currently
+ three keywords are recognized: "none" ignores any "noarchive" directives.
+ "content" doesn't show the content, but shows summaries (snippets).
+ "all" doesn't show either content or summaries.
+
+
+
+
+
+ parser.html.impl
+ neko
+ HTML Parser implementation. Currently the following
+ keywords
+ are recognized: "neko" uses NekoHTML, "tagsoup" uses TagSoup.
+
+
+
+
+ parser.html.form.use_action
+ false
+ If true, HTML parser will collect URLs from form action
+ attributes. This may lead to undesirable behavior (submitting empty
+ forms during next fetch cycle). If false, form action attribute will
+ be ignored.
+
+
+
+
+ parser.html.outlinks.ignore_tags
+
+ Comma separated list of HTML tags, from which outlinks
+ shouldn't be extracted. Nutch takes links from: a, area, form, frame,
+ iframe, script, link, img. If you add any of those tags here, it
+ won't be taken. Default is empty list. Probably reasonable value
+ for most people would be "img,script,link".
+
+
+
+
+ htmlparsefilter.order
+
+ The order by which HTMLParse filters are applied.
+ If empty, all available HTMLParse filters (as dictated by properties
+ plugin-includes and plugin-excludes above) are loaded and applied in
+ system
+ defined order. If not empty, only named filters are loaded and applied
+ in given order.
+ HTMLParse filter ordering MAY have an impact
+ on end result, as some filters could rely on the metadata generated by
+ a previous filter.
+
+
+
+
+ parser.timeout
+ 30
+ Timeout in seconds for the parsing of a document,
+ otherwise treats it as an exception and
+ moves on to the following documents. This parameter is applied to any
+ Parser implementation.
+ Set to -1 to deactivate, bearing in mind that this could cause
+ the parsing to crash because of a very long or corrupted document.
+
+
+
+
+ parser.skip.truncated
+ true
+ Boolean value for whether we should skip parsing for
+ truncated documents. By default this
+ property is activated due to extremely high levels of CPU which parsing can
+ sometimes take.
+
+
+
+
+
+
+ urlfilter.domain.file
+ domain-urlfilter.txt
+ Name of file on CLASSPATH containing either top level
+ domains or
+ hostnames used by urlfilter-domain (DomainURLFilter) plugin.
+
+
+
+
+ urlfilter.regex.file
+ regex-urlfilter.txt
+ Name of file on CLASSPATH containing regular expressions
+ used by urlfilter-regex (RegexURLFilter) plugin.
+
+
+
+
+ urlfilter.automaton.file
+ automaton-urlfilter.txt
+ Name of file on CLASSPATH containing regular expressions
+ used by urlfilter-automaton (AutomatonURLFilter) plugin.
+
+
+
+
+ urlfilter.prefix.file
+ prefix-urlfilter.txt
+ Name of file on CLASSPATH containing url prefixes
+ used by urlfilter-prefix (PrefixURLFilter) plugin.
+
+
+
+
+ urlfilter.suffix.file
+ suffix-urlfilter.txt
+ Name of file on CLASSPATH containing url suffixes
+ used by urlfilter-suffix (SuffixURLFilter) plugin.
+
+
+
+
+ urlfilter.order
+
+ The order by which url filters are applied.
+ If empty, all available url filters (as dictated by properties
+ plugin-includes and plugin-excludes above) are loaded and applied in
+ system
+ defined order. If not empty, only named filters are loaded and applied
+ in given order. For example, if this property has value:
+ org.apache.nutch.urlfilter.regex.RegexURLFilter
+ org.apache.nutch.urlfilter.prefix.PrefixURLFilter
+ then RegexURLFilter is applied first, and PrefixURLFilter second.
+ Since all filters are AND'ed, filter ordering does not have impact
+ on end result, but it may have performance implication, depending
+ on relative expensiveness of filters.
+
+
+
+
+
+
+ scoring.filter.order
+
+ The order in which scoring filters are applied.
+ This may be left empty (in which case all available scoring
+ filters will be applied in the order defined in plugin-includes
+ and plugin-excludes), or a space separated list of implementation
+ classes.
+
+
+
+
+
+
+ lang.ngram.min.length
+ 1
+ The minimum size of ngrams used to identify the
+ language (must be between 1 and lang.ngram.max.length).
+ The larger the range between lang.ngram.min.length and
+ lang.ngram.max.length, the better the identification, but
+ the slower it is.
+
+
+
+
+ lang.ngram.max.length
+ 4
+ The maximum size of ngrams used to identify the
+ language (must be between lang.ngram.min.length and 4).
+ The larger the range between lang.ngram.min.length and
+ lang.ngram.max.length, the better the identification, but
+ the slower it is.
+
+
+
+
+ lang.analyze.max.length
+ 2048
+ The maximum number of bytes of data used to identify
+ the language (0 means full content analysis).
+ The larger this value, the better the analysis, but the
+ slower it is.
+
+
+
+
+ lang.extraction.policy
+ detect,identify
+ This determines when the plugin uses detection and
+ statistical identification mechanisms. The order in which the
+ detect and identify are written will determine the extraction
+ policy. The default (detect,identify) means the plugin will
+ first try to extract language info from page headers and metadata;
+ if this is not successful, it will fall back to Tika's language
+ identification. Possible values are:
+ detect
+ identify
+ detect,identify
+ identify,detect
+
+
+
+
+ lang.identification.only.certain
+ false
+ If set to true with lang.extraction.policy containing identify,
+ the language code returned by Tika will be assigned to the document ONLY
+ if it is deemed certain by Tika.
+
+
+
+
+
+
+ hadoop.job.history.user.location
+ ${hadoop.log.dir}/history/user
+ Hadoop 0.17.x comes with a default setting to create
+ user logs inside the output path of the job. This breaks some
+ Hadoop classes, which expect the output to contain only
+ part-XXXXX files. This setting changes the output to a
+ subdirectory of the regular log directory.
+
+
+
+
+
+
+ solr.mapping.file
+ solrindex-mapping.xml
+
+ Defines the name of the file that will be used in the mapping of internal
+ nutch field names to solr index fields as specified in the target
+ Solr schema.
+
+
+
+
+ solr.commit.size
+ 250
+
+ Defines the number of documents to send to Solr in a single update batch.
+ Decrease when handling very large documents to prevent Nutch from running
+ out of memory. NOTE: It does not explicitly trigger a server side commit.
+
+
+
+
+ solr.commit.index
+ true
+
+ When closing the indexer, trigger a commit to the Solr server.
+
+
+
+
+ solr.auth
+ false
+
+ Whether to enable HTTP basic authentication for communicating with Solr.
+ Use the solr.auth.username and solr.auth.password properties to configure
+ your credentials.
+
+
+
+
+
+
+ elastic.host
+
+ The hostname to send documents to using TransportClient.
+ Either host and port, or cluster, must be defined.
+
+
+
+
+ elastic.port
+ 9300
+ The port to connect to using TransportClient.
+
+
+
+
+
+ elastic.cluster
+
+ The cluster name to discover. Either host and port,
+ or cluster, must be defined.
+
+
+
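+ These two settings select between the plugin's two client modes, as in
+ the plugin code later in this patch: with host and port set, a
+ TransportClient is used; otherwise a client node joins the named
+ cluster. Roughly:
+
+ if (host != null && port > 1) {
+ client = new TransportClient(settings)
+ .addTransportAddress(new InetSocketTransportAddress(host, port));
+ } else if (clusterName != null) {
+ node = nodeBuilder().settings(settings).client(true).node();
+ client = node.client();
+ }
+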
+
+ elastic.index
+ nutch
+
+ The name of the elasticsearch index. Will normally be autocreated if it
+ doesn't exist.
+
+
+
+
+ elastic.max.bulk.docs
+ 250
+
+ The number of docs in the batch that will trigger a flush to
+ elasticsearch.
+
+
+
+
+ elastic.max.bulk.size
+ 2500500
+
+ The total length of all indexed text in a batch that will trigger a
+ flush to Elasticsearch; the limit is checked after every document is added.
+
+
+
+
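+ For illustration, a minimal sketch of how elastic.max.bulk.docs and
+ elastic.max.bulk.size interact in the index writer added by this patch
+ (variable names taken from the plugin code below): a flush is triggered
+ as soon as either threshold is crossed.
+
+ if (bulkDocs >= maxBulkDocs || bulkLength >= maxBulkLength) {
+ commit(); // flush the current bulk request and start a new one
+ }
+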
+
+
+ storage.data.store.class
+ org.apache.gora.memory.store.MemStore
+ The Gora DataStore class for storing and retrieving data.
+ Currently the following stores are available:
+
+ org.apache.gora.sql.store.SqlStore
+ Default store. A DataStore implementation for RDBMS with a SQL interface.
+ SqlStore uses JDBC drivers to communicate with the DB. As explained in
+ ivy.xml, gora-core >= 0.3 is currently not backwards compatible with
+ SqlStore.
+
+ org.apache.gora.cassandra.store.CassandraStore
+ Gora class for storing data in Apache Cassandra.
+
+ org.apache.gora.hbase.store.HBaseStore
+ Gora class for storing data in Apache HBase.
+
+ org.apache.gora.accumulo.store.AccumuloStore
+ Gora class for storing data in Apache Accumulo.
+
+ org.apache.gora.avro.store.AvroStore
+ Gora class for storing data in Apache Avro.
+
+ org.apache.gora.avro.store.DataFileAvroStore
+ Gora class for storing data in Apache Avro. DataFileAvroStore is
+ a file based store which uses Avro's DataFile{Writer,Reader}'s as a
+ backend.
+ This datastore supports mapreduce.
+
+ org.apache.gora.memory.store.MemStore
+ Gora class for storing data in a Memory based implementation for tests.
+
+
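+ For example, to store crawl data in HBase instead of the in-memory test
+ store, override this property in nutch-site.xml:
+
+ storage.data.store.class
+ org.apache.gora.hbase.store.HBaseStore
+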
+
+
+ storage.schema.webpage
+ webpage
+ This value holds the schema name used for Nutch web db.
+ Note that Nutch ignores the value in the gora mapping files, and uses
+ this as the webpage schema name.
+
+
+
+
+ storage.schema.host
+ host
+ This value holds the schema name used for Nutch host db.
+ Note that Nutch ignores the value in the gora mapping files, and uses
+ this as the host schema name.
+
+
+
+
+ storage.crawl.id
+
+ This value helps differentiate between the datasets that
+ the jobs in the crawl cycle generate and operate on. The value will
+ be input to all the jobs, which then use it as a prefix when
+ accessing the schemas. The default configuration uses no id to prefix
+ the schemas. The value can also be given as a command line argument
+ to each job.
+
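+ For example (assuming the standard Nutch 2.x job options), a crawl id
+ can be passed per job instead of being set here:
+
+ bin/nutch generate -topN 1000 -crawlId webcrawl
+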
+
+
+
+ gora.buffer.read.limit
+ 10000
+ The maximum number of buffered Records we wish to
+ read in one batch. @see org.apache.gora.mapreduce.GoraRecordReader
+
+
+
+
+ gora.buffer.write.limit
+ 10000
+ Configures (for the Hadoop record writer) the maximum
+ number of
+ buffered Records we wish to regularly flush to the Gora datastore.
+ @see org.apache.gora.mapreduce.GoraRecordWriter.
+
+
diff --git src/plugin/build.xml src/plugin/build.xml
index 907c575..6394d7a 100755
--- src/plugin/build.xml
+++ src/plugin/build.xml
@@ -31,6 +31,7 @@
+
@@ -112,6 +113,7 @@
+
diff --git src/plugin/indexer-elastic/build.xml src/plugin/indexer-elastic/build.xml
new file mode 100644
index 0000000..9bd57b2
--- /dev/null
+++ src/plugin/indexer-elastic/build.xml
@@ -0,0 +1,22 @@
+
+
+
+
+
+
+
diff --git src/plugin/indexer-elastic/ivy.xml src/plugin/indexer-elastic/ivy.xml
new file mode 100644
index 0000000..d3c81e4
--- /dev/null
+++ src/plugin/indexer-elastic/ivy.xml
@@ -0,0 +1,37 @@
+
+
+
+
+
+
+
+
+
+ Apache Nutch
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git src/plugin/indexer-elastic/plugin.xml src/plugin/indexer-elastic/plugin.xml
new file mode 100644
index 0000000..46d2d34
--- /dev/null
+++ src/plugin/indexer-elastic/plugin.xml
@@ -0,0 +1,55 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticConstants.java src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticConstants.java
new file mode 100644
index 0000000..c313bf5
--- /dev/null
+++ src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticConstants.java
@@ -0,0 +1,28 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexwriter.elastic;
+
+public interface ElasticConstants {
+ public static final String ELASTIC_PREFIX = "elastic.";
+
+ public static final String HOST = ELASTIC_PREFIX + "host";
+ public static final String PORT = ELASTIC_PREFIX + "port";
+ public static final String CLUSTER = ELASTIC_PREFIX + "cluster";
+ public static final String INDEX = ELASTIC_PREFIX + "index";
+ public static final String MAX_BULK_DOCS = ELASTIC_PREFIX + "max.bulk.docs";
+ public static final String MAX_BULK_LENGTH = ELASTIC_PREFIX + "max.bulk.size";
+}
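+// Illustrative only: these constants resolve to the elastic.* keys defined
+// in nutch-default.xml above and are read through Hadoop's Configuration
+// API, e.g. (hypothetical snippet):
+//
+//   Configuration conf = ...;
+//   String cluster = conf.get(ElasticConstants.CLUSTER); // "elastic.cluster"
+//   int port = conf.getInt(ElasticConstants.PORT, 9300); // "elastic.port"
+//
+// Note that MAX_BULK_LENGTH maps to the key "elastic.max.bulk.size".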
diff --git src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java
new file mode 100644
index 0000000..2b30556
--- /dev/null
+++ src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java
@@ -0,0 +1,265 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.indexwriter.elastic;
+
+import static org.elasticsearch.node.NodeBuilder.nodeBuilder;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.commons.lang.StringUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.indexer.IndexWriter;
+import org.elasticsearch.ElasticSearchException;
+import org.elasticsearch.action.ListenableActionFuture;
+import org.elasticsearch.action.bulk.BulkItemResponse;
+import org.elasticsearch.action.bulk.BulkRequestBuilder;
+import org.elasticsearch.action.bulk.BulkResponse;
+import org.elasticsearch.action.delete.DeleteRequest;
+import org.elasticsearch.action.delete.DeleteRequestBuilder;
+import org.elasticsearch.action.index.IndexRequestBuilder;
+import org.elasticsearch.client.Client;
+import org.elasticsearch.common.settings.ImmutableSettings;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.common.settings.ImmutableSettings.Builder;
+import org.elasticsearch.client.transport.TransportClient;
+import org.elasticsearch.common.transport.InetSocketTransportAddress;
+import org.elasticsearch.node.Node;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Sends NutchDocuments to an Elasticsearch index, buffering them into
+ * bulk requests that are flushed asynchronously when configured limits
+ * are reached.
+ */
+public class ElasticIndexWriter implements IndexWriter {
+ public static final Logger LOG = LoggerFactory.getLogger(ElasticIndexWriter.class);
+
+ private static final int DEFAULT_MAX_BULK_DOCS = 250;
+ private static final int DEFAULT_MAX_BULK_LENGTH = 2500500;
+
+ private Client client;
+ private Node node;
+ private String defaultIndex;
+
+ private Configuration config;
+
+ private BulkRequestBuilder bulk;
+ private ListenableActionFuture<BulkResponse> execute;
+ private int port = -1;
+ private String host = null;
+ private String clusterName = null;
+ private int maxBulkDocs;
+ private int maxBulkLength;
+ private long indexedDocs = 0;
+ private int bulkDocs = 0;
+ private int bulkLength = 0;
+ private boolean createNewBulk = false;
+
+ @Override
+ public void open(Configuration job) throws IOException {
+ clusterName = job.get(ElasticConstants.CLUSTER);
+ host = job.get(ElasticConstants.HOST);
+ port = job.getInt(ElasticConstants.PORT, -1);
+
+ Builder settingsBuilder = ImmutableSettings.settingsBuilder();
+
+ // Read per-cluster overrides from elasticsearch.conf (key=value lines,
+ // '#' starts a comment)
+ BufferedReader reader = new BufferedReader(job.getConfResourceAsReader("elasticsearch.conf"));
+ String line;
+ String[] parts;
+
+ while ((line = reader.readLine()) != null) {
+ if (StringUtils.isNotBlank(line) && !line.startsWith("#")) {
+ // String.trim() returns a new string; the result must be assigned
+ line = line.trim();
+ parts = line.split("=");
+
+ if (parts.length == 2) {
+ settingsBuilder.put(parts[0].trim(), parts[1].trim());
+ }
+ }
+ }
+ reader.close();
+
+ // Set the cluster name (if configured) and build the settings
+ if (clusterName != null) {
+ settingsBuilder.put("cluster.name", clusterName);
+ }
+ Settings settings = settingsBuilder.build();
+
+ // Prefer TransportClient
+ if (host != null && port > 1) {
+ client = new TransportClient(settings).addTransportAddress(new InetSocketTransportAddress(host, port));
+ } else if (clusterName != null) {
+ node = nodeBuilder().settings(settings).client(true).node();
+ client = node.client();
+ }
+
+ bulk = client.prepareBulk();
+ defaultIndex = job.get(ElasticConstants.INDEX, "nutch");
+ maxBulkDocs = job.getInt(
+ ElasticConstants.MAX_BULK_DOCS, DEFAULT_MAX_BULK_DOCS);
+ maxBulkLength = job.getInt(
+ ElasticConstants.MAX_BULK_LENGTH, DEFAULT_MAX_BULK_LENGTH);
+ }
+
+ @Override
+ public void write(NutchDocument doc) throws IOException {
+ String id = (String)doc.getFieldValue("url");
+ String type = doc.getDocumentMeta().get("type");
+ if (type == null) type = "doc";
+ IndexRequestBuilder request = client.prepareIndex(defaultIndex, type, id);
+
+ Map<String, Object> source = new HashMap<String, Object>();
+
+ // Loop through all fields of this doc
+ for (String fieldName : doc.getFieldNames()) {
+ if (doc.getFieldValues(fieldName).size() > 1) {
+ // Store all values of a multi-valued field, not just the first one
+ source.put(fieldName, doc.getFieldValues(fieldName));
+ // Loop through the values to keep track of the size of this document
+ for (Object value : doc.getFieldValues(fieldName)) {
+ bulkLength += value.toString().length();
+ }
+ } else {
+ source.put(fieldName, doc.getFieldValue(fieldName));
+ bulkLength += doc.getFieldValue(fieldName).toString().length();
+ }
+ }
+ request.setSource(source);
+
+ // Add this indexing request to a bulk request
+ bulk.add(request);
+ indexedDocs++;
+ bulkDocs++;
+
+ if (bulkDocs >= maxBulkDocs || bulkLength >= maxBulkLength) {
+ LOG.info("Processing bulk request [docs = " + bulkDocs + ", length = "
+ + bulkLength + ", total docs = " + indexedDocs
+ + ", last doc in bulk = '" + id + "']");
+ // Flush the bulk of indexing requests
+ createNewBulk = true;
+ commit();
+ }
+ }
+
+ @Override
+ public void delete(String key) throws IOException {
+ try {
+ DeleteRequestBuilder builder = client.prepareDelete();
+ builder.setIndex(defaultIndex);
+ builder.setType("doc");
+ builder.setId(key);
+ builder.execute().actionGet();
+ } catch (ElasticSearchException e) {
+ throw makeIOException(e);
+ }
+ }
+
+ public static IOException makeIOException(ElasticSearchException e) {
+ final IOException ioe = new IOException();
+ ioe.initCause(e);
+ return ioe;
+ }
+
+ @Override
+ public void update(NutchDocument doc) throws IOException {
+ write(doc);
+ }
+
+ @Override
+ public void commit() throws IOException {
+ if (execute != null) {
+ // wait for previous to finish
+ long beforeWait = System.currentTimeMillis();
+ BulkResponse actionGet = execute.actionGet();
+ if (actionGet.hasFailures()) {
+ for (BulkItemResponse item : actionGet) {
+ if (item.isFailed()) {
+ throw new RuntimeException("First failure in bulk: "
+ + item.getFailureMessage());
+ }
+ }
+ }
+ long msWaited = System.currentTimeMillis() - beforeWait;
+ LOG.info("Previous took in ms " + actionGet.getTookInMillis()
+ + ", including wait " + msWaited);
+ execute = null;
+ }
+ if (bulk != null) {
+ if (bulkDocs > 0) {
+ // start a flush, note that this is an asynchronous call
+ execute = bulk.execute();
+ }
+ bulk = null;
+ }
+ if (createNewBulk) {
+ // Prepare a new bulk request
+ bulk = client.prepareBulk();
+ bulkDocs = 0;
+ bulkLength = 0;
+ }
+ }
+
+ @Override
+ public void close() throws IOException {
+ // Flush pending requests
+ LOG.info("Processing remaining requests [docs = " + bulkDocs
+ + ", length = " + bulkLength + ", total docs = " + indexedDocs + "]");
+ createNewBulk = false;
+ commit();
+ // flush one more time to wait for the last asynchronous bulk to finish
+ LOG.info("Processing remaining requests to finalize the last bulk");
+ createNewBulk = false;
+ commit();
+
+ // Close
+ client.close();
+ if (node != null) {
+ node.close();
+ }
+ }
+
+ @Override
+ public String describe() {
+ StringBuilder sb = new StringBuilder("ElasticIndexWriter\n");
+ sb.append("\t").append(ElasticConstants.CLUSTER).append(" : elastic cluster name\n");
+ sb.append("\t").append(ElasticConstants.HOST).append(" : hostname\n");
+ sb.append("\t").append(ElasticConstants.PORT).append(" : port\n");
+ sb.append("\t").append(ElasticConstants.INDEX).append(" : elastic index name\n");
+ sb.append("\t").append(ElasticConstants.MAX_BULK_DOCS).append(" : elastic bulk index doc count (default 250)\n");
+ sb.append("\t").append(ElasticConstants.MAX_BULK_LENGTH).append(" : elastic bulk index length in bytes (default 2500500, ~2.5MB)\n");
+ return sb.toString();
+ }
+
+ @Override
+ public void setConf(Configuration conf) {
+ config = conf;
+ String cluster = conf.get(ElasticConstants.CLUSTER);
+ if (cluster == null) {
+ String message = "Missing elastic.cluster. Should be set in nutch-site.xml";
+ message += "\n" + describe();
+ LOG.error(message);
+ throw new RuntimeException(message);
+ }
+ }
+
+ @Override
+ public Configuration getConf() {
+ return config;
+ }
+}
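+// Illustrative lifecycle sketch (assumptions: run outside the normal
+// IndexingJob pipeline; NutchDocument.add(String, Object) as in Nutch 2.x):
+//
+//   Configuration conf = new Configuration();
+//   conf.set("elastic.cluster", "elasticsearch"); // required by setConf()
+//   ElasticIndexWriter writer = new ElasticIndexWriter();
+//   writer.setConf(conf);
+//   writer.open(conf);
+//
+//   NutchDocument doc = new NutchDocument();
+//   doc.add("url", "http://example.org/");
+//   writer.write(doc);  // buffered; flushed when a bulk limit is reached
+//   writer.close();     // flushes any remaining buffered documents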