diff --git build.xml build.xml
index f0c752b..733de6d 100644
--- build.xml
+++ build.xml
@@ -161,6 +161,8 @@
This package contains classes for domain analysis.
+Apache Nutch is an open source web-search software project.
+Apache Nutch is a highly extensible and scalable open source web crawler software project.
Nutch is a project of the Apache Software Foundation and is part of the larger Apache community of developers and users.
diff --git src/plugin/feed/src/java/org/apache/nutch/indexer/feed/package-info.java src/plugin/feed/src/java/org/apache/nutch/indexer/feed/package-info.java new file mode 100644 index 0000000..f72340a --- /dev/null +++ src/plugin/feed/src/java/org/apache/nutch/indexer/feed/package-info.java @@ -0,0 +1,21 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Indexing filter to index meta data from RSS feeds. + */ +package org.apache.nutch.indexer.feed; diff --git src/plugin/feed/src/java/org/apache/nutch/parse/feed/package-info.java src/plugin/feed/src/java/org/apache/nutch/parse/feed/package-info.java new file mode 100644 index 0000000..a9b3603 --- /dev/null +++ src/plugin/feed/src/java/org/apache/nutch/parse/feed/package-info.java @@ -0,0 +1,21 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Parse RSS feeds. + */ +package org.apache.nutch.parse.feed; diff --git src/plugin/headings/src/java/org/apache/nutch/parse/headings/package-info.java src/plugin/headings/src/java/org/apache/nutch/parse/headings/package-info.java new file mode 100644 index 0000000..4c073dc --- /dev/null +++ src/plugin/headings/src/java/org/apache/nutch/parse/headings/package-info.java @@ -0,0 +1,21 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Parse filter to extract headings (h1, h2, etc.) from DOM parse tree. 
+ */ +package org.apache.nutch.parse.headings; diff --git src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/package.html src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/package.html index 524d17e..3fae405 100644 --- src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/package.html +++ src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/package.html @@ -1,5 +1,5 @@ -A basic indexing plugin.
+A basic indexing plugin that adds basic fields: url, host, title, content, etc.
diff --git src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/package-info.java src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/package-info.java new file mode 100644 index 0000000..d177001 --- /dev/null +++ src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/package-info.java @@ -0,0 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Indexing filter to add document metadata to the index. + * Metadata may come from CrawlDb, parse or content metadata. + */ +package org.apache.nutch.indexer.metadata; diff --git src/plugin/index-more/src/java/org/apache/nutch/indexer/more/package.html src/plugin/index-more/src/java/org/apache/nutch/indexer/more/package.html index 9ae5f6d..7b8fade 100644 --- src/plugin/index-more/src/java/org/apache/nutch/indexer/more/package.html +++ src/plugin/index-more/src/java/org/apache/nutch/indexer/more/package.html @@ -1,5 +1,6 @@ -A more indexing plugin.
+An indexing plugin that adds "more" index fields: +last modified date, MIME type, content length.
diff --git src/plugin/indexer-dummy/src/java/org/apache/nutch/indexwriter/dummy/package-info.java src/plugin/indexer-dummy/src/java/org/apache/nutch/indexwriter/dummy/package-info.java new file mode 100644 index 0000000..181b186 --- /dev/null +++ src/plugin/indexer-dummy/src/java/org/apache/nutch/indexwriter/dummy/package-info.java @@ -0,0 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Index writer plugin for debugging, writes pairs of <action, url> to a + * text file, action is one of "add", "update", or "delete". + */ +package org.apache.nutch.indexwriter.dummy; diff --git src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/package-info.java src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/package-info.java new file mode 100644 index 0000000..c86c2f1 --- /dev/null +++ src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/package-info.java @@ -0,0 +1,21 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Index writer plugin for Elasticsearch. + */ +package org.apache.nutch.indexwriter.elastic; diff --git src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/package-info.java src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/package-info.java new file mode 100644 index 0000000..2cebf7c --- /dev/null +++ src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/package-info.java @@ -0,0 +1,21 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Index writer plugin for Apache Solr. 
+ */ +package org.apache.nutch.indexwriter.solr; diff --git src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/package-info.java src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/package-info.java new file mode 100644 index 0000000..04cf2d2 --- /dev/null +++ src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/package-info.java @@ -0,0 +1,21 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Parse wrapper to run external command to do the parsing. + */ +package org.apache.nutch.parse.ext; diff --git src/plugin/parse-js/src/java/org/apache/nutch/package.html src/plugin/parse-js/src/java/org/apache/nutch/package.html deleted file mode 100644 index 7dffe62..0000000 --- src/plugin/parse-js/src/java/org/apache/nutch/package.html +++ /dev/null @@ -1,6 +0,0 @@ - - -A parser plugin and content filter to extract all (possible) links -from JavaScript files and code snippets.
- - diff --git src/plugin/parse-js/src/java/org/apache/nutch/parse/js/package-info.java src/plugin/parse-js/src/java/org/apache/nutch/parse/js/package-info.java new file mode 100644 index 0000000..3f5ce52 --- /dev/null +++ src/plugin/parse-js/src/java/org/apache/nutch/parse/js/package-info.java @@ -0,0 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Parser and parse filter plugin to extract all (possible) links + * from JavaScript files and embedded JavaScript code snippets. + */ +package org.apache.nutch.parse.js; diff --git src/plugin/parse-metatags/plugin.xml src/plugin/parse-metatags/plugin.xml index b651622..07933fa 100644 --- src/plugin/parse-metatags/plugin.xml +++ src/plugin/parse-metatags/plugin.xml @@ -15,7 +15,7 @@ name="MetaTags Parser" point="org.apache.nutch.parse.HtmlParseFilter">-A url filter plugin based on +URL filter plugin based on dk.brics.automaton Finite-State Automata for JavaTM.
diff --git src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/package-info.java src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/package-info.java new file mode 100644 index 0000000..d2eba1f --- /dev/null +++ src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/package-info.java @@ -0,0 +1,25 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * URL filter plugin to include only URLs which match an element in a given list of + * domain suffixes, domain names, and/or host names. + * See {@link org.apache.nutch.urlfilter.domainblacklist} for the counterpart + * (exclude URLs by host or domain). + */ +package org.apache.nutch.urlfilter.domain; + diff --git src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/package.html src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/package.html deleted file mode 100644 index d427754..0000000 --- src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/package.html +++ /dev/null @@ -1,5 +0,0 @@ - - -A url filter plugin that filters by domain.
- - diff --git src/plugin/urlfilter-domainblacklist/src/java/org/apache/nutch/urlfilter/domainblacklist/package-info.java src/plugin/urlfilter-domainblacklist/src/java/org/apache/nutch/urlfilter/domainblacklist/package-info.java new file mode 100644 index 0000000..1f0022c --- /dev/null +++ src/plugin/urlfilter-domainblacklist/src/java/org/apache/nutch/urlfilter/domainblacklist/package-info.java @@ -0,0 +1,24 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * URL filter plugin to exclude URLs by domain suffixes, domain names, and/or host names. + * See {@link org.apache.nutch.urlfilter.domain} for the counterpart (include only URLs + * matching host or domain). + */ +package org.apache.nutch.urlfilter.domainblacklist; + diff --git src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/package.html src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/package.html index ca336b5..dbed0be 100644 --- src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/package.html +++ src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/package.html @@ -1,5 +1,5 @@ -A url filter plugin.
+URL filter plugin to include only URLs which match one of a given list of URL prefixes.
diff --git src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/package.html src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/package.html index ca336b5..7acf73b 100644 --- src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/package.html +++ src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/package.html @@ -1,5 +1,5 @@ -A url filter plugin.
+URL filter plugin to include and/or exclude URLs matching Java regular expressions.
diff --git src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/package-info.java src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/package-info.java new file mode 100644 index 0000000..0449acc --- /dev/null +++ src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/package-info.java @@ -0,0 +1,23 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * URL filter plugin to either exclude or include only URLs which match + * one of the given (path) suffixes. + */ +package org.apache.nutch.urlfilter.suffix; + diff --git src/plugin/urlfilter-validator/src/java/org/apache/nutch/urlfilter/validator/package.html src/plugin/urlfilter-validator/src/java/org/apache/nutch/urlfilter/validator/package.html index 845696f..b5ec8a1 100644 --- src/plugin/urlfilter-validator/src/java/org/apache/nutch/urlfilter/validator/package.html +++ src/plugin/urlfilter-validator/src/java/org/apache/nutch/urlfilter/validator/package.html @@ -1,6 +1,6 @@ -A url filter plugin that validates given urls.
+URL filter plugin that validates given URLs.
This plugin runs a series of tests for the given url to make sure that given url is valid and 'fetchable'.
Note: This plugin should only be used for web-related protocols such diff --git src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java index 8d151bd..09f1ebe 100644 --- src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java +++ src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java @@ -31,7 +31,13 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configured; import org.apache.oro.text.regex.*; -/** Converts URLs to a normal form . */ +/** + * Converts URLs to a normal form: + *
/./
or /../
http://