Index: src/test/org/apache/nutch/util/TestURLUtil.java
===================================================================
--- src/test/org/apache/nutch/util/TestURLUtil.java (revision 923970)
+++ src/test/org/apache/nutch/util/TestURLUtil.java (working copy)
@@ -213,4 +213,50 @@
     assertEquals(aDotCom, URLUtil.chooseRepr(aDotCom, aSubDotCom, true));
   }
 
+  // from RFC3986 section 5.4.1
+  private static String baseString = "http://a/b/c/d;p?q";
+  private static String[][] targets = new String[][] {
+    // unknown protocol {"g:h" , "g:h"},
+    {"g"       , "http://a/b/c/g"},
+    {"./g"     , "http://a/b/c/g"},
+    {"g/"      , "http://a/b/c/g/"},
+    {"/g"      , "http://a/g"},
+    {"//g"     , "http://g"},
+    {"?y"      , "http://a/b/c/d;p?y"},
+    {"g?y"     , "http://a/b/c/g?y"},
+    {"#s"      , "http://a/b/c/d;p?q#s"},
+    {"g#s"     , "http://a/b/c/g#s"},
+    {"g?y#s"   , "http://a/b/c/g?y#s"},
+    {";x"      , "http://a/b/c/;x"},
+    {"g;x"     , "http://a/b/c/g;x"},
+    {"g;x?y#s" , "http://a/b/c/g;x?y#s"},
+    {""        , "http://a/b/c/d;p?q"},
+    {"."       , "http://a/b/c/"},
+    {"./"      , "http://a/b/c/"},
+    {".."      , "http://a/b/"},
+    {"../"     , "http://a/b/"},
+    {"../g"    , "http://a/b/g"},
+    {"../.."   , "http://a/"},
+    {"../../"  , "http://a/"},
+    {"../../g" , "http://a/g"}
+  };
+
+  public void testResolveURL() throws Exception {
+    // test NUTCH-436
+    URL u436 = new URL("http://a/b/c/d;p?q#f");
+    assertEquals("http://a/b/c/d;p?q#f", u436.toString());
+    URL abs = URLUtil.resolveURL(u436, "?y");
+    assertEquals("http://a/b/c/d;p?y", abs.toString());
+    // test NUTCH-566
+    URL u566 = new URL("http://www.fleurie.org/entreprise.asp");
+    abs = URLUtil.resolveURL(u566, "?id_entrep=111");
+    assertEquals("http://www.fleurie.org/entreprise.asp?id_entrep=111", abs.toString());
+    URL base = new URL(baseString);
+    assertEquals("base url parsing", baseString, base.toString());
+    for (int i = 0; i < targets.length; i++) {
+      URL u = URLUtil.resolveURL(base, targets[i][0]);
+      assertEquals("resolving '" + targets[i][0] + "'", targets[i][1], u.toString());
+    }
+  }
+
 }
Index: src/java/org/apache/nutch/util/URLUtil.java
===================================================================
--- src/java/org/apache/nutch/util/URLUtil.java (revision 923970)
+++ src/java/org/apache/nutch/util/URLUtil.java (working copy)
@@ -26,7 +26,107 @@
 
 /** Utility class for URL analysis */
 public class URLUtil {
+
+  /**
+   * Resolve relative URL-s and fix a few java.net.URL errors
+   * in handling of URLs with embedded params and pure query
+   * targets.
+   * @param base base url
+   * @param target target url (may be relative)
+   * @return resolved absolute url.
+   * @throws MalformedURLException if the target cannot be resolved to a valid URL
+   */
+  public static URL resolveURL(URL base, String target)
+          throws MalformedURLException {
+    /* this is probably not needed anymore - see NUTCH-797.
+    // handle params that are embedded into the base url - move them to target
+    // so URL class constructs the new url class properly
+    if (base.toString().indexOf(';') > 0)
+      return fixEmbeddedParams(base, target);
+    */
+
+    // handle the case that there is a target that is a pure query,
+    // for example
+    // http://careers3.accenture.com/Careers/ASPX/Search.aspx?co=0&sk=0
+    // It has urls in the page of the form href="?co=0&sk=0&pg=1", and by
+    // default
+    // URL constructs the base+target combo as
+    // http://careers3.accenture.com/Careers/ASPX/?co=0&sk=0&pg=1, incorrectly
+    // dropping the Search.aspx target
+    //
+    // Browsers handle these just fine, they must have an exception similar to
+    // this
+    if (target.startsWith("?")) {
+      return fixPureQueryTargets(base, target);
+    }
+    return new URL(base, target);
+  }
+
+  /** Handle the case in RFC3986 section 5.4.1 example 7, and similar. */
+  static URL fixPureQueryTargets(URL base, String target)
+          throws MalformedURLException {
+    if (!target.startsWith("?")) return new URL(base, target);
+
+    String basePath = base.getPath();
+    String baseRightMost = "";
+    int baseRightMostIdx = basePath.lastIndexOf("/");
+    if (baseRightMostIdx != -1) {
+      baseRightMost = basePath.substring(baseRightMostIdx + 1);
+    }
+
+    target = baseRightMost + target;
+
+    return new URL(base, target);
+  }
+
+  /**
+   * Handles cases where the url param information is encoded into the base url
+   * as opposed to the target.
+   * <p>
+   * If the target contains params (i.e. ';xxxx') information then the target
+   * params information is assumed to be correct and any base params information
+   * is ignored. If the base contains params information but the target does
+   * not, then the params information is moved to the target allowing it to be
+   * correctly determined by the java.net.URL class.
+   *
+   * @param base
+   *          The base URL.
+   * @param target
+   *          The target path from the base URL.
+   *
+   * @return URL A URL with the params information correctly encoded.
+   *
+   * @throws MalformedURLException
+   *           If the url is not a well formed URL.
+   */
+  private static URL fixEmbeddedParams(URL base, String target)
+          throws MalformedURLException {
+
+    // the target contains params information or the base doesn't then no
+    // conversion necessary, return regular URL
+    if (target.indexOf(';') >= 0 || base.toString().indexOf(';') == -1) {
+      return new URL(base, target);
+    }
+
+    // get the base url and its params information
+    String baseURL = base.toString();
+    int startParams = baseURL.indexOf(';');
+    String params = baseURL.substring(startParams);
+
+    // if the target has a query string then put the params information after
+    // any path but before the query string, otherwise just append to the path
+    int startQS = target.indexOf('?');
+    if (startQS >= 0) {
+      target = target.substring(0, startQS) + params
+          + target.substring(startQS);
+    } else {
+      target += params;
+    }
+
+    return new URL(base, target);
+  }
+
 
   private static Pattern IP_PATTERN = Pattern.compile("(\\d{1,3}\\.){3}(\\d{1,3})");
 
   /** Returns the domain name of the url. The domain name of a url is
Index: src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java
===================================================================
--- src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java (revision 923970)
+++ src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java (working copy)
@@ -26,6 +26,7 @@
 import org.apache.hadoop.conf.Configuration;
 import org.apache.nutch.parse.Outlink;
 import org.apache.nutch.util.NodeWalker;
+import org.apache.nutch.util.URLUtil;
 import org.w3c.dom.NamedNodeMap;
 import org.w3c.dom.Node;
 import org.w3c.dom.NodeList;
@@ -300,51 +301,6 @@
   }
 
   /**
-   * Handles cases where the url param information is encoded into the base
-   * url as opposed to the target.
-   * <p>
- * If the taget contains params (i.e. ';xxxx') information then the target
- * params information is assumed to be correct and any base params information
- * is ignored. If the base contains params information but the tareget does
- * not, then the params information is moved to the target allowing it to be
- * correctly determined by the java.net.URL class.
- *
- * @param base The base URL.
- * @param target The target path from the base URL.
- *
- * @return URL A URL with the params information correctly encoded.
- *
- * @throws MalformedURLException If the url is not a well formed URL.
- */
- private URL fixEmbeddedParams(URL base, String target)
- throws MalformedURLException{
-
- // the target contains params information or the base doesn't then no
- // conversion necessary, return regular URL
- if (target.indexOf(';') >= 0 || base.toString().indexOf(';') == -1) {
- return new URL(base, target);
- }
-
- // get the base url and it params information
- String baseURL = base.toString();
- int startParams = baseURL.indexOf(';');
- String params = baseURL.substring(startParams);
-
- // if the target has a query string then put the params information after
- // any path but before the query string, otherwise just append to the path
- int startQS = target.indexOf('?');
- if (startQS >= 0) {
- target = target.substring(0, startQS) + params +
- target.substring(startQS);
- }
- else {
- target += params;
- }
-
- return new URL(base, target);
- }
-
- /**
* This method finds all anchors below the supplied DOM
* <code>node</code>, and creates appropriate {@link Outlink}
* records for each (relative to the supplied <code>base</code>
@@ -400,8 +356,7 @@
if (target != null && !noFollow && !post)
try {
- URL url = (base.toString().indexOf(';') > 0) ?
- fixEmbeddedParams(base, target) : new URL(base, target);
+ URL url = URLUtil.resolveURL(base, target);
outlinks.add(new Outlink(url.toString(),
linkText.toString().trim()));
} catch (MalformedURLException e) {
Index: src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
===================================================================
--- src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java (revision 923970)
+++ src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java (working copy)
@@ -26,6 +26,7 @@
import org.apache.nutch.parse.Outlink;
import org.apache.nutch.util.NodeWalker;
+import org.apache.nutch.util.URLUtil;
import org.apache.hadoop.conf.Configuration;
import org.w3c.dom.*;
@@ -316,33 +317,6 @@
*
* @throws MalformedURLException If the url is not a well formed URL.
*/
- private URL fixEmbeddedParams(URL base, String target)
- throws MalformedURLException{
-
- // the target contains params information or the base doesn't then no
- // conversion necessary, return regular URL
- if (target.indexOf(';') >= 0 || base.toString().indexOf(';') == -1) {
- return new URL(base, target);
- }
-
- // get the base url and it params information
- String baseURL = base.toString();
- int startParams = baseURL.indexOf(';');
- String params = baseURL.substring(startParams);
-
- // if the target has a query string then put the params information after
- // any path but before the query string, otherwise just append to the path
- int startQS = target.indexOf('?');
- if (startQS >= 0) {
- target = target.substring(0, startQS) + params +
- target.substring(startQS);
- }
- else {
- target += params;
- }
-
- return new URL(base, target);
- }
/**
* This method finds all anchors below the supplied DOM
@@ -400,8 +374,7 @@
if (target != null && !noFollow && !post)
try {
- URL url = (base.toString().indexOf(';') > 0) ?
- fixEmbeddedParams(base, target) : new URL(base, target);
+ URL url = URLUtil.resolveURL(base, target);
outlinks.add(new Outlink(url.toString(),
linkText.toString().trim()));
} catch (MalformedURLException e) {