Index: src/test/org/apache/nutch/util/TestURLUtil.java =================================================================== --- src/test/org/apache/nutch/util/TestURLUtil.java (revision 1181747) +++ src/test/org/apache/nutch/util/TestURLUtil.java (working copy) @@ -245,16 +245,22 @@ // test NUTCH-436 URL u436 = new URL("http://a/b/c/d;p?q#f"); assertEquals("http://a/b/c/d;p?q#f", u436.toString()); - URL abs = URLUtil.resolveURL(u436, "?y"); + // removeBaseParams = false: the ;p section of the base must survive + URL abs = URLUtil.resolveURL(u436, "?y", false); assertEquals("http://a/b/c/d;p?y", abs.toString()); // test NUTCH-566 URL u566 = new URL("http://www.fleurie.org/entreprise.asp"); - abs = URLUtil.resolveURL(u566, "?id_entrep=111"); + abs = URLUtil.resolveURL(u566, "?id_entrep=111", false); assertEquals("http://www.fleurie.org/entreprise.asp?id_entrep=111", abs.toString()); + // test NUTCH-1115: with removeBaseParams = true the ;ROOOWAN section of the base is dropped + URL u1115 = new URL("http://www.funkybabes.nl/;ROOOWAN/fotoboek"); + abs = URLUtil.resolveURL(u1115, "forumregels", true); + assertEquals("http://www.funkybabes.nl/forumregels", abs.toString()); URL base = new URL(baseString); assertEquals("base url parsing", baseString, base.toString()); + // resolve every entry of targets against base with removeBaseParams = false for (int i = 0; i < targets.length; i++) { - URL u = URLUtil.resolveURL(base, targets[i][0]); + URL u = URLUtil.resolveURL(base, targets[i][0], false); assertEquals(targets[i][1], targets[i][1], u.toString()); } } Index: src/java/org/apache/nutch/util/URLUtil.java =================================================================== --- src/java/org/apache/nutch/util/URLUtil.java (revision 1181747) +++ src/java/org/apache/nutch/util/URLUtil.java (working copy) @@ -33,19 +33,43 @@ * targets. * @param base base url * @param target target url (may be relative) + * @param removeBaseParams if true, ";param" sections are removed from the + * base url before resolving; otherwise they are kept and the result depends on a buggy {@link URL} behavior. * @return resolved absolute url.
* @throws MalformedURLException */ - public static URL resolveURL(URL base, String target) + public static URL resolveURL(URL base, String target, boolean removeBaseParams) throws MalformedURLException { target = target.trim(); - /* this is probably not needed anymore - see NUTCH-797. - // handle params that are embedded into the base url - move them to target - // so URL class constructs the new url class properly - if (base.toString().indexOf(';') > 0) - return fixEmbeddedParams(base, target); - */ + // NUTCH-1115: strip the ";param" section from the base url before resolving; NOTE(review): the end index used for baseURL below is one less than startParams, so the character before ';' is dropped too (e.g. the 'd' of http://a/b/c/d;p?q), confirm this is intended + if (removeBaseParams && base.toString().indexOf(';') > 0) { + // split at the first ';': params ends at the first '?', '#', '/' or '&' and query is everything from there on; the break only exits the switch, the scan stops because params has just been shortened to length i + String baseURL = base.toString(); + int startParams = baseURL.indexOf(';'); + String params = baseURL.substring(startParams); + String query = ""; + for (int i = 0; i < params.length(); i++) { + char c = params.charAt(i); + switch (c) { + case '?': + case '#': + case '/': + case '&': + query = params.substring(i); + params = params.substring(0, i); + break; + } + } + baseURL = baseURL.substring(0, startParams - 1) + query; + base = new URL(baseURL); + // a target that starts with '?' is a pure query: resolve it right away + // against the rewritten base; any other target falls through to the code below + int startQS = target.indexOf('?'); + if (startQS == 0) { + return fixPureQueryTargets(base, target); + } + } // handle the case that there is a target that is a pure query, // for example @@ -68,67 +92,11 @@ /** Handle the case in RFC3986 section 5.4.1 example 7, and similar.
*/ static URL fixPureQueryTargets(URL base, String target) throws MalformedURLException { - if (!target.startsWith("?")) return new URL(base, target); - String basePath = base.getPath(); - String baseRightMost = ""; - int baseRightMostIdx = basePath.lastIndexOf("/"); - if (baseRightMostIdx != -1) { - baseRightMost = basePath.substring(baseRightMostIdx + 1); - } - - if (target.startsWith("?")) target = baseRightMost + target; - - return new URL(base, target); + // rebuild the url from the base's protocol, host, port and path with target appended (the same method as Tika HtmlHandler); NOTE(review): the startsWith("?") guard is gone, so a target that is not a pure query would be appended to the path as well, confirm every caller checks first + return new URL(base.getProtocol(), base.getHost(), base.getPort(), base.getPath() + target); } - /** - * Handles cases where the url param information is encoded into the base url - * as opposed to the target. - *
- If the taget contains params (i.e. ';xxxx') information then the target - params information is assumed to be correct and any base params information - is ignored. If the base contains params information but the tareget does - not, then the params information is moved to the target allowing it to be - correctly determined by the java.net.URL class. - * - * @param base - * The base URL. - * @param target - * The target path from the base URL. - * - * @return URL A URL with the params information correctly encoded. - * - * @throws MalformedURLException - * If the url is not a well formed URL. - */ - private static URL fixEmbeddedParams(URL base, String target) - throws MalformedURLException { - - // the target contains params information or the base doesn't then no - // conversion necessary, return regular URL - if (target.indexOf(';') >= 0 || base.toString().indexOf(';') == -1) { - return new URL(base, target); - } - - // get the base url and it params information - String baseURL = base.toString(); - int startParams = baseURL.indexOf(';'); - String params = baseURL.substring(startParams); - - // if the target has a query string then put the params information after - // any path but before the query string, otherwise just append to the path - int startQS = target.indexOf('?'); - if (startQS >= 0) { - target = target.substring(0, startQS) + params - + target.substring(startQS); - } else { - target += params; - } - - return new URL(base, target); - } - private static Pattern IP_PATTERN = Pattern.compile("(\\d{1,3}\\.){3}(\\d{1,3})"); /** Returns the domain name of the url.
The domain name of a url is Index: src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java =================================================================== --- src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java (revision 1181747) +++ src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java (working copy) @@ -39,6 +39,8 @@ * */ class DOMContentUtils { + + private boolean removeEmbeddedParams; private static class LinkParams { private String elName; @@ -88,6 +90,9 @@ if ( ! forceTags.contains(ignoreTags[i]) ) linkParams.remove(ignoreTags[i]); } + + // whether ";param" sections are stripped from the base url when resolving outlinks (default true), see https://issues.apache.org/jira/browse/NUTCH-1115 + removeEmbeddedParams = conf.getBoolean("parser.fix.embeddedparams", true); } /** @@ -356,7 +361,7 @@ if (target != null && !noFollow && !post) try { - URL url = URLUtil.resolveURL(base, target); + URL url = URLUtil.resolveURL(base, target, removeEmbeddedParams); outlinks.add(new Outlink(url.toString(), linkText.toString().trim())); } catch (MalformedURLException e) { Index: src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java =================================================================== --- src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java (revision 1181747) +++ src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java (working copy) @@ -231,6 +231,7 @@ private static void setup() { conf = NutchConfiguration.create(); conf.setBoolean("parser.html.form.use_action", true); + conf.setBoolean("parser.fix.embeddedparams", false); utils = new DOMContentUtils(conf); DOMFragmentParser parser= new DOMFragmentParser(); for (int i= 0; i < testPages.length; i++) { Index: src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java =================================================================== --- src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java (revision
1181747) +++ src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java (working copy) @@ -39,6 +39,8 @@ * */ public class DOMContentUtils { + + private boolean removeEmbeddedParams; public static class LinkParams { public String elName; @@ -88,6 +90,9 @@ if ( ! forceTags.contains(ignoreTags[i]) ) linkParams.remove(ignoreTags[i]); } + + // whether ";param" sections are stripped from the base url when resolving outlinks (default true), see https://issues.apache.org/jira/browse/NUTCH-1115 + removeEmbeddedParams = conf.getBoolean("parser.fix.embeddedparams", true); } /** @@ -385,7 +390,7 @@ if (target != null && !noFollow && !post) try { - URL url = URLUtil.resolveURL(base, target); + URL url = URLUtil.resolveURL(base, target, removeEmbeddedParams); outlinks.add(new Outlink(url.toString(), linkText.toString().trim())); } catch (MalformedURLException e) {