Index: src/java/org/apache/nutch/parse/OutlinkExtractor.java
===================================================================
--- src/java/org/apache/nutch/parse/OutlinkExtractor.java	(revision 1148269)
+++ src/java/org/apache/nutch/parse/OutlinkExtractor.java	(working copy)
@@ -25,13 +25,7 @@
 import org.apache.commons.logging.LogFactory;
 
 import org.apache.hadoop.conf.Configuration;
-import org.apache.oro.text.regex.MatchResult;
-import org.apache.oro.text.regex.Pattern;
-import org.apache.oro.text.regex.PatternCompiler;
-import org.apache.oro.text.regex.PatternMatcher;
-import org.apache.oro.text.regex.PatternMatcherInput;
-import org.apache.oro.text.regex.Perl5Compiler;
-import org.apache.oro.text.regex.Perl5Matcher;
+import java.util.regex.*;
 
 /**
  * Extractor to extract {@link org.apache.nutch.parse.Outlink}s 
@@ -60,6 +54,8 @@
   private static final String URL_PATTERN = 
     "([A-Za-z][A-Za-z0-9+.-]{1,120}:[A-Za-z0-9/](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2}){1,333}(#([a-zA-Z0-9][a-zA-Z0-9$_.+!*,;/?:@&~=%-]{0,1000}))?)";
 
+  private static Pattern pattern = Pattern.compile(URL_PATTERN, Pattern.MULTILINE | Pattern.CASE_INSENSITIVE); // compiled eagerly: the three-argument getOutlinks() uses it without a null check
+
   /**
    * Extracts <code>Outlink</code> from given plain text.
    * Applying this method to non-plain-text can result in extremely lengthy
@@ -69,36 +65,42 @@
    * @return Array of <code>Outlink</code>s within found in plainText
    */
   public static Outlink[] getOutlinks(final String plainText, Configuration conf) {
+    // Compile the shared URL pattern on demand if it has not been set yet.
+    if (pattern == null) {
+      // NOTE(review): unsynchronized; the three-argument overload shown in this patch does not perform this check.
+      pattern = Pattern.compile(URL_PATTERN, Pattern.MULTILINE | Pattern.CASE_INSENSITIVE);
+    }
+
     return OutlinkExtractor.getOutlinks(plainText, "", conf);
   }
 
   /**
    * Extracts <code>Outlink</code> from given plain text and adds anchor
    * to the extracted <code>Outlink</code>s
-   * 
+   *
    * @param plainText the plain text from wich URLs should be extracted.
    * @param anchor    the anchor of the url
-   * 
+   *
    * @return Array of <code>Outlink</code>s within found in plainText
    */
   public static Outlink[] getOutlinks(final String plainText, String anchor, Configuration conf) {
+
     long start = System.currentTimeMillis();
     final List<Outlink> outlinks = new ArrayList<Outlink>();
 
+    // Return an empty result for null text (a test fails without this check).
+    if (plainText == null)
+    {
+      return new Outlink[0];
+    }
+
     try {
-      final PatternCompiler cp = new Perl5Compiler();
-      final Pattern pattern = cp.compile(URL_PATTERN,
-          Perl5Compiler.CASE_INSENSITIVE_MASK | Perl5Compiler.READ_ONLY_MASK
-              | Perl5Compiler.MULTILINE_MASK);
-      final PatternMatcher matcher = new Perl5Matcher();
+      Matcher matcher = pattern.matcher(plainText);
 
-      final PatternMatcherInput input = new PatternMatcherInput(plainText);
-
-      MatchResult result;
       String url;
 
-      //loop the matches
-      while (matcher.contains(input, pattern)) {
+      // Loop over every match found in the text
+      while (matcher.find()) {
         // if this is taking too long, stop matching
         //   (SHOULD really check cpu time used so that heavily loaded systems
         //   do not unnecessarily hit this limit.)
@@ -108,8 +110,9 @@
           }
           break;
         }
-        result = matcher.getMatch();
-        url = result.group(0);
+
+        url = matcher.group(0);
+
         try {
           outlinks.add(new Outlink(url, anchor));
         } catch (MalformedURLException mue) {
@@ -132,9 +135,74 @@
 
     return retval;
   }
-  
 
   /**
+   * Apache ORO Regexp API variant of the <code>Outlink</code> extraction; currently
+   * disabled, because its implementation is commented out in the method body.
+   *
+   * @param plainText the plain text from which URLs should be extracted.
+   * @param anchor    the anchor of the url
+   * @param conf      not used while the implementation is commented out
+   * @return Array of <code>Outlink</code>s found in plainText (once re-enabled)
+   * @throws UnsupportedOperationException always, until the body is uncommented
+   */
+  public static Outlink[] getOutlinksApacheOroImpl(final String plainText, String anchor, Configuration conf) {
+
+    throw new UnsupportedOperationException(
+        "Implementation commented out. Please uncomment to use it.");
+    // NOTE(review): re-enabling the code below also requires restoring the org.apache.oro imports removed by this patch.
+//     long start = System.currentTimeMillis();
+//     final List<Outlink> outlinks = new ArrayList<Outlink>();
+// 
+//     try {
+//       final PatternCompiler cp = new Perl5Compiler();
+//       final Pattern pattern = cp.compile(URL_PATTERN,
+//           Perl5Compiler.CASE_INSENSITIVE_MASK | Perl5Compiler.READ_ONLY_MASK
+//               | Perl5Compiler.MULTILINE_MASK);
+//       final PatternMatcher matcher = new Perl5Matcher();
+// 
+//       final PatternMatcherInput input = new PatternMatcherInput(plainText);
+// 
+//       MatchResult result;
+//       String url;
+// 
+//       //loop the matches
+//       while (matcher.contains(input, pattern)) {
+//         // if this is taking too long, stop matching
+//         //   (SHOULD really check cpu time used so that heavily loaded systems
+//         //   do not unnecessarily hit this limit.)
+//         if (System.currentTimeMillis() - start >= 60000L) {
+//           if (LOG.isWarnEnabled()) {
+//             LOG.warn("Time limit exceeded for getOutLinks");
+//           }
+//           break;
+//         }
+//         result = matcher.getMatch();
+//         url = result.group(0);
+//         try {
+//           outlinks.add(new Outlink(url, anchor));
+//         } catch (MalformedURLException mue) {
+//           LOG.warn("Invalid url: '" + url + "', skipping.");
+//         }
+//       }
+//     } catch (Exception ex) {
+//       // if the matcher fails (perhaps a malformed URL) we just log it and move on
+//       if (LOG.isErrorEnabled()) { LOG.error("getOutlinks", ex); }
+//     }
+// 
+//     final Outlink[] retval;
+// 
+//     //create array of the Outlinks
+//     if (outlinks != null && outlinks.size() > 0) {
+//       retval = outlinks.toArray(new Outlink[0]);
+//     } else {
+//       retval = new Outlink[0];
+//     }
+// 
+//     return retval;
+  }
+
+  /**
    * Extracts outlinks from a plain text. <br />
    * This Method takes the Jakarta Regexp API.
    * 
