test page

Index: src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestRobotsMetaProcessor.java
===================================================================
--- src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestRobotsMetaProcessor.java (revision 0)
+++ src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestRobotsMetaProcessor.java (revision 0)
@@ -0,0 +1,217 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.tika;
+
+import junit.framework.TestCase;
+
+import org.apache.nutch.parse.HTMLMetaTags;
+import org.apache.nutch.parse.tika.HTMLMetaProcessor;
+
+import java.io.ByteArrayInputStream;
+import java.net.URL;
+
+import org.cyberneko.html.parsers.*;
+import org.xml.sax.*;
+import org.w3c.dom.*;
+import org.apache.html.dom.*;
+
+/**
+ * Unit tests for HTMLMetaProcessor.
+ *
+ * Every page in {@link #tests} is parsed into a DOM fragment and passed to
+ * HTMLMetaProcessor.getMetaTags(); the noindex / nofollow / nocache flags it
+ * reports are compared with the matching row of {@link #answers}, and the
+ * base href with the URL expected for that page.
+ */
+public class TestRobotsMetaProcessor extends TestCase {
+  public TestRobotsMetaProcessor(String name) {
+    super(name);
+  }
+
+  /*
+
+  some sample tags:
+
+  <meta name="robots" content="index,follow">
+  <meta name="robots" content="noindex,follow">
+  <meta name="robots" content="index,nofollow">
+  <meta name="robots" content="noindex,nofollow">
+
+  <META HTTP-EQUIV="Pragma" CONTENT="no-cache">
+
+  */
+
+
+  /**
+   * HTML pages under test, one per row of {@link #answers}. The robots meta
+   * tag of each page carries the value named in the comment on its answers
+   * row; the first three pages also carry a "pragma: no-cache" meta tag
+   * (their nocache answer is true) and the last two carry the base href the
+   * test expects for them.
+   *
+   * NOTE(review): the markup was missing from this copy of the patch and has
+   * been reconstructed from the expectations below - confirm it against the
+   * original attachment.
+   */
+  public static String[] tests=
+  {
+    "<html><head><title>test page</title>"
+    + "<META NAME=\"ROBOTS\" CONTENT=\"NONE\"> "
+    + "<META HTTP-EQUIV=\"PRAGMA\" CONTENT=\"NO-CACHE\"> "
+    + "</head><body>"
+    + " some text"
+    + "</body></html>",
+
+    "<html><head><title>test page</title>"
+    + "<meta name=\"robots\" content=\"all\"> "
+    + "<meta http-equiv=\"pragma\" content=\"no-cache\"> "
+    + "</head><body>"
+    + " some text"
+    + "</body></html>",
+
+    "<html><head><title>test page</title>"
+    + "<MeTa NaMe=\"RoBoTs\" CoNtEnT=\"nOnE\"> "
+    + "<MeTa HtTp-EqUiV=\"pRaGmA\" cOnTeNt=\"No-CaChE\"> "
+    + "</head><body>"
+    + " some text"
+    + "</body></html>",
+
+    "<html><head><title>test page</title>"
+    + "<meta name=\"robots\" content=\"none\"> "
+    + "</head><body>"
+    + " some text"
+    + "</body></html>",
+
+    "<html><head><title>test page</title>"
+    + "<meta name=\"robots\" content=\"noindex,nofollow\"> "
+    + "</head><body>"
+    + " some text"
+    + "</body></html>",
+
+    "<html><head><title>test page</title>"
+    + "<meta name=\"robots\" content=\"noindex,follow\"> "
+    + "</head><body>"
+    + " some text"
+    + "</body></html>",
+
+    "<html><head><title>test page</title>"
+    + "<meta name=\"robots\" content=\"index,nofollow\"> "
+    + "</head><body>"
+    + " some text"
+    + "</body></html>",
+
+    "<html><head><title>test page</title>"
+    + "<meta name=\"robots\" content=\"index,follow\"> "
+    + "<base href=\"http://www.nutch.org/\">"
+    + "</head><body>"
+    + " some text"
+    + "</body></html>",
+
+    "<html><head><title>test page</title>"
+    + "<meta name=\"robots\"> "
+    + "<base href=\"http://www.nutch.org/base/\">"
+    + "</head><body>"
+    + " some text"
+    + "</body></html>",
+
+  };
+
+  /**
+   * Expected {noindex, nofollow, nocache} flags for each page in
+   * {@link #tests}; the trailing comment names the robots value of that page.
+   */
+  public static final boolean[][] answers= {
+    {true, true, true}, // NONE
+    {false, false, true}, // all
+    {true, true, true}, // nOnE
+    {true, true, false}, // none
+    {true, true, false}, // noindex,nofollow
+    {true, false, false}, // noindex,follow
+    {false, true, false}, // index,nofollow
+    {false, false, false}, // index,follow
+    {false, false, false}, // missing!
+  };
+
+  /** Per page: {URL the page is parsed against, expected base href or null}. */
+  private URL[][] currURLsAndAnswers;
+
+  /**
+   * Runs every page through the parser and HTMLMetaProcessor.getMetaTags()
+   * and checks the three robots flags and the base href.
+   */
+  public void testRobotsMetaProcessor() {
+    DOMFragmentParser parser= new DOMFragmentParser();
+
+    try {
+      currURLsAndAnswers= new URL[][] {
+        {new URL("http://www.nutch.org"), null},
+        {new URL("http://www.nutch.org"), null},
+        {new URL("http://www.nutch.org"), null},
+        {new URL("http://www.nutch.org"), null},
+        {new URL("http://www.nutch.org"), null},
+        {new URL("http://www.nutch.org"), null},
+        {new URL("http://www.nutch.org"), null},
+        {new URL("http://www.nutch.org/foo/"),
+         new URL("http://www.nutch.org/")},
+        {new URL("http://www.nutch.org"),
+         new URL("http://www.nutch.org/base/")}
+      };
+    } catch (Exception e) {
+      fail("couldn't make test URLs! (" + e + ")");
+    }
+
+    // The three fixture tables are indexed in lockstep below.
+    assertEquals("tests/answers length mismatch",
+                 tests.length, answers.length);
+    assertEquals("tests/URLs length mismatch",
+                 tests.length, currURLsAndAnswers.length);
+
+    for (int i= 0; i < tests.length; i++) {
+      DocumentFragment node = new HTMLDocumentImpl().createDocumentFragment();
+
+      try {
+        byte[] bytes= tests[i].getBytes("UTF-8");
+        parser.parse(new InputSource(new ByteArrayInputStream(bytes)), node);
+      } catch (Exception e) {
+        // Carrying on with a half-built fragment would only show up as a
+        // misleading flag mismatch further down, so stop right here.
+        fail("couldn't parse test " + i + ": " + e);
+      }
+
+      HTMLMetaTags robotsMeta= new HTMLMetaTags();
+      HTMLMetaProcessor.getMetaTags(robotsMeta, node,
+                                    currURLsAndAnswers[i][0]);
+
+      assertTrue("got index wrong on test " + i,
+                 robotsMeta.getNoIndex() == answers[i][0]);
+      assertTrue("got follow wrong on test " + i,
+                 robotsMeta.getNoFollow() == answers[i][1]);
+      assertTrue("got cache wrong on test " + i,
+                 robotsMeta.getNoCache() == answers[i][2]);
+
+      // Compare the textual forms: java.net.URL.equals() resolves host names,
+      // which would make this check depend on DNS and network access.
+      Object actualBase= robotsMeta.getBaseHref();
+      URL expectedBase= currURLsAndAnswers[i][1];
+      assertEquals("got base href wrong on test " + i,
+                   (expectedBase == null) ? null : expectedBase.toString(),
+                   (actualBase == null) ? null : actualBase.toString());
+    }
+  }
+
+}
Index: src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestDOMContentUtils.java
===================================================================
--- src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestDOMContentUtils.java (revision 0)
+++ 
src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestDOMContentUtils.java (revision 0) @@ -0,0 +1,331 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.tika; + +import junit.framework.TestCase; + +import org.apache.nutch.parse.Outlink; +import org.apache.nutch.parse.tika.DOMContentUtils; +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.util.NutchConfiguration; + +import java.io.ByteArrayInputStream; +import java.net.URL; +import java.util.ArrayList; +import java.util.StringTokenizer; + +import org.cyberneko.html.parsers.*; +import org.xml.sax.*; +import org.w3c.dom.*; +import org.apache.html.dom.*; + +/** + * Unit tests for DOMContentUtils. + */ +public class TestDOMContentUtils extends TestCase { + + private static final String[] testPages = { + + new String(" title " + + " body " + + " anchor " + ""), + + new String(" title " + + " body " + " home " + + "" + " " + " bots " + + ""), + + new String(" " + " " + + " separate this " + " from this" + + "" + ""), + + // this one relies on certain neko fixup behavior, possibly + // distributing the anchors into the LI's-but not the other + // anchors (outside of them, instead)! 
So you get a tree that + // looks like: + // ...

+ //

+ //

+ new String(" my title " + + " body " + "

" + + ""), + + // test frameset link extraction. The invalid frame in the middle + // will be + // fixed to a third standalone frame. + new String(" my title " + + " " + "" + + "" + "" + + "" + "" + + "" + "" + "" + + "" + "" + ""), + + // test and