/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.html;
import org.apache.tika.metadata.Metadata;
import org.junit.Before;
import org.junit.Ignore;
import org.junit.Test;
import java.io.*;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import static org.junit.Assert.assertEquals;
public class StrictHtmlEncodingDetectorTest {
// Metadata instance shared by the test methods of this class; setUp() replaces it
// with a fresh instance before each test.
private Metadata metadata = new Metadata();
/**
 * Replaces the {@code metadata} field with a new, empty {@link Metadata}
 * before each test method runs.
 */
@Before
public void setUp() {
    metadata = new Metadata();
}
/**
 * Simplest scenario: hands a single document to {@code assertWindows1252}
 * (helper defined elsewhere in this class).
 *
 * NOTE(review): the document literal is empty in this revision; given the test
 * name it presumably should hold a charset declaration - confirm against upstream.
 */
@Test
public void basic() throws IOException {
    final String html = "";
    assertWindows1252(html);
}
/**
 * Hands a document assembled from two concatenated fragments to
 * {@code assertWindows1252} (helper defined elsewhere in this class).
 *
 * NOTE(review): both fragments are empty in this revision; the test name
 * suggests each should be a meta declaration - confirm against upstream.
 */
@Test
public void duplicateMeta() throws IOException {
    final String html = ""
            + "";
    assertWindows1252(html);
}
/**
 * Runs {@code assertWindows1252} (helper defined elsewhere in this class) on two
 * documents, one after the other.
 *
 * NOTE(review): both literals are empty in this revision; the test name suggests
 * they should be http-equiv style declarations - confirm against upstream.
 */
@Test
public void httpEquiv() throws IOException {
    // quotes around the charset are allowed
    final String quotedCharset = "";
    assertWindows1252(quotedCharset);

    final String secondVariant = "";
    assertWindows1252(secondVariant);
}
/**
 * Hands a single document to {@code assertWindows1252} (helper defined elsewhere
 * in this class).
 *
 * NOTE(review): the literal is empty in this revision; the test name suggests an
 * http-equiv declaration with a repeated charset - confirm against upstream.
 */
@Test
public void httpEquivDuplicateCharset() throws IOException {
    final String html = "";
    assertWindows1252(html);
}
// Passes one document to assertWindows1252 (helper defined elsewhere in this class).
// NOTE(review): the string literal below is split across two lines, which a Java
// string literal cannot be; its content appears to have been lost. Restore the
// fragment from the upstream source before relying on this test.
@Test
public void htmlFragment() throws IOException {
assertWindows1252("
");
}
/**
 * Hands {@code assertWindows1252} (helper defined elsewhere in this class) a
 * document that begins with malformed markup, to check that the parser is not
 * confused by garbage before the declaration.
 *
 * NOTE(review): several of the concatenated fragments are empty in this
 * revision - confirm against upstream.
 */
@Test
public void verBadHtml() throws IOException {
    final String garbageThenDeclaration = "<< l \" == / '=x\n >"
            + " "
            + "< "
            + ""
            + ""
            + "";
    assertWindows1252(garbageThenDeclaration);
}
// Passes one document to assertWindows1252 (helper defined elsewhere in this class).
// NOTE(review): the string literal below is never closed and the call has no
// closing parenthesis; the text "' at the end" looks like the tail of a trailing
// comment that was fused into the literal. Restore from the upstream source.
@Test
public void incompleteMeta() throws IOException {
assertWindows1252("' at the end
}
/**
 * Hands a single document to {@code assertWindows1252} (helper defined elsewhere
 * in this class).
 *
 * NOTE(review): the literal is empty in this revision; the test name suggests a
 * charset value surrounded by whitespace - confirm against upstream.
 */
@Test
public void charsetWithWhiteSpaces() throws IOException {
    final String html = "";
    assertWindows1252(html);
}
/**
 * Hands a single document to {@code assertWindows1252} (helper defined elsewhere
 * in this class).
 *
 * NOTE(review): the literal is empty in this revision; the test name suggests a
 * declaration written in mixed upper/lower case - confirm against upstream.
 */
@Test
public void mixedCase() throws IOException {
    final String html = "";
    assertWindows1252(html);
}
/**
 * Calls {@code assertCharset} (helper defined elsewhere in this class) with
 * UTF-8 as the expected charset.
 *
 * NOTE(review): the document literal is empty in this revision; the test name
 * suggests it should declare a UTF-16 charset - confirm against upstream.
 */
@Test
public void utf16() throws IOException {
    // According to the specification 'If charset is a UTF-16 encoding, then set charset to UTF-8.'
    final Charset expected = StandardCharsets.UTF_8;
    final String html = "";
    assertCharset(html, expected);
}
/**
 * Hands a single document to {@code assertWindows1252} (helper defined elsewhere
 * in this class).
 *
 * NOTE(review): the literal is empty in this revision; the test name suggests it
 * should declare the x-user-defined charset - confirm against upstream.
 */
@Test
public void xUserDefined() throws IOException {
    // According to the specification 'If charset is x-user-defined, then set charset to windows-1252.'
    final String html = "";
    assertWindows1252(html);
}
/**
 * Hands a single document to {@code assertWindows1252} (helper defined elsewhere
 * in this class).
 *
 * NOTE(review): the literal is empty in this revision; the test name suggests it
 * should declare iso-8859-1 - confirm against upstream.
 */
@Test
public void iso88591() throws IOException {
    // In the spec, iso-8859-1 is an alias for WINDOWS-1252
    final String html = "";
    assertWindows1252(html);
}
/**
 * Calls {@code assertCharset} (helper defined elsewhere in this class) with the
 * JVM charset registered as {@code x-MacRoman} as the expected value.
 *
 * NOTE(review): the document literal is empty in this revision; the test name
 * suggests it should declare the macintosh charset - confirm against upstream.
 */
@Test
public void macintoshEncoding() throws IOException {
    // The mac roman encoding exists in java, but under the name x-MacRoman
    final Charset macRoman = Charset.forName("x-MacRoman");
    assertCharset("", macRoman);
}
/**
 * Calls {@code assertCharset} (helper defined elsewhere in this class) with the
 * same document, which starts with U+FEFF, once per expected charset: UTF-8,
 * UTF-16LE, then UTF-16BE.
 *
 * NOTE(review): nothing follows the U+FEFF character in this revision; the
 * comment below implies a meta declaration should - confirm against upstream.
 */
@Test
public void bom() throws IOException {
    // A BOM should have precedence over the meta
    final String html = "\ufeff";
    final Charset[] expectedCharsets = {
        StandardCharsets.UTF_8,
        StandardCharsets.UTF_16LE,
        StandardCharsets.UTF_16BE
    };
    for (Charset expected : expectedCharsets) {
        assertCharset(html, expected);
    }
}
/**
 * Hands a single document to {@code assertWindows1252} (helper defined elsewhere
 * in this class).
 *
 * NOTE(review): the literal is empty in this revision; the test name suggests a
 * declaration containing a slash - confirm against upstream.
 */
@Test
public void withSlash() throws IOException {
    final String html = "";
    assertWindows1252(html);
}
/**
 * Hands a document assembled from two concatenated fragments to
 * {@code assertWindows1252} (helper defined elsewhere in this class).
 *
 * NOTE(review): both fragments are empty in this revision - confirm the intended
 * markup against upstream.
 */
@Test
public void insideDescription() throws IOException {
    final String html = ""
            + "";
    assertWindows1252(html);
}
/**
 * Hands a three-fragment document to {@code assertWindows1252} (helper defined
 * elsewhere in this class).
 *
 * NOTE(review): the second and third fragments are empty in this revision -
 * confirm the intended markup against upstream.
 */
@Test
public void insideTag() throws IOException {
    final String html = "\" " // inside attribute
            + "" // tag end
            + "";
    assertWindows1252(html);
}
/**
 * Hands a two-fragment document to {@code assertWindows1252} (helper defined
 * elsewhere in this class).
 *
 * NOTE(review): both fragments are empty in this revision; the inline comments
 * describe what each was meant to contain - confirm against upstream.
 */
@Test
public void missingAttribute() throws IOException {
    final String html = "" // missing http-equiv attribute
            + ""; // valid declaration
    assertWindows1252(html);
}
// NOTE(review): this method body is malformed in this revision. The first body
// line is entirely a comment, so the string fragment and the ");" that follow it
// have no opening call. The opening of the call was presumably fused into that
// comment - restore from the upstream source.
@Test
public void insideSpecialTag() throws IOException {
// Content inside , " + // inside special tag
"" // real charset declaration
);
}
/**
 * Hands a two-fragment document to {@code assertWindows1252} (helper defined
 * elsewhere in this class). The first fragment is a meta tag written with a
 * space between the opening bracket and the tag name.
 *
 * NOTE(review): the second fragment is empty in this revision - confirm the
 * intended markup against upstream.
 */
@Test
public void spaceBeforeTag() throws IOException {
    final String html = "< meta charset='UTF-8'>" // invalid charset declaration
            + ""; // real charset declaration
    assertWindows1252(html);
}
/**
 * Hands a single document to {@code assertWindows1252} (helper defined elsewhere
 * in this class).
 *
 * NOTE(review): the literal is empty in this revision; the test name suggests an
 * invalid attribute should precede the real declaration - confirm against upstream.
 */
@Test
public void invalidAttribute() throws IOException {
    final String html = ""; // real charset declaration
    assertWindows1252(html);
}
/**
 * Hands a two-fragment document to {@code assertWindows1252} (helper defined
 * elsewhere in this class).
 *
 * NOTE(review): both fragments are empty in this revision; the inline comments
 * describe what each was meant to contain - confirm against upstream.
 */
@Test
public void unmatchedQuote() throws IOException {
    final String html = "" // invalid charset declaration
            + ""; // real charset declaration
    assertWindows1252(html);
}
// NOTE(review): this method is malformed in this revision. The first
// assertWindows1252 call is never closed: its last fragment (" is a valid comment)
// is an unterminated string literal. A second assertWindows1252 call follows it,
// with inline comments about HTML comments. Presumably the end of this test and
// the start of a separate comment-handling test were fused together - restore
// both from the upstream source.
@Test
public void realWorld() throws IOException {
assertWindows1252("\n" +
"\n" +
"\n" +
"\n" +
"Horaires Transilien 2018 - Lignes A B C D E H J K L N P R U\n" +
"\n" +
"\n" +
"\n" +
"\n" +
"\n" +
" is a valid comment
assertWindows1252(
"" + // end comment
"" + // compact comment
"" // outside comment, charset declaration
);
}
/**
 * Sets {@code Metadata.CONTENT_ENCODING} to {@code "ISO-8859-1"} on the shared
 * metadata, then runs three {@code assertWindows1252} checks followed by three
 * {@code assertCharset} checks on a document starting with U+FEFF (both helpers
 * are defined elsewhere in this class).
 *
 * NOTE(review): the three windows-1252 documents are empty, and so identical, in
 * this revision; they were presumably distinct - confirm against upstream.
 */
@Test
public void withUserProvidedCharset() throws IOException {
    metadata.set(Metadata.CONTENT_ENCODING, "ISO-8859-1");

    // ISO-8859-1 is an alias for WINDOWS-1252, even if it's set at the transport layer level
    final String firstDocument = "";
    final String secondDocument = "";
    final String thirdDocument = "";
    assertWindows1252(firstDocument);
    assertWindows1252(secondDocument);
    assertWindows1252(thirdDocument);

    // if a BOM is present, it has precedence over transport layer information
    final String bomDocument = "\ufeff";
    final Charset[] expectedCharsets = {
        StandardCharsets.UTF_8,
        StandardCharsets.UTF_16LE,
        StandardCharsets.UTF_16BE
    };
    for (Charset expected : expectedCharsets) {
        assertCharset(bomDocument, expected);
    }
}
@Test
public void throwResistance() throws IOException {
// The preprocessing should return right after having found the charset
// So if an error is thrown in the stream AFTER the declaration,
// it shouldn't see it
assertWindows1252(throwAfter(""));
assertWindows1252(throwAfter("