/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.tika.parser.html; import org.apache.tika.metadata.Metadata; import org.junit.Before; import org.junit.Ignore; import org.junit.Test; import java.io.*; import java.nio.charset.Charset; import java.nio.charset.StandardCharsets; import static org.junit.Assert.assertEquals; /** Charset-detection tests: each test passes a document string (HTML, judging by the test names and comments) to the assertWindows1252 or assertCharset helper together with the charset expected for it. Per the inline comments, the cases include a BOM taking precedence over both the meta declaration and transport-layer Metadata.CONTENT_ENCODING, a declared UTF-16 encoding being reported as UTF-8, x-user-defined and iso-8859-1 being reported as windows-1252, the Mac Roman encoding being reported under the Java name x-MacRoman, and garbage placed before the declaration. The helpers assertWindows1252, assertCharset and throwAfter are not defined in the visible part of this file; presumably they run StrictHtmlEncodingDetector on the input -- TODO confirm. NOTE(review): many fixture string literals appear empty or cut short in this copy; verify them against the upstream file before relying on these tests. */ public class StrictHtmlEncodingDetectorTest { /* Metadata instance for the current test; withUserProvidedCharset sets Metadata.CONTENT_ENCODING on it. How the helpers consume it is not visible here. */ private Metadata metadata = new Metadata(); /** Replaces the metadata with a fresh instance before each test, so values set by one test do not carry over to the next. */ @Before public void setUp() { this.metadata = new Metadata(); } @Test public void basic() throws IOException { assertWindows1252(""); } @Test public void duplicateMeta() throws IOException { assertWindows1252("" + ""); } @Test public void httpEquiv() throws IOException { assertWindows1252(""); // quotes around the charset are allowed assertWindows1252(""); } @Test public void httpEquivDuplicateCharset() throws IOException { assertWindows1252(""); } @Test public void htmlFragment() throws IOException { assertWindows1252(""); } @Test public void verBadHtml() throws IOException { // check that the parser is not confused by garbage before the declaration assertWindows1252("<< l \" == / '=x\n >" + " " + "< " + "" + "" + ""); } @Test public void incompleteMeta() throws IOException { assertWindows1252("' at the end } @Test public 
void charsetWithWhiteSpaces() throws IOException { assertWindows1252(""); } @Test public void mixedCase() throws IOException { assertWindows1252(""); } @Test public void utf16() throws IOException { // According to the specification 'If charset is a UTF-16 encoding, then set charset to UTF-8.' assertCharset("", StandardCharsets.UTF_8); } @Test public void xUserDefined() throws IOException { // According to the specification 'If charset is x-user-defined, then set charset to windows-1252.' assertWindows1252(""); } @Test public void iso88591() throws IOException { // In the spec, iso-8859-1 is an alias for WINDOWS-1252 assertWindows1252(""); } @Test public void macintoshEncoding() throws IOException { // The mac roman encoding exists in java, but under the name x-MacRoman assertCharset("", Charset.forName("x-MacRoman")); } @Test public void bom() throws IOException { // A BOM should have precedence over the meta assertCharset("\ufeff", StandardCharsets.UTF_8); assertCharset("\ufeff", StandardCharsets.UTF_16LE); assertCharset("\ufeff", StandardCharsets.UTF_16BE); } @Test public void withSlash() throws IOException { assertWindows1252(""); } @Test public void insideDescription() throws IOException { assertWindows1252("" + ""); } @Test public void insideTag() throws IOException { assertWindows1252("\" " + // inside attribute "" + // tag end ""); } @Test public void missingAttribute() throws IOException { assertWindows1252( "" + // missing http-equiv attribute "" // valid declaration ); } @Test public void insideSpecialTag() throws IOException { // Content inside " + // inside special tag "" // real charset declaration ); } @Test public void spaceBeforeTag() throws IOException { assertWindows1252( "< meta charset='UTF-8'>" + // invalid charset declaration "" // real charset declaration ); } @Test public void invalidAttribute() throws IOException { assertWindows1252( "" // real charset declaration ); } @Test public void unmatchedQuote() throws IOException { 
assertWindows1252( "" + // invalid charset declaration "" // real charset declaration ); } @Test public void realWorld() throws IOException { assertWindows1252("\n" + "\n" + "\n" + "\n" + "Horaires Transilien 2018 - Lignes A B C D E H J K L N P R U\n" + "\n" + "\n" + "\n" + "\n" + "\n" + " is a valid comment assertWindows1252( "" + // end comment "" + // compact comment "" // outside comment, charset declaration ); } @Test public void withUserProvidedCharset() throws IOException { metadata.set(Metadata.CONTENT_ENCODING, "ISO-8859-1"); // ISO-8859-1 is an alias for WINDOWS-1252, even if it's set at the transport layer level assertWindows1252(""); assertWindows1252(""); assertWindows1252(""); // if a BOM is present, it has precedence over transport layer information assertCharset("\ufeff", StandardCharsets.UTF_8); assertCharset("\ufeff", StandardCharsets.UTF_16LE); assertCharset("\ufeff", StandardCharsets.UTF_16BE); } @Test public void throwResistance() throws IOException { // The preprocessing should return right after having found the charset // So if an error is thrown in the stream AFTER the declaration, // it shouldn't see it assertWindows1252(throwAfter("")); assertWindows1252(throwAfter("