package ir.ac.iust.htmlchardet;

import com.ibm.icu.text.CharsetDetector;
import com.ibm.icu.text.CharsetMatch;
import java.io.UnsupportedEncodingException;
import java.nio.charset.Charset;
import java.util.Iterator;
import org.apache.log4j.Logger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.mozilla.intl.chardet.nsDetector;

/* loaded from: input_file:ir/ac/iust/htmlchardet/HTMLCharsetDetector.class */
/**
 * Heuristic charset detection for raw HTML byte sequences.
 *
 * <p>Detection combines up to three sources: the charset declared in the
 * document's {@code <meta>} tags (optional), Mozilla's jchardet (used only to
 * recognise UTF-8) and IBM ICU4J (run on the visible text of the page).
 *
 * <p>This is a static utility class and cannot be instantiated.
 */
public class HTMLCharsetDetector {
    private static final Logger LOG = Logger.getLogger(HTMLCharsetDetector.class);

    /**
     * Minimum number of visible-text bytes required before ICU4J is run on the
     * visible text alone; shorter texts fall back to the raw HTML bytes.
     */
    private static final int threshold = 40;

    /** Single-byte charset used to map bytes to chars and back without loss. */
    private static final String ISO_8859_1 = "ISO-8859-1";

    private HTMLCharsetDetector() {
    }

    /**
     * Detects the charset of an HTML document.
     *
     * <ol>
     *   <li>If the first element of {@code zArr} is {@code true}, a valid charset
     *       declared in a {@code <meta>} tag wins.</li>
     *   <li>Otherwise, if jchardet reports UTF-8, UTF-8 is returned.</li>
     *   <li>Otherwise ICU4J is run on the visible text of the page, or on the raw
     *       bytes when the visible text is shorter than {@link #threshold} bytes.</li>
     * </ol>
     *
     * @param bArr raw bytes of the HTML document; must not be {@code null}
     * @param zArr optional flag; when present and {@code true}, meta tags are consulted first
     * @return the detected charset name (passed through {@code Charsets.normalize} for the
     *         meta-tag and UTF-8 cases), or {@code null} if ICU4J finds no match
     */
    public static String detect(byte[] bArr, boolean... zArr) {
        boolean lookInMeta = zArr != null && zArr.length > 0 && zArr[0];

        Document document = null;
        if (lookInMeta) {
            document = createDomTree(bArr, ISO_8859_1);
            String declared = lookInMetaTags(document);
            if (Charsets.isValid(declared)) {
                return Charsets.normalize(declared);
            }
        }

        String mozillaGuess = mozillaJCharDet(bArr);
        if ("UTF-8".equalsIgnoreCase(mozillaGuess)) {
            return Charsets.normalize(mozillaGuess);
        }

        if (document == null) {
            document = createDomTree(bArr, ISO_8859_1);
        }
        // The document was decoded as ISO-8859-1, so re-encoding with the same charset
        // recovers the original bytes of the visible text. getBytes(Charset) cannot throw
        // a checked exception, unlike the getBytes(String) overload.
        byte[] visibleText = document.text().getBytes(Charset.forName(ISO_8859_1));
        byte[] sample = visibleText.length < threshold ? bArr : visibleText;
        return ibmICU4j(sample);
    }

    /** Parses the bytes into a DOM tree after decoding them with the named charset. */
    private static Document createDomTree(byte[] bArr, String str) {
        return Jsoup.parse(new String(bArr, Charset.forName(str)));
    }

    /**
     * Looks for a charset declaration in the document's {@code <meta>} tags, checking
     * both the {@code charset} attribute and a {@code charset=} parameter inside the
     * {@code content} attribute.
     *
     * @return the first declared value accepted by {@code Charsets.isValid}, or
     *         {@code null} if there is none
     */
    private static String lookInMetaTags(Document document) {
        for (Element meta : document.select("meta")) {
            String charsetAttr = meta.attr("charset");
            if (Charsets.isValid(charsetAttr)) {
                return charsetAttr;
            }
            String fromContent = charsetFromContent(meta.attr("content"));
            if (fromContent != null && Charsets.isValid(fromContent)) {
                return fromContent;
            }
        }
        return null;
    }

    /**
     * Extracts the value of the {@code charset=} parameter from a meta {@code content}
     * attribute such as {@code "text/html; charset=utf-8"}.
     *
     * @return the trimmed parameter value, or {@code null} if there is no such parameter
     */
    private static String charsetFromContent(String content) {
        final String key = "charset=";
        int keyStart = -1;
        // Case-insensitive search: attribute values may be written as "Charset=..." too.
        for (int i = 0; i + key.length() <= content.length(); i++) {
            if (content.regionMatches(true, i, key, 0, key.length())) {
                keyStart = i;
                break;
            }
        }
        if (keyStart == -1) {
            return null;
        }
        String value = content.substring(keyStart + key.length());
        // Drop any further parameters that follow the charset value.
        int nextParameter = value.indexOf(';');
        if (nextParameter != -1) {
            value = value.substring(0, nextParameter);
        }
        return value.trim();
    }

    /** Returns the first entry of jchardet's probable-charset list for the given bytes. */
    private static String mozillaJCharDet(byte[] bArr) {
        // NOTE(review): 0 is the language hint; presumably nsPSMDetector.ALL - confirm.
        nsDetector nsdetector = new nsDetector(0);
        nsdetector.DoIt(bArr, bArr.length, false);
        nsdetector.DataEnd();
        return nsdetector.getProbableCharsets()[0];
    }

    /**
     * Returns the name of ICU4J's best match for the given bytes.
     *
     * @return the charset name, or {@code null} when ICU4J reports no match
     */
    private static String ibmICU4j(byte[] bArr) {
        CharsetDetector charsetDetector = new CharsetDetector();
        charsetDetector.setText(bArr);
        CharsetMatch bestMatch = charsetDetector.detect();
        if (bestMatch == null) {
            LOG.debug("ICU4J found no matching charset for the given byte sequence.");
            return null;
        }
        return bestMatch.getName();
    }

    /**
     * Legacy detection that strips markup by hand instead of building a DOM tree:
     * returns UTF-8 if jchardet reports it, otherwise runs ICU4J on the body content
     * with {@code <script>} and {@code <style>} elements and all remaining tags removed.
     * When no text can be extracted, ICU4J is run on the raw bytes.
     *
     * @param bArr raw bytes of the HTML document; must not be {@code null}
     * @return the detected charset name, or {@code null} if ICU4J finds no match
     * @deprecated use {@link #detect(byte[], boolean...)} instead
     */
    @Deprecated
    public static String detect(byte[] bArr) {
        if (mozillaJCharDet(bArr).equalsIgnoreCase(Charsets.UTF_8.getValue())) {
            return Charsets.UTF_8.getValue();
        }
        byte[] body = extractBodyContent(bArr);
        byte[] withoutScripts = removeElements(body, "<script", "/script>");
        byte[] withoutStyles = removeElements(withoutScripts, "<style", "/style>");
        byte[] text = removeTags(withoutStyles);
        return ibmICU4j(text.length == 0 ? bArr : text);
    }

    /**
     * Concatenates the content of every {@code <body ...>...</body>} section.
     * A body without a closing tag extends to the end of the input.
     */
    private static byte[] extractBodyContent(byte[] html) {
        final String closeTag = "</body>";
        byte[] buffer = new byte[html.length];
        int length = 0;
        int open = findPattern(html, "<body", 0);
        while (open != -1) {
            // Skip the whole opening tag, including any attributes.
            int tagEnd = findPattern(html, ">", open);
            if (tagEnd == -1) {
                break;
            }
            int contentStart = tagEnd + 1;
            int close = findPattern(html, closeTag, contentStart);
            int contentEnd = close == -1 ? html.length : close;
            System.arraycopy(html, contentStart, buffer, length, contentEnd - contentStart);
            length += contentEnd - contentStart;
            if (close == -1) {
                break;
            }
            open = findPattern(html, "<body", close + closeTag.length());
        }
        return prefix(buffer, length);
    }

    /**
     * Returns a copy of {@code data} without the spans that start at {@code openPattern}
     * and end with {@code closePattern} (inclusive). An opening pattern without a matching
     * closing pattern removes everything up to the end of the input.
     */
    private static byte[] removeElements(byte[] data, String openPattern, String closePattern) {
        byte[] buffer = new byte[data.length];
        int length = 0;
        int cursor = 0;
        while (cursor < data.length) {
            int open = findPattern(data, openPattern, cursor);
            int keepUntil = open == -1 ? data.length : open;
            System.arraycopy(data, cursor, buffer, length, keepUntil - cursor);
            length += keepUntil - cursor;
            if (open == -1) {
                break;
            }
            int close = findPattern(data, closePattern, open);
            if (close == -1) {
                break;
            }
            cursor = close + closePattern.length();
        }
        return prefix(buffer, length);
    }

    /** Returns a new array holding the first {@code length} bytes of {@code buffer}. */
    private static byte[] prefix(byte[] buffer, int length) {
        byte[] result = new byte[length];
        System.arraycopy(buffer, 0, result, 0, length);
        return result;
    }

    /**
     * Finds the first occurrence of an ASCII pattern in a byte array, ignoring ASCII case.
     *
     * @param bArr bytes to search
     * @param str  pattern to look for
     * @param i    index at which the search starts
     * @return index of the first match at or after {@code i}, or -1 if there is none
     */
    private static int findPattern(byte[] bArr, String str, int i) {
        int patternLength = str.length();
        int lastStart = bArr.length - patternLength;
        for (int start = Math.max(i, 0); start <= lastStart; start++) {
            int matched = 0;
            while (matched < patternLength
                    && toAsciiLower(bArr[start + matched]) == toAsciiLower(str.charAt(matched))) {
                matched++;
            }
            if (matched == patternLength) {
                return start;
            }
        }
        return -1;
    }

    /** Lower-cases 'A'..'Z' only, so matching does not depend on the default locale. */
    private static int toAsciiLower(int c) {
        return (c >= 'A' && c <= 'Z') ? c + ('a' - 'A') : c;
    }

    /**
     * Returns the bytes that lie outside any {@code <...>} pair. The angle brackets
     * themselves are never copied.
     */
    private static byte[] removeTags(byte[] bArr) {
        byte[] text = new byte[bArr.length];
        int textLength = 0;
        int depth = 0;
        for (byte b : bArr) {
            if (b == '<') {
                depth++;
            } else if (b == '>') {
                // A stray '>' must not push the depth below zero, or all text after it
                // would be treated as markup.
                if (depth > 0) {
                    depth--;
                }
            } else if (depth == 0) {
                text[textLength++] = b;
            }
        }
        return prefix(text, textLength);
    }
}
