package org.cogroo.gc.cmdline.dictionary;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Reader;
import java.nio.charset.Charset;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.SortedMap;
import java.util.StringTokenizer;
import java.util.TreeMap;
import java.util.TreeSet;
import opennlp.tools.cmdline.ArgumentParser;
import opennlp.tools.cmdline.BasicCmdLineTool;
import opennlp.tools.cmdline.CmdLineUtil;
import opennlp.tools.cmdline.TerminateToolException;
import opennlp.tools.postag.Triple;
import opennlp.tools.util.featuregen.StringPattern;
import org.cogroo.entities.impl.MorphologicalTag;
import org.cogroo.formats.ad.ADFeaturizerSampleStream;
import org.cogroo.interpreters.FlorestaTagInterpreter;
import org.cogroo.interpreters.JspellTagInterpreter;
import org.cogroo.interpreters.TagInterpreter;
import org.cogroo.tools.featurizer.FeatureSample;

/* loaded from: input_file:org/cogroo/gc/cmdline/dictionary/TabSeparatedPOSDictionaryBuilderTool.class */
public class TabSeparatedPOSDictionaryBuilderTool extends BasicCmdLineTool {
    private static final char HT = '\t';
    private static final char NL = '\n';

    /* loaded from: input_file:org/cogroo/gc/cmdline/dictionary/TabSeparatedPOSDictionaryBuilderTool$Params.class */
    interface Params extends POSDictionaryBuilderParams {
        @ArgumentParser.OptionalParameter(defaultValue = "false")
        @ArgumentParser.ParameterDescription(valueName = "includeFetures", description = "include features")
        Boolean getIsIncludeFeatures();

        @ArgumentParser.OptionalParameter(defaultValue = "false")
        @ArgumentParser.ParameterDescription(valueName = "includeFromCorpus", description = "include from corpus")
        Boolean getIncludeFromCorpus();

        @ArgumentParser.OptionalParameter(defaultValue = "false")
        @ArgumentParser.ParameterDescription(valueName = "expandME", description = "include from corpus")
        Boolean getExpandME();
    }

    public String getShortDescription() {
        return "builds a new tab separated lexical dictionary to be used with FSA builder";
    }

    public String getHelp() {
        return getBasicHelp(Params.class);
    }

    public void run(String[] strArr) {
        Params params = (Params) validateAndParseParams(strArr, Params.class);
        File inputFile = params.getInputFile();
        File outputFile = params.getOutputFile();
        File corpus = params.getCorpus();
        Charset encoding = params.getEncoding();
        CmdLineUtil.checkInputFile("dictionary input file", inputFile);
        CmdLineUtil.checkOutputFile("dictionary output file", outputFile);
        CmdLineUtil.checkInputFile("corpus input file", corpus);
        InputStreamReader inputStreamReader = null;
        OutputStreamWriter outputStreamWriter = null;
        try {
            try {
                try {
                    ADFeaturizerSampleStream aDFeaturizerSampleStream = new ADFeaturizerSampleStream(new FileInputStream(corpus), "ISO-8859-1", params.getExpandME().booleanValue());
                    HashSet hashSet = new HashSet();
                    HashSet hashSet2 = new HashSet();
                    for (FeatureSample read = aDFeaturizerSampleStream.read(); read != null; read = aDFeaturizerSampleStream.read()) {
                        Collections.addAll(hashSet, read.getFeatures());
                        Collections.addAll(hashSet2, read.getTags());
                    }
                    aDFeaturizerSampleStream.close();
                    inputStreamReader = new InputStreamReader(new FileInputStream(inputFile), encoding);
                    TreeMap treeMap = new TreeMap();
                    parseOneEntryPerLine(inputStreamReader, treeMap, new JspellTagInterpreter(), new FlorestaTagInterpreter(), hashSet, hashSet2, params.getAllowInvalidFeats().booleanValue(), params.getIsIncludeFeatures().booleanValue());
                    inputStreamReader.close();
                    TreeMap treeMap2 = new TreeMap();
                    if (params.getIncludeFromCorpus().booleanValue()) {
                        ADFeaturizerSampleStream aDFeaturizerSampleStream2 = new ADFeaturizerSampleStream(new FileInputStream(corpus), "ISO-8859-1", params.getExpandME().booleanValue());
                        for (FeatureSample read2 = aDFeaturizerSampleStream2.read(); read2 != null; read2 = aDFeaturizerSampleStream2.read()) {
                            String[] sentence = read2.getSentence();
                            String[] lemmas = read2.getLemmas();
                            String[] tags = read2.getTags();
                            String[] features = read2.getFeatures();
                            for (int i = 0; i < sentence.length; i++) {
                                String lowerCase = !"prop".equals(tags[i]) ? sentence[i].toLowerCase() : sentence[i];
                                if (isValid((Collection) treeMap.get(lowerCase), lowerCase, tags[i], lemmas[i], features[i], params.getIsIncludeFeatures().booleanValue())) {
                                    Triple asTriple = asTriple(tags[i], lemmas[i], features[i], params.getIsIncludeFeatures().booleanValue());
                                    put(lowerCase, asTriple, treeMap);
                                    if (!"prop".equals(asTriple.getClazz())) {
                                        if (!treeMap2.containsKey(lowerCase)) {
                                            treeMap2.put(lowerCase, new HashSet());
                                        }
                                        ((Set) treeMap2.get(lowerCase)).add(asTriple.toString());
                                    }
                                }
                            }
                        }
                        aDFeaturizerSampleStream2.close();
                        for (String str : treeMap2.keySet()) {
                            Iterator it = ((Set) treeMap2.get(str)).iterator();
                            while (it.hasNext()) {
                                System.out.println(str + " - " + ((String) it.next()));
                            }
                        }
                    }
                    outputStreamWriter = new OutputStreamWriter(new FileOutputStream(outputFile), "UTF-8");
                    for (String str2 : treeMap.keySet()) {
                        Iterator it2 = ((Set) treeMap.get(str2)).iterator();
                        while (it2.hasNext()) {
                            outputStreamWriter.append((CharSequence) toString(str2, (Triple) it2.next()));
                        }
                    }
                    outputStreamWriter.close();
                    try {
                        inputStreamReader.close();
                        outputStreamWriter.close();
                    } catch (IOException e) {
                    }
                } catch (Throwable th) {
                    try {
                        inputStreamReader.close();
                        outputStreamWriter.close();
                    } catch (IOException e2) {
                    }
                    throw th;
                }
            } catch (IOException e3) {
                throw new TerminateToolException(-1, "IO error while reading training data or indexing data: " + e3.getMessage());
            }
        } catch (Exception e4) {
            throw new TerminateToolException(-1, "Exception: " + e4.getMessage());
        }
    }

    private static void put(String str, Triple triple, SortedMap<String, Set<Triple>> sortedMap) {
        if (!sortedMap.containsKey(str)) {
            sortedMap.put(str, new HashSet());
        }
        sortedMap.get(str).add(triple);
    }

    private boolean isValid(Collection<Triple> collection, String str, String str2, String str3, String str4, boolean z) {
        if (StringPattern.recognize(str).containsDigit() || str2.startsWith("B-") || str2.startsWith("I-")) {
            return false;
        }
        if (collection == null || collection.size() <= 0) {
            return true;
        }
        HashSet hashSet = new HashSet();
        for (Triple triple : collection) {
            String str5 = null;
            if (z) {
                str5 = triple.getFeats();
            }
            hashSet.add(triple.getClazz() + "|" + str5);
        }
        return !hashSet.contains(new StringBuilder().append(str2).append("|").append(z ? str4 : null).toString());
    }

    public static void parseOneEntryPerLine(Reader reader, SortedMap<String, Set<Triple>> sortedMap, TagInterpreter tagInterpreter, TagInterpreter tagInterpreter2, Set<String> set, Set<String> set2, boolean z, boolean z2) throws IOException {
        TreeSet treeSet = new TreeSet(set);
        if (!z2) {
            z = true;
        }
        BufferedReader bufferedReader = new BufferedReader(reader);
        TreeSet treeSet2 = new TreeSet();
        while (true) {
            String readLine = bufferedReader.readLine();
            if (readLine == null) {
                break;
            }
            StringTokenizer stringTokenizer = new StringTokenizer(readLine, " ");
            String nextToken = stringTokenizer.nextToken();
            while (stringTokenizer.hasMoreTokens()) {
                String nextToken2 = stringTokenizer.nextToken();
                String[] split = nextToken2.split(">");
                if (split.length != 2) {
                    System.err.println("** Invalid lemmatag. " + nextToken + " -> " + nextToken2);
                } else {
                    MorphologicalTag parseMorphologicalTag = tagInterpreter.parseMorphologicalTag(split[1]);
                    if (parseMorphologicalTag == null || parseMorphologicalTag.getClazzE() == null) {
                        System.err.println("-- Missing class tag. " + nextToken + " -> " + nextToken2);
                    } else {
                        MorphologicalTag morphologicalTag = new MorphologicalTag();
                        morphologicalTag.setClazz(parseMorphologicalTag.getClazzE());
                        String serialize = tagInterpreter2.serialize(morphologicalTag);
                        if (serialize == null) {
                            System.out.println("erro :(");
                        }
                        MorphologicalTag m4clone = parseMorphologicalTag.m4clone();
                        m4clone.setClazz(null);
                        String str = null;
                        if (!m4clone.isEmpty()) {
                            str = tagInterpreter2.serialize(m4clone);
                        }
                        if (str == null || str.length() == 0) {
                            str = "-";
                        }
                        if (!serialize.startsWith("v-") || !nextToken.contains("-")) {
                            if ("pron".equals(serialize)) {
                                if (treeSet.contains(str) || z) {
                                    put(nextToken, asTriple("pron-det", split[0], str, z2), sortedMap);
                                    put(nextToken, asTriple("pron-indp", split[0], str, z2), sortedMap);
                                }
                            } else if (serialize != null && set2.contains(serialize) && (treeSet.contains(str) || z)) {
                                put(nextToken, asTriple(serialize, split[0], str, z2), sortedMap);
                            } else {
                                if ("pnt".equals(serialize) && set2.contains(nextToken)) {
                                    put(nextToken, asTriple(nextToken, nextToken, null, z2), sortedMap);
                                } else if (!serialize.startsWith("v-")) {
                                    System.err.println("unknown - " + nextToken + " -> " + new Triple(serialize, split[0], serialize + "_" + str));
                                }
                                treeSet2.add(serialize + "_" + str);
                            }
                        }
                    }
                }
            }
        }
        if (treeSet.size() > 0) {
            System.err.print("Known tags:");
            Iterator it = treeSet.iterator();
            while (it.hasNext()) {
                System.err.print(" " + ((String) it.next()));
            }
            System.err.println();
        }
        if (treeSet2.size() > 0) {
            System.err.print("Found unknown tags:");
            Iterator it2 = treeSet2.iterator();
            while (it2.hasNext()) {
                System.err.print(" " + ((String) it2.next()));
            }
            System.err.println();
        }
    }

    private static Triple asTriple(String str, String str2, String str3, boolean z) {
        return z ? new Triple(str, str2, str3) : new Triple(str, str2, (String) null);
    }

    private static String toString(String str, Triple triple) {
        StringBuilder sb = new StringBuilder();
        sb.append(str).append('\t').append(triple.getLemma()).append('\t').append(triple.getClazz());
        if (triple.getFeats() != null && triple.getFeats().length() > 0) {
            sb.append("#").append(triple.getFeats());
        }
        sb.append('\n');
        return sb.toString();
    }
}
