/*
 * Decompiled with CFR 0.152.
 */
package edu.stanford.nlp.patterns.surface;

import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.patterns.DataInstance;
import edu.stanford.nlp.process.DocumentPreprocessor;
import edu.stanford.nlp.process.PTBTokenizer;
import edu.stanford.nlp.sequences.CoNLLDocumentReaderAndWriter;
import edu.stanford.nlp.sequences.SeqClassifierFlags;
import edu.stanford.nlp.util.ArrayCoreMap;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.StringUtils;
import edu.stanford.nlp.util.TypesafeMap;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Static readers that convert label-annotated text into token sequences.
 *
 * <p>Two input formats are handled: a column (CoNLL-style) format read through
 * {@link CoNLLDocumentReaderAndWriter}, and free text in which labelled spans are wrapped in
 * inline {@code <LABEL> ... </LABEL>} tags.
 */
public class AnnotatedTextReader {
    /** Label assigned by {@code parseFile} to tokens that are outside every tagged span. */
    private static final String BACKGROUND_SYMBOL = "O";

    /** Tokenizer options used by {@code parseFile}. */
    private static final String TOKENIZER_OPTIONS = "ptb3Escaping=false,normalizeParentheses=false,escapeForwardSlashAsterisk=false";

    /**
     * Reads column-formatted data and groups its tokens into sentences.
     *
     * <p>The reader is configured with {@code entitySubclassification = "noprefix"} and
     * {@code retainEntitySubclassification = false}. Within each document returned by the reader,
     * a token whose word is {@code *BOUNDARY*} or {@code -DOCSTART-} ends the current sentence and
     * is itself dropped. Every kept token gets a 1-based {@code IndexAnnotation} (restarting for
     * each sentence) and its word copied into the value, text and original-text annotations.
     *
     * @param reader source of the column data
     * @param categoriesAllowed not consulted by this method
     * @param setClassForTheseLabels optional (may be {@code null}) map from label to the annotation
     *     key under which that label is additionally stored on tokens carrying it
     * @param setGoldClass when {@code true}, the token's {@code AnswerAnnotation} value is also
     *     stored as its {@code GoldAnswerAnnotation}
     * @param sentIDprefix prefix of the generated sentence ids
     * @return map from {@code sentIDprefix + "-" + n} to the sentence instance, where {@code n} is
     *     a counter starting at 0 that runs across all documents; empty sentences are skipped
     */
    public static Map<String, DataInstance> parseColumnFile(BufferedReader reader, Set<String> categoriesAllowed, Map<String, Class<? extends TypesafeMap.Key<String>>> setClassForTheseLabels, boolean setGoldClass, String sentIDprefix) {
        CoNLLDocumentReaderAndWriter conllreader = new CoNLLDocumentReaderAndWriter();
        SeqClassifierFlags flags = new SeqClassifierFlags(new Properties());
        flags.entitySubclassification = "noprefix";
        flags.retainEntitySubclassification = false;
        conllreader.init(flags);
        Iterator<List<CoreLabel>> dociter = conllreader.getIterator(reader);
        int num = -1;
        Map<String, DataInstance> sents = new HashMap<String, DataInstance>();
        while (dociter.hasNext()) {
            List<CoreLabel> doc = dociter.next();
            List<CoreLabel> sentcore = new ArrayList<CoreLabel>();
            int tokenindex = 0;
            for (CoreLabel l : doc) {
                String word = l.word();
                if (word.equals("*BOUNDARY*") || word.equals("-DOCSTART-")) {
                    // Boundary markers close the sentence collected so far; consecutive
                    // markers produce no empty sentences.
                    if (!sentcore.isEmpty()) {
                        addSentence(sents, sentIDprefix, ++num, sentcore);
                        sentcore = new ArrayList<CoreLabel>();
                        tokenindex = 0;
                    }
                    continue;
                }
                l.set(CoreAnnotations.IndexAnnotation.class, ++tokenindex);
                l.set(CoreAnnotations.ValueAnnotation.class, word);
                String label = l.get(CoreAnnotations.AnswerAnnotation.class);
                assert (label != null) : "label cannot be null";
                l.set(CoreAnnotations.TextAnnotation.class, word);
                l.set(CoreAnnotations.OriginalTextAnnotation.class, word);
                if (setGoldClass) {
                    l.set(CoreAnnotations.GoldAnswerAnnotation.class, label);
                }
                if (setClassForTheseLabels != null && setClassForTheseLabels.containsKey(label)) {
                    l.set(setClassForTheseLabels.get(label), label);
                }
                sentcore.add(l);
            }
            // Flush the trailing sentence of a document that does not end with a boundary marker.
            if (!sentcore.isEmpty()) {
                addSentence(sents, sentIDprefix, ++num, sentcore);
            }
        }
        return sents;
    }

    /** Stores {@code tokens} as a surface instance under the id {@code sentIDprefix-sentNum}. */
    private static void addSentence(Map<String, DataInstance> sents, String sentIDprefix, int sentNum, List<CoreLabel> tokens) {
        sents.put(sentIDprefix + "-" + sentNum, DataInstance.getNewSurfaceInstance(tokens));
    }

    /**
     * Reads text with inline label tags, one document per line, and returns its sentences.
     *
     * <p>A line is either {@code id<TAB>text} or just {@code text}; in the latter case the
     * zero-based line number is used as the id. The text is sentence-split and tokenized, and a
     * token that is exactly {@code <CAT>} (for some {@code CAT} in {@code categoriesAllowed})
     * switches the current label to {@code CAT}, while a token that is exactly {@code </CAT>}
     * switches it back to {@code "O"}. Tag tokens are not emitted. The closing tag is not checked
     * against the opening one, and the current label carries over sentence boundaries within a
     * line but is reset at the start of every line.
     *
     * <p>Each emitted token has its word, lemma, value, text and original text set to the token
     * string. Each sentence is a map holding the space-joined token text, the token list and a
     * document id of the form {@code sentIDprefix + id + "-" + sentenceNumber} (zero-based per
     * line). Sentences that consist solely of tag tokens are still returned, with no tokens.
     *
     * @param reader source of the annotated lines
     * @param categoriesAllowed label names recognised inside tags; they are joined with {@code |}
     *     and inserted into a regular expression without quoting
     * @param setClassForTheseLabels optional (may be {@code null}) map from label to the annotation
     *     key under which that label is additionally stored on tokens carrying it
     * @param setGoldClass when {@code true}, the current label is stored as each token's
     *     {@code GoldAnswerAnnotation}
     * @param sentIDprefix prefix prepended (without separator) to every line id
     * @return all sentences in reading order
     * @throws IOException if reading a line from {@code reader} fails
     */
    public static List<CoreMap> parseFile(BufferedReader reader, Set<String> categoriesAllowed, Map<String, Class<? extends TypesafeMap.Key<String>>> setClassForTheseLabels, boolean setGoldClass, String sentIDprefix) throws IOException {
        String categoryAlternation = StringUtils.join(categoriesAllowed, "|");
        Pattern startingLabelToken = Pattern.compile("<(" + categoryAlternation + ")>");
        Pattern endLabelToken = Pattern.compile("</(" + categoryAlternation + ")>");
        // The factory does not depend on the line being read, so it is built once and shared.
        PTBTokenizer.PTBTokenizerFactory<CoreLabel> tokenizerFactory = PTBTokenizer.PTBTokenizerFactory.newCoreLabelTokenizerFactory(TOKENIZER_OPTIONS);
        List<CoreMap> sentences = new ArrayList<CoreMap>();
        int lineNum = -1;
        String line;
        while ((line = reader.readLine()) != null) {
            ++lineNum;
            // split with limit 2 always yields one or two parts, never zero.
            String[] parts = line.split("\t", 2);
            String id;
            String text;
            if (parts.length == 2) {
                id = parts[0];
                text = parts[1];
            } else {
                id = String.valueOf(lineNum);
                text = parts[0];
            }
            id = sentIDprefix + id;
            DocumentPreprocessor dp = new DocumentPreprocessor(new StringReader(text));
            dp.setTokenizerFactory(tokenizerFactory);
            String label = BACKGROUND_SYMBOL;
            int sentNum = -1;
            for (List<HasWord> sentence : dp) {
                ++sentNum;
                StringBuilder sentStr = new StringBuilder();
                List<CoreLabel> sent = new ArrayList<CoreLabel>();
                for (HasWord tokw : sentence) {
                    String tok = tokw.word();
                    Matcher startingMatcher = startingLabelToken.matcher(tok);
                    if (startingMatcher.matches()) {
                        label = startingMatcher.group(1);
                        continue;
                    }
                    if (endLabelToken.matcher(tok).matches()) {
                        label = BACKGROUND_SYMBOL;
                        continue;
                    }
                    sentStr.append(' ').append(tok);
                    CoreLabel c = new CoreLabel();
                    c.setWord(tok);
                    c.setLemma(tok);
                    c.setValue(tok);
                    c.set(CoreAnnotations.TextAnnotation.class, tok);
                    c.set(CoreAnnotations.OriginalTextAnnotation.class, tok);
                    if (setGoldClass) {
                        c.set(CoreAnnotations.GoldAnswerAnnotation.class, label);
                    }
                    if (setClassForTheseLabels != null && setClassForTheseLabels.containsKey(label)) {
                        c.set(setClassForTheseLabels.get(label), label);
                    }
                    sent.add(c);
                }
                ArrayCoreMap sentcm = new ArrayCoreMap();
                // trim() drops the separator that precedes the first token.
                sentcm.set(CoreAnnotations.TextAnnotation.class, sentStr.toString().trim());
                sentcm.set(CoreAnnotations.TokensAnnotation.class, sent);
                sentcm.set(CoreAnnotations.DocIDAnnotation.class, id + "-" + sentNum);
                sentences.add(sentcm);
            }
        }
        return sentences;
    }
}

