/*
 * Decompiled with CFR 0.152.
 */
package cc.mallet.share.casutton.ner;

import cc.mallet.extract.StringSpan;
import cc.mallet.extract.StringTokenization;
import cc.mallet.pipe.Pipe;
import cc.mallet.types.Instance;
import cc.mallet.types.LabelAlphabet;
import cc.mallet.types.LabelSequence;
import java.util.regex.Pattern;

public class ConllNer2003Sentence2TokenSequence
extends Pipe {
    static final String[] endings = new String[]{"ing", "ed", "ogy", "s", "ly", "ion", "tion", "ity", "ies"};
    static Pattern[] endingPatterns = new Pattern[endings.length];
    static final String[][][] endingNames = new String[2][3][endings.length];
    boolean saveSource;
    boolean doConjunctions;
    boolean doTags;
    boolean doPhrases;
    boolean doSpelling;
    boolean doDigitCollapses;
    boolean doDowncasing;
    private static final long serialVersionUID = -7326674871670572522L;

    public ConllNer2003Sentence2TokenSequence() {
        super(null, new LabelAlphabet());
        for (int i = 0; i < endings.length; ++i) {
            ConllNer2003Sentence2TokenSequence.endingPatterns[i] = Pattern.compile(".*" + endings[i] + "$");
            for (int j = 0; j < 3; ++j) {
                for (int k = 0; k < 2; ++k) {
                    ConllNer2003Sentence2TokenSequence.endingNames[k][j][i] = "W" + (k == 1 ? "-" : "") + j + "=<END" + endings[i] + ">";
                }
            }
        }
        this.saveSource = true;
        this.doConjunctions = false;
        this.doTags = true;
        this.doPhrases = true;
        this.doSpelling = false;
        this.doDigitCollapses = true;
        this.doDowncasing = false;
    }

    public ConllNer2003Sentence2TokenSequence(boolean useTags, boolean usePhrases) {
        super(null, new LabelAlphabet());
        for (int i = 0; i < endings.length; ++i) {
            ConllNer2003Sentence2TokenSequence.endingPatterns[i] = Pattern.compile(".*" + endings[i] + "$");
            for (int j = 0; j < 3; ++j) {
                for (int k = 0; k < 2; ++k) {
                    ConllNer2003Sentence2TokenSequence.endingNames[k][j][i] = "W" + (k == 1 ? "-" : "") + j + "=<END" + endings[i] + ">";
                }
            }
        }
        this.saveSource = true;
        this.doConjunctions = false;
        this.doTags = true;
        this.doPhrases = true;
        this.doSpelling = false;
        this.doDigitCollapses = true;
        this.doDowncasing = false;
        this.doTags = useTags;
        this.doPhrases = usePhrases;
    }

    /*
     * Unable to fully structure code
     */
    @Override
    public Instance pipe(Instance carrier) {
        sentenceLines = (String)carrier.getData();
        tokens = sentenceLines.split("\n");
        target = new LabelSequence((LabelAlphabet)this.getTargetAlphabet(), tokens.length);
        ending = new boolean[3][ConllNer2003Sentence2TokenSequence.endings.length];
        endingp1 = new boolean[3][ConllNer2003Sentence2TokenSequence.endings.length];
        endingp2 = new boolean[3][ConllNer2003Sentence2TokenSequence.endings.length];
        source = this.saveSource != false ? new StringBuffer() : null;
        data = new StringTokenization(source);
        prevLabel = "NOLABEL";
        ipattern = Pattern.compile("I-.*");
        tag = null;
        phrase = null;
        label = null;
        for (i = 0; i < tokens.length; ++i) {
            if (tokens[i].length() != 0) {
                try {
                    features = tokens[i].split(" ");
                    fieldIdx = 0;
                    word = features[fieldIdx++];
                    if (this.doTags) {
                        tag = features[fieldIdx++];
                    }
                    if (this.doPhrases) {
                        phrase = features[fieldIdx++];
                    }
                    if (!this.isTargetProcessing()) ** GOTO lbl33
                    label = features[fieldIdx++];
                }
                catch (ArrayIndexOutOfBoundsException e) {
                    throw new IllegalArgumentException("Invalid line " + tokens[i] + " : expected word " + (this.doTags != false ? ", tag" : "") + (this.doPhrases != false ? ", phrase" : "") + (this.isTargetProcessing() != false ? ", target" : "") + ".");
                }
            } else {
                word = "-<S>-";
                tag = "-<S>-";
                phrase = "-<S>-";
                label = "O";
            }
lbl33:
            // 3 sources

            if (this.doDigitCollapses) {
                if (word.matches("19\\d\\d")) {
                    word = "<YEAR>";
                } else if (word.matches("19\\d\\ds")) {
                    word = "<YEARDECADE>";
                } else if (word.matches("19\\d\\d-\\d+")) {
                    word = "<YEARSPAN>";
                } else if (word.matches("\\d+\\\\/\\d")) {
                    word = "<FRACTION>";
                } else if (word.matches("\\d[\\d,\\.]*")) {
                    word = "<DIGITS>";
                } else if (word.matches("19\\d\\d-\\d\\d-\\d--d")) {
                    word = "<DATELINEDATE>";
                } else if (word.matches("19\\d\\d-\\d\\d-\\d\\d")) {
                    word = "<DATELINEDATE>";
                } else if (word.matches(".*-led")) {
                    word = "<LED>";
                } else if (word.matches(".*-sponsored")) {
                    word = "<LED>";
                }
            }
            if (this.doDowncasing) {
                word = word.toLowerCase();
            }
            start = source.length();
            if (this.saveSource) {
                if (word.equals("-<S>-")) {
                    source.append("\n\n");
                }
                source.append(word);
                source.append(" ");
            }
            token = new StringSpan(source, start, source.length() - 1);
            if (this.doSpelling) {
                for (j = 0; j < ConllNer2003Sentence2TokenSequence.endings.length; ++j) {
                    ending[2][j] = ending[1][j];
                    ending[1][j] = ending[0][j];
                    ending[0][j] = ConllNer2003Sentence2TokenSequence.endingPatterns[j].matcher(word).matches();
                    if (!ending[0][j]) continue;
                    token.setFeatureValue(ConllNer2003Sentence2TokenSequence.endingNames[0][0][j], 1.0);
                }
            }
            if (this.doTags) {
                token.setFeatureValue("T=" + tag, 1.0);
            }
            if (this.doPhrases) {
                token.setFeatureValue("P=" + phrase, 1.0);
            }
            data.add(token);
            if (!this.isTargetProcessing()) continue;
            oldLabel = label;
            if (ipattern.matcher(label).matches() && (prevLabel.length() < 3 || !prevLabel.substring(2).equals(label.substring(2)))) {
                label = "B" + oldLabel.substring(1);
            }
            prevLabel = oldLabel;
            target.add(label);
        }
        carrier.setData(data);
        if (this.isTargetProcessing()) {
            carrier.setTarget(target);
        }
        if (this.saveSource) {
            carrier.setSource(source);
        }
        return carrier;
    }
}

