/*
 * Decompiled with CFR 0.152.
 */
package opennlp.uima.tokenize;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.Reader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.LinkedList;
import java.util.List;
import opennlp.maxent.GIS;
import opennlp.tools.tokenize.TokenSample;
import opennlp.tools.tokenize.TokenSampleStream;
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.ObjectStreamUtils;
import opennlp.tools.util.PlainTextByLineStream;
import opennlp.tools.util.Span;
import opennlp.tools.util.model.BaseModel;
import opennlp.uima.util.CasConsumerUtil;
import opennlp.uima.util.ContainingConstraint;
import opennlp.uima.util.OpennlpUtil;
import opennlp.uima.util.SampleTraceStream;
import opennlp.uima.util.UimaUtil;
import org.apache.uima.UimaContext;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.FSIterator;
import org.apache.uima.cas.FSMatchConstraint;
import org.apache.uima.cas.Type;
import org.apache.uima.cas.TypeSystem;
import org.apache.uima.cas.text.AnnotationFS;
import org.apache.uima.cas.text.AnnotationIndex;
import org.apache.uima.collection.CasConsumer_ImplBase;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.resource.ResourceProcessException;
import org.apache.uima.util.Level;
import org.apache.uima.util.Logger;
import org.apache.uima.util.ProcessTrace;

/**
 * UIMA CAS consumer that trains an OpenNLP tokenizer model.
 *
 * <p>For every sentence annotation found in a processed CAS, the token
 * annotations selected by a {@link ContainingConstraint} built from that
 * sentence are converted into a {@link TokenSample}. When the collection has
 * been processed, the collected samples (optionally combined with samples read
 * from an additional training data file) are used to train a
 * {@link TokenizerModel}, which is then serialized into the resource manager's
 * data path under the configured model name.
 *
 * <p>The consumer accumulates samples between calls and therefore reports
 * itself as not stateless (see {@link #isStateless()}).
 */
public final class TokenizerTrainer extends CasConsumer_ImplBase {
    /** Name of the optional boolean parameter handed to the trainer as the alpha-numeric optimization flag. */
    public static final String IS_ALPHA_NUMERIC_OPTIMIZATION = "opennlp.uima.tokenizer.IsAlphaNumericOptimization";

    /** Samples collected from the processed CASes; consumed by {@link #collectionProcessComplete(ProcessTrace)}. */
    private List<TokenSample> tokenSamples = new ArrayList<TokenSample>();
    private UimaContext mContext;
    private Type mSentenceType;
    private Type mTokenType;
    private String mModelName;
    private String additionalTrainingDataFile;
    private String additionalTrainingDataEncoding;
    private String language;
    private Boolean isSkipAlphaNumerics;
    private Logger mLogger;
    private String sampleTraceFileEncoding;
    private File sampleTraceFile;

    /**
     * Reads the configuration parameters from the UIMA context.
     *
     * <p>The model name and the language are required. The additional training
     * data file and the sample trace file are optional, but when either one is
     * configured its encoding parameter becomes required.
     *
     * @throws ResourceInitializationException declared by the overridden method;
     *         propagated from the super class or the parameter lookups
     */
    public void initialize() throws ResourceInitializationException {
        super.initialize();
        this.mContext = this.getUimaContext();
        this.mLogger = this.mContext.getLogger();
        if (this.mLogger.isLoggable(Level.INFO)) {
            this.mLogger.log(Level.INFO, "Initializing the OpenNLP Tokenizer trainer.");
        }

        this.mModelName = CasConsumerUtil.getRequiredStringParameter(this.mContext, UimaUtil.MODEL_PARAMETER);
        this.language = CasConsumerUtil.getRequiredStringParameter(this.mContext, "opennlp.uima.Language");

        // The optimization flag is optional; an absent value is treated as false.
        this.isSkipAlphaNumerics = CasConsumerUtil.getOptionalBooleanParameter(this.mContext, IS_ALPHA_NUMERIC_OPTIMIZATION);
        if (this.isSkipAlphaNumerics == null) {
            this.isSkipAlphaNumerics = false;
        }

        this.additionalTrainingDataFile = CasConsumerUtil.getOptionalStringParameter(this.mContext, "opennlp.uima.AdditionalTrainingDataFile");
        if (this.additionalTrainingDataFile != null) {
            this.additionalTrainingDataEncoding = CasConsumerUtil.getRequiredStringParameter(this.mContext, "opennlp.uima.AdditionalTrainingDataEncoding");
        }

        String sampleTraceFileName = CasConsumerUtil.getOptionalStringParameter(this.mContext, "opennlp.uima.SampleTraceFile");
        if (sampleTraceFileName != null) {
            // The trace file name is resolved against the resource manager's data path.
            this.sampleTraceFile = new File(this.getUimaContextAdmin().getResourceManager().getDataPath() + File.separatorChar + sampleTraceFileName);
            this.sampleTraceFileEncoding = CasConsumerUtil.getRequiredStringParameter(this.mContext, "opennlp.uima.SampleTraceFileEncoding");
        }
    }

    /**
     * Resolves the configured sentence and token type names against the type system.
     *
     * @param typeSystem the type system in which the configured type names are looked up
     * @throws ResourceInitializationException declared by the overridden method;
     *         propagated from the parameter or type lookups
     */
    public void typeSystemInit(TypeSystem typeSystem) throws ResourceInitializationException {
        String sentenceTypeName = CasConsumerUtil.getRequiredStringParameter(this.mContext, UimaUtil.SENTENCE_TYPE_PARAMETER);
        this.mSentenceType = CasConsumerUtil.getType(typeSystem, sentenceTypeName);

        String tokenTypeName = CasConsumerUtil.getRequiredStringParameter(this.mContext, "opennlp.uima.TokenType");
        this.mTokenType = CasConsumerUtil.getType(typeSystem, tokenTypeName);
    }

    /**
     * Collects one token sample per sentence annotation of the given CAS.
     *
     * @param cas the CAS whose sentence and token annotations are turned into samples
     */
    public void processCas(CAS cas) {
        AnnotationIndex<AnnotationFS> sentenceAnnotations = cas.getAnnotationIndex(this.mSentenceType);
        for (AnnotationFS sentence : sentenceAnnotations) {
            this.process(cas, sentence);
        }
    }

    /**
     * Builds a {@link TokenSample} for a single sentence and appends it to the collected samples.
     *
     * @param tcas the CAS the sentence belongs to
     * @param sentence the sentence annotation whose tokens are collected
     */
    private void process(CAS tcas, AnnotationFS sentence) {
        AnnotationIndex<AnnotationFS> allTokens = tcas.getAnnotationIndex(this.mTokenType);
        ContainingConstraint containingConstraint = new ContainingConstraint(sentence);
        FSIterator<AnnotationFS> containingTokens = tcas.createFilteredIterator(allTokens.iterator(), containingConstraint);

        List<Span> openNLPSpans = new ArrayList<Span>();
        int sentenceBegin = sentence.getBegin();
        while (containingTokens.hasNext()) {
            AnnotationFS tokenAnnotation = containingTokens.next();
            // Token offsets are document based; the sample text is the sentence
            // text, so the spans are shifted to be relative to the sentence start.
            openNLPSpans.add(new Span(tokenAnnotation.getBegin() - sentenceBegin, tokenAnnotation.getEnd() - sentenceBegin));
        }

        Span[] spans = openNLPSpans.toArray(new Span[openNLPSpans.size()]);
        Arrays.sort(spans);
        this.tokenSamples.add(new TokenSample(sentence.getCoveredText(), spans));
    }

    /**
     * Trains the tokenizer model from the collected samples and writes it to
     * the data path under the configured model name.
     *
     * <p>If an additional training data file is configured, its samples are
     * appended to the collected ones. If a sample trace file is configured, the
     * sample stream is wrapped in a {@link SampleTraceStream} writing to that
     * file. Both files are closed before this method returns, whether or not
     * training succeeds.
     *
     * @param arg0 the process trace; not used
     * @throws ResourceProcessException declared by the overridden method
     * @throws IOException if reading the training data, writing the trace file,
     *         or writing the model fails
     */
    public void collectionProcessComplete(ProcessTrace arg0) throws ResourceProcessException, IOException {
        if (this.mLogger.isLoggable(Level.INFO)) {
            this.mLogger.log(Level.INFO, "Collected " + this.tokenSamples.size() + " token samples.");
        }

        // Static switch on the maxent trainer; turned off before training starts.
        GIS.PRINT_MESSAGES = false;

        ObjectStream<TokenSample> samples = ObjectStreamUtils.createObjectStream(this.tokenSamples);
        InputStream additionalTrainingDataIn = null;
        OutputStream sampleTraceOut = null;
        OutputStreamWriter samplesOut = null;
        TokenizerModel tokenModel;
        try {
            if (this.additionalTrainingDataFile != null) {
                if (this.mLogger.isLoggable(Level.INFO)) {
                    this.mLogger.log(Level.INFO, "Using addional training data file: " + this.additionalTrainingDataFile);
                }
                additionalTrainingDataIn = new FileInputStream(this.additionalTrainingDataFile);
                ObjectStream<TokenSample> additionalSamples = new TokenSampleStream(
                        new PlainTextByLineStream(new InputStreamReader(additionalTrainingDataIn, this.additionalTrainingDataEncoding)));
                samples = ObjectStreamUtils.createObjectStream(samples, additionalSamples);
            }
            if (this.sampleTraceFile != null) {
                // The raw stream is tracked separately so it can still be closed
                // if creating the writer fails (e.g. unsupported encoding).
                sampleTraceOut = new FileOutputStream(this.sampleTraceFile);
                samplesOut = new OutputStreamWriter(sampleTraceOut, this.sampleTraceFileEncoding);
                samples = new SampleTraceStream<TokenSample>(samples, samplesOut);
            }
            tokenModel = TokenizerME.train(this.language, samples, this.isSkipAlphaNumerics);
        }
        finally {
            try {
                if (additionalTrainingDataIn != null) {
                    additionalTrainingDataIn.close();
                }
            }
            finally {
                // Closing the writer flushes buffered trace output and closes
                // the underlying file stream.
                if (samplesOut != null) {
                    samplesOut.close();
                } else if (sampleTraceOut != null) {
                    sampleTraceOut.close();
                }
            }
        }

        // Release the trained-on samples for garbage collection. A fresh list
        // (rather than null) keeps processCas usable if it is called again.
        this.tokenSamples = new ArrayList<TokenSample>();

        File modelFile = new File(this.getUimaContextAdmin().getResourceManager().getDataPath() + File.separatorChar + this.mModelName);
        OpennlpUtil.serialize((BaseModel)tokenModel, modelFile);
    }

    /**
     * Reports that this consumer keeps state between calls.
     *
     * @return always {@code false}
     */
    public boolean isStateless() {
        return false;
    }

    /**
     * Drops the reference to the collected samples.
     */
    public void destroy() {
        this.tokenSamples = null;
    }
}

