/*
 * Decompiled with CFR 0.152.
 */
package edu.stanford.nlp.pipeline;

import edu.stanford.nlp.international.french.process.FrenchTokenizer;
import edu.stanford.nlp.international.spanish.process.SpanishTokenizer;
import edu.stanford.nlp.ling.CoreAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.Annotator;
import edu.stanford.nlp.pipeline.ArabicSegmenterAnnotator;
import edu.stanford.nlp.pipeline.ChineseSegmenterAnnotator;
import edu.stanford.nlp.pipeline.LanguageInfo;
import edu.stanford.nlp.process.CoreLabelTokenFactory;
import edu.stanford.nlp.process.PTBTokenizer;
import edu.stanford.nlp.process.Tokenizer;
import edu.stanford.nlp.process.TokenizerFactory;
import edu.stanford.nlp.process.WhitespaceTokenizer;
import edu.stanford.nlp.process.WordToSentenceProcessor;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.PropertiesUtils;
import edu.stanford.nlp.util.logging.Redwood;
import java.io.Reader;
import java.io.StringReader;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Properties;
import java.util.Set;

public class TokenizerAnnotator
implements Annotator {
    private static final Redwood.RedwoodChannels log = Redwood.channels(TokenizerAnnotator.class);
    public static final String EOL_PROPERTY = "tokenize.keepeol";
    private final boolean VERBOSE;
    private final TokenizerFactory<CoreLabel> factory;
    private final boolean useSegmenter;
    private final Annotator segmenterAnnotator;

    /**
     * Creates an annotator with verbose output switched off by delegating to
     * {@link #TokenizerAnnotator(boolean)} with {@code false}.
     */
    public TokenizerAnnotator() {
        this(false);
    }

    /**
     * Works out which extra tokenizer option, if any, is implied by the sentence-splitter
     * settings found in {@code properties}.
     *
     * <p>Newline tokens are requested when {@code ssplit.eolonly} is true, or when the
     * {@code annotators} property mentions {@code ssplit}, {@code ssplit.isOneSentence} is not
     * true, and {@code ssplit.newlineIsSentenceBreak} (default {@code "never"}) resolves to
     * anything other than {@code NEVER}.
     *
     * @param properties pipeline properties; {@code null} is treated as "no settings"
     * @return {@code "tokenizeNLs,"} when newline tokens must be kept, otherwise {@code null}
     */
    private static String computeExtraOptions(Properties properties) {
        if (properties == null) {
            // The three-argument constructor tolerates null properties, so this helper,
            // which runs before it, must not fail on them either.
            return null;
        }
        boolean keepNewline = Boolean.parseBoolean(properties.getProperty("ssplit.eolonly", "false"));
        String annotators = properties.getProperty("annotators");
        // parseBoolean(null) is false, so a missing ssplit.isOneSentence counts as "not one sentence".
        if (annotators != null && annotators.contains("ssplit")
                && !Boolean.parseBoolean(properties.getProperty("ssplit.isOneSentence"))) {
            String nlsbString = properties.getProperty("ssplit.newlineIsSentenceBreak", "never");
            WordToSentenceProcessor.NewlineIsSentenceBreak nlsb = WordToSentenceProcessor.stringToNewlineIsSentenceBreak(nlsbString);
            if (nlsb != WordToSentenceProcessor.NewlineIsSentenceBreak.NEVER) {
                keepNewline = true;
            }
        }
        return keepNewline ? "tokenizeNLs," : null;
    }

    /**
     * Creates a non-verbose annotator from pipeline properties. The extra tokenizer options
     * are derived from the same properties via {@code computeExtraOptions}.
     *
     * @param properties properties used both to derive the extra options and to configure
     *                   the annotator
     */
    public TokenizerAnnotator(Properties properties) {
        this(false, properties, TokenizerAnnotator.computeExtraOptions(properties));
    }

    /**
     * Creates an annotator for {@link TokenizerType#English}.
     *
     * @param verbose verbosity flag forwarded to the delegated constructor
     */
    public TokenizerAnnotator(boolean verbose) {
        this(verbose, TokenizerType.English);
    }

    /**
     * Creates an annotator for the given language, passing {@code true} as the verbose flag
     * and no extra options.
     *
     * @param lang value used for the {@code tokenize.language} property
     */
    public TokenizerAnnotator(String lang) {
        this(true, lang, null);
    }

    /**
     * Creates an annotator for the given tokenizer type; the type's {@code toString()} value
     * is used as the language name.
     *
     * @param verbose verbosity flag forwarded to the delegated constructor
     * @param lang    tokenizer type whose string form names the language
     */
    public TokenizerAnnotator(boolean verbose, TokenizerType lang) {
        this(verbose, lang.toString());
    }

    /**
     * Creates an annotator for the given language with no extra options.
     *
     * @param verbose verbosity flag forwarded to the delegated constructor
     * @param lang    value used for the {@code tokenize.language} property
     */
    public TokenizerAnnotator(boolean verbose, String lang) {
        this(verbose, lang, null);
    }

    /**
     * Creates an annotator for the given language. A non-null {@code lang} is wrapped into a
     * property set under the {@code tokenize.language} key; a null {@code lang} is forwarded
     * as null properties.
     *
     * @param verbose verbosity flag forwarded to the delegated constructor
     * @param lang    value for the {@code tokenize.language} property, or {@code null}
     * @param options extra tokenizer options forwarded unchanged
     */
    public TokenizerAnnotator(boolean verbose, String lang, String options) {
        this(verbose, lang == null ? null : PropertiesUtils.asProperties("tokenize.language", lang), options);
    }

    /**
     * Creates an annotator from properties with no extra options.
     *
     * @param verbose verbosity flag forwarded to the delegated constructor
     * @param props   properties forwarded to the delegated constructor
     */
    public TokenizerAnnotator(boolean verbose, Properties props) {
        this(verbose, props, null);
    }

    /**
     * Main constructor; every other constructor ends up here.
     *
     * <p>If {@code tokenize.language} is set and {@link LanguageInfo#isSegmenterLanguage}
     * accepts it, tokenization is handed to a segmenter annotator (Arabic or Chinese).
     * Otherwise no segmenter is used. In both cases the tokenizer type is resolved from the
     * properties and passed to {@code initFactory}.
     *
     * @param verbose default for the {@code tokenize.verbose} property
     * @param props   configuration; {@code null} is replaced by an empty property set
     * @param options extra tokenizer options passed on to {@code initFactory}; may be {@code null}
     * @throws RuntimeException if the language is a segmenter language that is neither
     *                          Arabic nor Chinese
     */
    public TokenizerAnnotator(boolean verbose, Properties props, String options) {
        if (props == null) {
            props = new Properties();
        }
        String languageName = props.getProperty("tokenize.language");
        if (languageName == null || !LanguageInfo.isSegmenterLanguage(languageName)) {
            this.useSegmenter = false;
            this.segmenterAnnotator = null;
        } else {
            LanguageInfo.HumanLanguage language = LanguageInfo.getLanguageFromString(languageName);
            if (language == LanguageInfo.HumanLanguage.ARABIC) {
                this.segmenterAnnotator = new ArabicSegmenterAnnotator("segment", props);
            } else if (language == LanguageInfo.HumanLanguage.CHINESE) {
                this.segmenterAnnotator = new ChineseSegmenterAnnotator("segment", props);
            } else {
                throw new RuntimeException("No segmenter implemented for: " + language);
            }
            this.useSegmenter = true;
        }
        this.VERBOSE = PropertiesUtils.getBool(props, "tokenize.verbose", verbose);
        TokenizerType tokenizerType = TokenizerType.getTokenizerType(props);
        this.factory = TokenizerAnnotator.initFactory(tokenizerType, props, options);
    }

    /**
     * Creates the tokenizer factory for the requested tokenizer type.
     *
     * <p>The option string comes from {@code tokenize.options}, falling back to the type's
     * default options. When {@code extraOptions} is given it is placed in front, separated
     * by a comma unless it already ends with one.
     *
     * @param type         the tokenizer type to build a factory for
     * @param props        properties supplying {@code tokenize.options} and, for the whitespace
     *                     tokenizer, the end-of-line settings
     * @param extraOptions options to prepend, or {@code null} for none
     * @return the factory, or {@code null} for {@code Arabic} and {@code Chinese}
     * @throws IllegalArgumentException if the type is not handled by any case
     */
    private static TokenizerFactory<CoreLabel> initFactory(TokenizerType type, Properties props, String extraOptions) throws IllegalArgumentException {
        String configured = props.getProperty("tokenize.options", null);
        if (configured == null) {
            configured = type.getDefaultOptions();
        }
        final String options;
        if (extraOptions == null) {
            options = configured;
        } else if (extraOptions.endsWith(",")) {
            options = extraOptions + configured;
        } else {
            options = extraOptions + ',' + configured;
        }
        switch (type) {
            case Arabic:
            case Chinese:
                // No tokenizer factory is created for these two types.
                return null;
            case Spanish:
                return SpanishTokenizer.factory(new CoreLabelTokenFactory(), options);
            case French:
                return FrenchTokenizer.factory(new CoreLabelTokenFactory(), options);
            case Whitespace: {
                // Either property is enough to make line ends significant.
                boolean eolIsSignificant = Boolean.parseBoolean(props.getProperty(EOL_PROPERTY, "false"))
                        || Boolean.parseBoolean(props.getProperty("ssplit.eolonly", "false"));
                return new WhitespaceTokenizer.WhitespaceTokenizerFactory<CoreLabel>(new CoreLabelTokenFactory(), eolIsSignificant);
            }
            case English:
            case German:
                return PTBTokenizer.factory(new CoreLabelTokenFactory(), options);
            case Unspecified:
                log.info("No tokenizer type provided. Defaulting to PTBTokenizer.");
                return PTBTokenizer.factory(new CoreLabelTokenFactory(), options);
            default:
                throw new IllegalArgumentException("No valid tokenizer type provided.\nUse -tokenize.language, -tokenize.class, or -tokenize.whitespace \nto specify a tokenizer.");
        }
    }

    /**
     * Returns a tokenizer that reads from the given reader, built by this annotator's factory.
     *
     * @param r the source of the text to tokenize
     * @return a tokenizer over {@code r}
     * @throws IllegalStateException if this annotator has no tokenizer factory, which is the
     *                               case for the Arabic and Chinese tokenizer types
     */
    public Tokenizer<CoreLabel> getTokenizer(Reader r) {
        if (this.factory == null) {
            // initFactory returns null for Arabic/Chinese; fail with context rather than a bare NPE.
            throw new IllegalStateException("TokenizerAnnotator has no tokenizer factory for this configuration (Arabic and Chinese are handled by a segmenter annotator).");
        }
        return this.factory.getTokenizer(r);
    }

    /**
     * Tokenizes the text of the annotation and stores the tokens under
     * {@code CoreAnnotations.TokensAnnotation}. When a segmenter is in use, the whole job is
     * handed to the segmenter annotator instead.
     *
     * @param annotation the annotation to tokenize
     * @throws RuntimeException if no segmenter is used and the annotation has no
     *                          {@code CoreAnnotations.TextAnnotation}
     */
    @Override
    public void annotate(Annotation annotation) {
        if (this.VERBOSE) {
            log.info("Tokenizing ... ");
        }
        if (this.useSegmenter) {
            this.segmenterAnnotator.annotate(annotation);
            return;
        }
        if (!annotation.containsKey(CoreAnnotations.TextAnnotation.class)) {
            throw new RuntimeException("Tokenizer unable to find text in annotation: " + annotation);
        }
        String text = (String)annotation.get(CoreAnnotations.TextAnnotation.class);
        List<CoreLabel> tokens = this.getTokenizer(new StringReader(text)).tokenize();
        annotation.set(CoreAnnotations.TokensAnnotation.class, tokens);
        if (this.VERBOSE) {
            log.info("done.");
            log.info("Tokens: " + annotation.get(CoreAnnotations.TokensAnnotation.class));
        }
    }

    /**
     * Reports the annotations this annotator needs from earlier annotators.
     *
     * @return an empty set
     */
    @Override
    public Set<Class<? extends CoreAnnotation>> requires() {
        return Collections.<Class<? extends CoreAnnotation>>emptySet();
    }

    /**
     * Lists the annotation keys this annotator declares as satisfied.
     *
     * @return a fresh, mutable set of the twelve annotation classes listed in the body
     */
    @Override
    public Set<Class<? extends CoreAnnotation>> requirementsSatisfied() {
        List<Class<? extends CoreAnnotation>> provided = Arrays.<Class<? extends CoreAnnotation>>asList(
                CoreAnnotations.TextAnnotation.class,
                CoreAnnotations.TokensAnnotation.class,
                CoreAnnotations.CharacterOffsetBeginAnnotation.class,
                CoreAnnotations.CharacterOffsetEndAnnotation.class,
                CoreAnnotations.BeforeAnnotation.class,
                CoreAnnotations.AfterAnnotation.class,
                CoreAnnotations.TokenBeginAnnotation.class,
                CoreAnnotations.TokenEndAnnotation.class,
                CoreAnnotations.PositionAnnotation.class,
                CoreAnnotations.IndexAnnotation.class,
                CoreAnnotations.OriginalTextAnnotation.class,
                CoreAnnotations.ValueAnnotation.class);
        return new HashSet<Class<? extends CoreAnnotation>>(provided);
    }

    /**
     * The tokenizer types this annotator can be configured with. Each constant carries an
     * optional language abbreviation, an optional tokenizer class name, and the default
     * option string used when {@code tokenize.options} is not set.
     *
     * <p>Lookups by language name, abbreviation, or class name are case-insensitive. Case is
     * folded with {@link Locale#ROOT} so that the result does not depend on the JVM's default
     * locale (a Turkish default locale, for example, upper-cases {@code i} to a dotted
     * {@code İ}, which would make {@code "ENGLISH"} unresolvable).
     */
    public static enum TokenizerType {
        Unspecified(null, null, "invertible,ptb3Escaping=true"),
        Arabic("ar", null, ""),
        Chinese("zh", null, ""),
        Spanish("es", "SpanishTokenizer", "invertible,ptb3Escaping=true,splitAll=true"),
        English("en", "PTBTokenizer", "invertible,ptb3Escaping=true"),
        German("de", null, "invertible,ptb3Escaping=true"),
        French("fr", "FrenchTokenizer", ""),
        Whitespace(null, "WhitespaceTokenizer", "");

        /** Short language code such as {@code "en"}, or {@code null} if the type has none. */
        private final String abbreviation;
        /** Tokenizer class name accepted by {@code tokenize.class}, or {@code null}. */
        private final String className;
        /** Option string used when no {@code tokenize.options} property is given. */
        private final String defaultOptions;
        /** Upper-cased constant names and abbreviations mapped to their type. */
        private static final Map<String, TokenizerType> nameToTokenizerMap = TokenizerType.initializeNameMap();
        /** Upper-cased tokenizer class names mapped to their type. */
        private static final Map<String, TokenizerType> classToTokenizerMap = TokenizerType.initializeClassMap();

        private TokenizerType(String abbreviation, String className, String defaultOptions) {
            this.abbreviation = abbreviation;
            this.className = className;
            this.defaultOptions = defaultOptions;
        }

        /**
         * @return the option string used for this type when none is configured
         */
        public String getDefaultOptions() {
            return this.defaultOptions;
        }

        /** Folds a lookup key to upper case independently of the default locale. */
        private static String normalizeKey(String key) {
            return key.toUpperCase(Locale.ROOT);
        }

        /** Builds the unmodifiable lookup from constant name and abbreviation to type. */
        private static Map<String, TokenizerType> initializeNameMap() {
            Map<String, TokenizerType> map = Generics.newHashMap();
            for (TokenizerType type : TokenizerType.values()) {
                if (type.abbreviation != null) {
                    map.put(TokenizerType.normalizeKey(type.abbreviation), type);
                }
                map.put(TokenizerType.normalizeKey(type.toString()), type);
            }
            return Collections.unmodifiableMap(map);
        }

        /** Builds the unmodifiable lookup from tokenizer class name to type. */
        private static Map<String, TokenizerType> initializeClassMap() {
            Map<String, TokenizerType> map = Generics.newHashMap();
            for (TokenizerType type : TokenizerType.values()) {
                if (type.className != null) {
                    map.put(TokenizerType.normalizeKey(type.className), type);
                }
            }
            return Collections.unmodifiableMap(map);
        }

        /**
         * Resolves the tokenizer type from configuration. Precedence is:
         * {@code tokenize.whitespace=true}, then {@code tokenize.class}, then
         * {@code tokenize.language}; with none of them set the result is {@link #Unspecified}.
         *
         * @param props the properties to read
         * @return the matching tokenizer type
         * @throws IllegalArgumentException if {@code tokenize.class} or
         *                                  {@code tokenize.language} names no known type
         */
        public static TokenizerType getTokenizerType(Properties props) {
            String tokClass = props.getProperty("tokenize.class", null);
            boolean whitespace = Boolean.parseBoolean(props.getProperty("tokenize.whitespace", "false"));
            String language = props.getProperty("tokenize.language", null);
            if (whitespace) {
                return Whitespace;
            }
            if (tokClass != null) {
                TokenizerType type = classToTokenizerMap.get(TokenizerType.normalizeKey(tokClass));
                if (type == null) {
                    throw new IllegalArgumentException("TokenizerAnnotator: unknown tokenize.class property " + tokClass);
                }
                return type;
            }
            if (language != null) {
                TokenizerType type = nameToTokenizerMap.get(TokenizerType.normalizeKey(language));
                if (type == null) {
                    throw new IllegalArgumentException("TokenizerAnnotator: unknown tokenize.language property " + language);
                }
                return type;
            }
            return Unspecified;
        }
    }
}

