/*
 * Decompiled with CFR 0.152.
 */
package edu.stanford.nlp.pipeline;

import edu.stanford.nlp.ie.AbstractSequenceClassifier;
import edu.stanford.nlp.ie.crf.CRFClassifier;
import edu.stanford.nlp.ling.CoreAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.SegmenterCoreAnnotations;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.Annotator;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.PropertiesUtils;
import edu.stanford.nlp.util.logging.Redwood;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Properties;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Annotator that turns raw (Chinese) text into word tokens using a CRF-based segmenter.
 *
 * <p>For each sentence (or for the whole annotation when no sentences are present) the text is
 * first split into one {@link CoreLabel} per Unicode code point, stored under
 * {@code SegmenterCoreAnnotations.CharactersAnnotation}. The segmenter is then run on the text and
 * the resulting words are mapped back onto those characters to produce
 * {@code CoreAnnotations.TokensAnnotation} with begin/end character offsets. Spans that match
 * {@link #xmlPattern} are collected into a single token per tag instead of being emitted word by word.
 */
public class ChineseSegmenterAnnotator implements Annotator {
    private static final Redwood.RedwoodChannels log = Redwood.channels(ChineseSegmenterAnnotator.class);
    private static final String DEFAULT_MODEL_NAME = "segment";
    // NOTE(review): the default locations below are absolute filesystem paths; they will only
    // resolve on machines that actually have these files.
    private static final String DEFAULT_SEG_LOC = "/u/nlp/data/gale/segtool/stanford-seg/classifiers-2010/05202008-ctb6.processed-chris6.lex.gz";
    private static final String DEFAULT_SER_DICTIONARY = "/u/nlp/data/gale/segtool/stanford-seg/classifiers/dict-chris6.ser.gz";
    private static final String DEFAULT_SIGHAN_CORPORA_DICT = "/u/nlp/data/gale/segtool/stanford-seg/releasedata";
    private final AbstractSequenceClassifier<?> segmenter;
    private final boolean VERBOSE;
    /** When true, line breaks are kept as characters and emitted as their own tokens. */
    private final boolean tokenizeNewline;
    /** Matches a single XML-like tag (opening, closing, self-closing, or {@code <!...>}/{@code <?...>}). */
    private static final Pattern xmlPattern = Pattern.compile("<([!?][A-Za-z-][^>\r\n]*|[A-Za-z][A-Za-z0-9_:.-]*([ ]+([A-Za-z][A-Za-z0-9_:.-]*|[A-Za-z][A-Za-z0-9_:.-]*[ ]*=[ ]*('[^'\r\n]*'|\"[^\"\r\n]*\"|[A-Za-z][A-Za-z0-9_:.-]*)))*[ ]*/?|/[A-Za-z][A-Za-z0-9_:.-]*)[ ]*>");

    /** Creates a non-verbose annotator using the default model location. */
    public ChineseSegmenterAnnotator() {
        this(DEFAULT_SEG_LOC, false);
    }

    /**
     * Creates an annotator using the default model location.
     *
     * @param verbose whether to log progress messages
     */
    public ChineseSegmenterAnnotator(boolean verbose) {
        this(DEFAULT_SEG_LOC, verbose);
    }

    /**
     * Creates an annotator using the default dictionaries.
     *
     * @param segLoc location of the segmenter model
     * @param verbose whether to log progress messages
     */
    public ChineseSegmenterAnnotator(String segLoc, boolean verbose) {
        this(segLoc, verbose, DEFAULT_SER_DICTIONARY, DEFAULT_SIGHAN_CORPORA_DICT);
    }

    /**
     * Creates an annotator by packing the arguments into {@code segment.*} properties.
     *
     * @param segLoc location of the segmenter model ({@code segment.model})
     * @param verbose whether to log progress messages ({@code segment.verbose})
     * @param serDictionary value for {@code segment.serDictionary}
     * @param sighanCorporaDict value for {@code segment.sighanCorporaDict}
     */
    public ChineseSegmenterAnnotator(String segLoc, boolean verbose, String serDictionary, String sighanCorporaDict) {
        this(DEFAULT_MODEL_NAME, PropertiesUtils.asProperties("segment.serDictionary", serDictionary, "segment.sighanCorporaDict", sighanCorporaDict, "segment.verbose", Boolean.toString(verbose), "segment.model", segLoc));
    }

    /**
     * Creates an annotator from properties.
     *
     * <p>Every property whose key starts with {@code name + "."} is forwarded to the classifier with
     * that prefix stripped, except {@code name.model}, which gives the model location and is
     * required. Newline tokenization is enabled when {@code ssplit.newlineIsSentenceBreak} is set
     * to anything other than {@code "never"} or when {@code ssplit.eolonly} is true.
     *
     * @param name property prefix for this annotator
     * @param props configuration properties
     * @throws RuntimeException if {@code name.model} is missing or the classifier cannot be loaded
     */
    public ChineseSegmenterAnnotator(String name, Properties props) {
        String model = null;
        Properties modelProps = new Properties();
        String prefix = name + '.';
        for (String key : props.stringPropertyNames()) {
            if (!key.startsWith(prefix)) {
                continue;
            }
            String modelKey = key.substring(prefix.length());
            if (modelKey.equals("model")) {
                model = props.getProperty(key);
            } else {
                modelProps.setProperty(modelKey, props.getProperty(key));
            }
        }
        this.VERBOSE = PropertiesUtils.getBool(props, name + ".verbose", false);
        if (model == null) {
            throw new RuntimeException("Expected a property " + name + ".model");
        }
        if (this.VERBOSE) {
            log.info("Loading Segmentation Model ... ");
        }
        try {
            this.segmenter = CRFClassifier.getClassifier(model, modelProps);
        }
        catch (RuntimeException e) {
            throw e;
        }
        catch (Exception e) {
            // Wrap checked loading failures so the constructor stays unchecked, keeping the cause.
            throw new RuntimeException(e);
        }
        boolean newlineBreaksSentence = !props.getProperty("ssplit.newlineIsSentenceBreak", "never").equals("never");
        boolean eolOnly = Boolean.parseBoolean(props.getProperty("ssplit.eolonly", "false"));
        this.tokenizeNewline = newlineBreaksSentence || eolOnly;
    }

    /**
     * Segments every sentence in the annotation, or the annotation itself when it carries no
     * {@code SentencesAnnotation}.
     *
     * @param annotation the document to annotate; tokens are written onto each processed map
     */
    @Override
    public void annotate(Annotation annotation) {
        if (this.VERBOSE) {
            log.info("Adding Segmentation annotation ... ");
        }
        List<CoreMap> sentences = annotation.get(CoreAnnotations.SentencesAnnotation.class);
        if (sentences == null) {
            this.doOneSentence(annotation);
            return;
        }
        for (CoreMap sentence : sentences) {
            this.doOneSentence(sentence);
        }
    }

    /** Runs both phases (character splitting, then segmentation) on a single text-bearing map. */
    private void doOneSentence(CoreMap annotation) {
        this.splitCharacters(annotation);
        this.runSegmentation(annotation);
    }

    /**
     * Splits the map's text into one {@link CoreLabel} per code point and stores the list under
     * {@code SegmenterCoreAnnotations.CharactersAnnotation}.
     *
     * <p>Each label records the character, its begin/end offsets (in UTF-16 units of the original
     * text), a {@code ChineseSegAnnotation} of "1" or "0", and an {@code XMLCharAnnotation} of
     * "0" (not in a tag), "beginning" (first character of a tag), "whitespace" (whitespace or
     * control character inside a tag) or "1" (any other character inside a tag). Whitespace and
     * control characters outside tags are dropped, except line-break characters when newline
     * tokenization is enabled.
     */
    private void splitCharacters(CoreMap annotation) {
        String origText = annotation.get(CoreAnnotations.TextAnnotation.class);
        List<CoreLabel> charTokens = new ArrayList<>();
        int length = origText.length();

        // Offsets of the XML tag currently being tracked; the sentinels mean "no tag found".
        int xmlStartOffset = Integer.MAX_VALUE;
        int xmlEndOffset = -1;
        Matcher m = xmlPattern.matcher(origText);
        if (m.find()) {
            xmlStartOffset = m.start();
            xmlEndOffset = m.end();
        }

        boolean seg = true;
        int cpCharCount;
        for (int offset = 0; offset < length; offset += cpCharCount) {
            int cp = origText.codePointAt(offset);
            cpCharCount = Character.charCount(cp);
            String charString = origText.substring(offset, offset + cpCharCount);

            // Just walked past the current tag: look for the next one from here.
            // Matcher.find(int) resets the matcher, so the existing instance can be reused.
            if (offset == xmlEndOffset && m.find(offset)) {
                xmlStartOffset = m.start();
                xmlEndOffset = m.end();
            }

            boolean whitespaceOrControl = Character.isSpaceChar(cp) || Character.isISOControl(cp);
            boolean skipCharacter = false;
            boolean isXMLCharacter = false;
            if (offset == xmlStartOffset) {
                seg = true;
                isXMLCharacter = true;
            } else if (offset > xmlStartOffset && offset < xmlEndOffset) {
                seg = false;
                isXMLCharacter = true;
            } else if (whitespaceOrControl) {
                seg = true;
                boolean isLineBreak = System.lineSeparator().indexOf(charString) >= 0 || charString.equals("\n");
                skipCharacter = !this.tokenizeNewline || !isLineBreak;
            }
            if (skipCharacter) {
                continue;
            }

            CoreLabel wi = new CoreLabel();
            wi.set(CoreAnnotations.ChineseCharAnnotation.class, charString);
            wi.set(CoreAnnotations.ChineseSegAnnotation.class, seg ? "1" : "0");

            String xmlState;
            if (!isXMLCharacter) {
                xmlState = "0";
            } else if (whitespaceOrControl) {
                xmlState = "whitespace";
            } else if (offset == xmlStartOffset) {
                xmlState = "beginning";
            } else {
                xmlState = "1";
            }
            wi.set(SegmenterCoreAnnotations.XMLCharAnnotation.class, xmlState);

            wi.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, offset);
            wi.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, offset + cpCharCount);
            charTokens.add(wi);
            seg = false;
        }
        annotation.set(SegmenterCoreAnnotations.CharactersAnnotation.class, charTokens);
    }

    /**
     * Runs the segmenter on the map's text and aligns the resulting words with the characters
     * produced by {@link #splitCharacters(CoreMap)}, writing the tokens to
     * {@code CoreAnnotations.TokensAnnotation}.
     *
     * <p>Words that fall on XML-tag characters are concatenated (with a single space for each
     * in-tag whitespace character) and emitted as one token per tag.
     */
    private void runSegmentation(CoreMap annotation) {
        String text = annotation.get(CoreAnnotations.TextAnnotation.class);
        List<CoreLabel> sentChars = annotation.get(SegmenterCoreAnnotations.CharactersAnnotation.class);
        List<CoreLabel> tokens = new ArrayList<>();
        annotation.set(CoreAnnotations.TokensAnnotation.class, tokens);

        List<String> words;
        if (!this.tokenizeNewline) {
            text = text.replaceAll("[\r\n]", "");
            words = this.segmenter.segmentString(text);
        } else {
            // Split around line breaks, keeping the breaks themselves as separate pieces.
            String[] lines = text.split(String.format("((?<=%1$s)|(?=%1$s)|(?<=\n)|(?=\n))", System.lineSeparator()));
            words = new ArrayList<>();
            for (String line : lines) {
                if (line.equals(System.lineSeparator()) || line.equals("\n")) {
                    words.add(line);
                } else {
                    words.addAll(this.segmenter.segmentString(line));
                }
            }
        }
        if (this.VERBOSE) {
            log.info(text + "--->" + words);
        }

        int pos = 0;
        StringBuilder xmlbuffer = new StringBuilder();
        int xmlbegin = -1;
        for (String w : words) {
            CoreLabel fl = sentChars.get(pos);
            String xmlState = fl.get(SegmenterCoreAnnotations.XMLCharAnnotation.class);

            // Reaching plain text or the start of a new tag closes any tag being accumulated.
            if ((xmlState.equals("0") || xmlState.equals("beginning")) && xmlbuffer.length() > 0) {
                Integer xmlEnd = sentChars.get(pos - 1).get(CoreAnnotations.CharacterOffsetEndAnnotation.class);
                tokens.add(makeToken(xmlbuffer.toString(), xmlbegin, xmlEnd));
                xmlbegin = -1;
                xmlbuffer = new StringBuilder();
            }

            // sentChars holds one entry per code point, so advance by code points rather than
            // UTF-16 units; otherwise supplementary characters misalign every later token.
            int wordChars = w.codePointCount(0, w.length());

            if (!xmlState.equals("0")) {
                while (fl.get(SegmenterCoreAnnotations.XMLCharAnnotation.class).equals("whitespace")) {
                    xmlbuffer.append(" ");
                    fl = sentChars.get(++pos);
                }
                xmlbuffer.append(w);
                pos += wordChars;
                if (xmlbegin < 0) {
                    xmlbegin = fl.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);
                }
                continue;
            }

            fl.set(CoreAnnotations.ChineseSegAnnotation.class, "1");
            if (w.isEmpty()) {
                continue;
            }
            Integer begin = fl.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);
            pos += wordChars;
            Integer end = sentChars.get(pos - 1).get(CoreAnnotations.CharacterOffsetEndAnnotation.class);
            tokens.add(makeToken(w, begin, end));
        }

        // Flush a tag that runs to the end of the text.
        if (xmlbuffer.length() > 0) {
            Integer xmlEnd = sentChars.get(pos - 1).get(CoreAnnotations.CharacterOffsetEndAnnotation.class);
            tokens.add(makeToken(xmlbuffer.toString(), xmlbegin, xmlEnd));
        }
    }

    /** Builds a token whose word and value are {@code word}, spanning the given character offsets. */
    private static CoreLabel makeToken(String word, Integer begin, Integer end) {
        CoreLabel token = new CoreLabel();
        token.setWord(word);
        token.setValue(word);
        token.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, begin);
        token.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, end);
        return token;
    }

    /** Returns the empty set: this annotator declares no prerequisite annotations. */
    @Override
    public Set<Class<? extends CoreAnnotation>> requires() {
        return Collections.emptySet();
    }

    /** Returns the annotation classes this annotator declares as satisfied. */
    @Override
    public Set<Class<? extends CoreAnnotation>> requirementsSatisfied() {
        return new HashSet<Class<? extends CoreAnnotation>>(Arrays.asList(CoreAnnotations.TextAnnotation.class, CoreAnnotations.TokensAnnotation.class, CoreAnnotations.CharacterOffsetBeginAnnotation.class, CoreAnnotations.CharacterOffsetEndAnnotation.class, CoreAnnotations.BeforeAnnotation.class, CoreAnnotations.AfterAnnotation.class, CoreAnnotations.TokenBeginAnnotation.class, CoreAnnotations.TokenEndAnnotation.class, CoreAnnotations.PositionAnnotation.class, CoreAnnotations.IndexAnnotation.class, CoreAnnotations.OriginalTextAnnotation.class, CoreAnnotations.ValueAnnotation.class));
    }
}

