/*
 * Decompiled with CFR 0.152.
 */
package com.github.chen0040.data.text;

import com.github.chen0040.data.text.Tokenizer;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Rule-based tokenizer that splits text into word and punctuation tokens
 * using a fixed sequence of regular-expression rewrites.
 *
 * <p>The rules only recognise ASCII letters and digits ({@code [a-zA-Z0-9]})
 * as word characters. Processing order:
 * <ol>
 *   <li>tabs become spaces;</li>
 *   <li>separator characters ({@code ? ! ( ) " ; / | `}) are surrounded by spaces;</li>
 *   <li>commas are detached from adjacent non-whitespace text;</li>
 *   <li>apostrophes at the start of the text or after a non-word character are detached;</li>
 *   <li>clitics such as {@code 's}, {@code n't}, {@code 're} (and bare {@code '},
 *       {@code :}, {@code -}) are detached when they sit at the end of the text or
 *       in front of a non-word character;</li>
 *   <li>the text is split on whitespace, and a trailing period is split off a word
 *       unless the word is a listed abbreviation or looks like initials
 *       (e.g. {@code U.S.A.}) or a capitalised consonant-only abbreviation.</li>
 * </ol>
 *
 * <p>Instances hold no state; all patterns are compiled once per class.
 */
public class BasicTokenizer implements Tokenizer, Serializable {
    private static final long serialVersionUID = -999803747111655623L;

    // Regex fragments shared by the compiled patterns below.
    private static final String REGEX_LETTER_NUMBER = "[a-zA-Z0-9]";
    private static final String REGEX_NOT_LETTER_NUMBER = "[^a-zA-Z0-9]";
    private static final String REGEX_SEPARATOR = "[\\?!()\";/\\|`]";
    private static final String REGEX_CLITICS = "'|:|-|'S|'D|'M|'LL|'RE|'VE|N'T|'s|'d|'m|'ll|'re|'ve|n't";

    /** Words whose trailing period belongs to the word and must not be split off. */
    private static final List<String> ABBREVIATIONS = Arrays.asList("Co.", "Corp.", "vs.", "e.g.", "etc.", "ex.", "cf.", "eg.", "Jan.", "Feb.", "Mar.", "Apr.", "Jun.", "Jul.", "Aug.", "Sept.", "Oct.", "Nov.", "Dec.", "jan.", "feb.", "mar.", "apr.", "jun.", "jul.", "aug.", "sept.", "oct.", "nov.", "dec.", "ed.", "eds.", "repr.", "trans.", "vol.", "vols.", "rev.", "est.", "b.", "m.", "bur.", "d.", "r.", "M.", "Dept.", "MM.", "U.", "Mr.", "Jr.", "Ms.", "Mme.", "Mrs.", "Dr.", "Ph.D.");

    // Patterns are compiled once here instead of on every tokenize() call.
    private static final Pattern SEPARATOR = Pattern.compile("(" + REGEX_SEPARATOR + ")");
    private static final Pattern COMMA_AFTER_TEXT = Pattern.compile("([^\\s]),");
    private static final Pattern COMMA_BEFORE_TEXT = Pattern.compile(",([^\\s])");
    private static final Pattern LEADING_APOSTROPHE = Pattern.compile("^(')");
    private static final Pattern APOSTROPHE_AFTER_NON_WORD = Pattern.compile("(" + REGEX_NOT_LETTER_NUMBER + ")'");
    private static final Pattern CLITIC_AT_END = Pattern.compile("(" + REGEX_CLITICS + ")$");
    private static final Pattern CLITIC_BEFORE_NON_WORD = Pattern.compile("(" + REGEX_CLITICS + ")(" + REGEX_NOT_LETTER_NUMBER + ")");
    private static final Pattern WHITESPACE = Pattern.compile("\\s+");
    /** Any word that ends with a letter or digit followed by a period. */
    private static final Pattern ENDS_WITH_PERIOD = Pattern.compile(".*" + REGEX_LETTER_NUMBER + "\\.");
    /** Dotted initials (two or more letter-period pairs) or a capital followed by consonants and a period. */
    private static final Pattern INITIALS_OR_CONSONANT_ABBREVIATION = Pattern.compile("^([A-Za-z]\\.([A-Za-z]\\.)+|[A-Z][bcdfghj-nptvxz]+\\.)$");

    /** Shared instance backing the static convenience methods; safe to share because the class has no instance state. */
    private static final BasicTokenizer INSTANCE = new BasicTokenizer();

    /**
     * Splits {@code str} into tokens according to the rules described on this class.
     *
     * @param str the text to tokenize; must not be {@code null}
     * @return a new mutable list of tokens in order of appearance; empty when
     *         {@code str} is empty or contains only whitespace
     * @throws NullPointerException if {@code str} is {@code null}
     */
    @Override
    public List<String> tokenize(String str) {
        List<String> tokenList = new ArrayList<String>();

        String text = str.replace('\t', ' ');
        text = SEPARATOR.matcher(text).replaceAll(" $1 ");
        text = COMMA_AFTER_TEXT.matcher(text).replaceAll("$1 ,");
        text = COMMA_BEFORE_TEXT.matcher(text).replaceAll(" , $1");
        text = LEADING_APOSTROPHE.matcher(text).replaceAll("$1 ");
        text = APOSTROPHE_AFTER_NON_WORD.matcher(text).replaceAll("$1 '");
        text = CLITIC_AT_END.matcher(text).replaceAll(" $1");
        text = CLITIC_BEFORE_NON_WORD.matcher(text).replaceAll(" $1 $2");

        String trimmed = text.trim();
        if (trimmed.isEmpty()) {
            // Splitting an empty string would yield a single "" element,
            // which is not a real token.
            return tokenList;
        }

        for (String word : WHITESPACE.split(trimmed)) {
            if (hasDetachablePeriod(word)) {
                int lastIndex = word.length() - 1;
                tokenList.add(word.substring(0, lastIndex));
                tokenList.add(word.substring(lastIndex));
            } else {
                tokenList.add(word);
            }
        }
        return tokenList;
    }

    /**
     * Tells whether the final period of {@code word} should become its own token:
     * the word ends in letter/digit + period and is neither a listed abbreviation
     * nor shaped like initials or a capitalised consonant abbreviation.
     */
    private static boolean hasDetachablePeriod(String word) {
        return ENDS_WITH_PERIOD.matcher(word).matches()
                && !ABBREVIATIONS.contains(word)
                && !INITIALS_OR_CONSONANT_ABBREVIATION.matcher(word).matches();
    }

    /**
     * Tokenizes a single text with the shared tokenizer instance.
     *
     * @param text the text to tokenize; must not be {@code null}
     * @return the tokens of {@code text}, see {@link #tokenize(String)}
     */
    public static List<String> doTokenize(String text) {
        return INSTANCE.tokenize(text);
    }

    /**
     * Tokenizes every text in the given list and concatenates the tokens.
     *
     * @param text the texts to tokenize, processed in list order; neither the
     *             list nor its elements may be {@code null}
     * @return a new mutable list holding the tokens of all texts, in order
     */
    public static List<String> doTokenize(List<String> text) {
        List<String> result = new ArrayList<String>();
        for (String item : text) {
            result.addAll(INSTANCE.tokenize(item));
        }
        return result;
    }
}

