/*
 * Decompiled with CFR 0.152.
 */
package org.languagetool.tokenizers;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.StringTokenizer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.lang3.StringUtils;
import org.languagetool.tokenizers.Tokenizer;
import org.languagetool.tools.StringTools;

/**
 * Splits text into tokens at a fixed set of delimiter characters (the
 * delimiters themselves are returned as tokens too) and afterwards glues the
 * pieces of e-mail addresses and URLs back together into single tokens.
 */
public class WordTokenizer implements Tokenizer {
    private static final List<String> PROTOCOLS = Collections.unmodifiableList(Arrays.asList("http", "https", "ftp"));
    // NOTE(review): inside this character class "$-_" is a RANGE (U+0024..U+005F), not three
    // literals. It therefore also admits characters such as ':', '=', '&' and '@'. Left as is,
    // because narrowing it would change which tokens are treated as part of a URL.
    private static final Pattern URL_CHARS = Pattern.compile("[a-zA-Z0-9/%$-_.+!*'(),?#~]+");
    private static final Pattern DOMAIN_CHARS = Pattern.compile("[a-zA-Z0-9][a-zA-Z0-9-]+");
    private static final Pattern NO_PROTOCOL_URL = Pattern.compile("([a-zA-Z0-9][a-zA-Z0-9-]+\\.)?([a-zA-Z0-9][a-zA-Z0-9-]+)\\.([a-zA-Z0-9][a-zA-Z0-9-]+)/.*");
    private static final Pattern E_MAIL = Pattern.compile("(?<!:)@?\\b[a-zA-Z0-9.!#$%&'*+/=?^_`{|}~-]+@((\\[[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\])|(([a-zA-Z\\-0-9]+\\.)+[a-zA-Z]{2,}))\\b");
    private static final String TOKENIZING_CHARACTERS = " \u00a0\u115f\u1160\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u200b\u200c\u200d\u200e\u200f\u2028\u2029\u202a\u202b\u202c\u202d\u202e\u202f\u205f\u2060\u2061\u2062\u2063\u206a\u206b\u206c\u206d\u206e\u206f\u3000\u3164\ufeff\uffa0\ufff9\ufffa\ufffb\u00a6\u2016\u2223|,.;()[]{}=*#\u2217+\u00d7\u00b7\u00f7<>!?:~/\\\"'\u00ab\u00bb\u201e\u201d\u201c\u2018\u2019`\u00b4\u201b\u2032\u203a\u2039\u2026\u00bf\u00a1\u203c\u2047\u2048\u2049\u2122\u00ae\u203d\u2012\u2013\u2014\u2015\u2500\u3161\u2713\u25cf\u25cb\u25c6\u27a2\u25a0\u25a1\u2605\u274f\u2794\u21b5\u2756\u25aa\u2751\u2022\u2b9a\u2265\u2192\u21fe\u21c9\u21d2\u21e8\u21db\u00b9\u00b2\u00b3\u2070\u2071\u2074\u2075\u2076\u2077\u2078\u2079\t\n\r";

    /**
     * Returns the protocol names recognized as the start of a URL.
     *
     * @return an unmodifiable list containing {@code http}, {@code https} and {@code ftp}
     */
    public static List<String> getProtocols() {
        return PROTOCOLS;
    }

    /**
     * Tells whether the given token looks like a URL.
     *
     * @param token the token to inspect
     * @return {@code true} if the token starts with {@code www.}, starts with one of
     *         the known protocols followed by {@code ://}, or has the shape
     *         {@code [sub.]domain.tld/...}
     */
    public static boolean isUrl(String token) {
        // The "www." test does not depend on the protocol, so it is checked once up front.
        if (token.startsWith("www.")) {
            return true;
        }
        for (String protocol : WordTokenizer.getProtocols()) {
            if (token.startsWith(protocol + "://")) {
                return true;
            }
        }
        return NO_PROTOCOL_URL.matcher(token).matches();
    }

    /**
     * Tells whether the given token is, as a whole, an e-mail address.
     *
     * @param token the token to inspect
     * @return {@code true} if the complete token matches the e-mail pattern
     */
    public static boolean isEMail(String token) {
        return E_MAIL.matcher(token).matches();
    }

    /**
     * Tokenizes the text at the characters from {@link #getTokenizingCharacters()},
     * keeping the delimiters as tokens, then re-joins e-mail addresses and URLs.
     *
     * @param text the text to tokenize
     * @return the list of tokens
     */
    @Override
    public List<String> tokenize(String text) {
        List<String> tokens = new ArrayList<String>();
        // third argument 'true': delimiters are returned as tokens as well
        StringTokenizer st = new StringTokenizer(text, this.getTokenizingCharacters(), true);
        while (st.hasMoreTokens()) {
            tokens.add(st.nextToken());
        }
        return this.joinEMailsAndUrls(tokens);
    }

    /**
     * Returns the characters at which text is split into tokens.
     *
     * @return the delimiter characters as one string
     */
    public String getTokenizingCharacters() {
        return TOKENIZING_CHARACTERS;
    }

    /**
     * Re-joins first e-mail addresses, then URLs, that were split by tokenization.
     *
     * @param list the raw tokens
     * @return the tokens with e-mail addresses and URLs merged into single tokens
     */
    protected List<String> joinEMailsAndUrls(List<String> list) {
        return this.joinUrls(this.joinEMails(list));
    }

    /**
     * Merges the tokens that together form an e-mail address into one token.
     *
     * <p>The tokens are concatenated and searched for e-mail addresses. Every
     * address found becomes one token. A match does not have to begin or end on a
     * token boundary (for example {@code $}, {@code %}, {@code &} and {@code -} are
     * not tokenizing characters); in that case the token straddling the boundary
     * is split so that the part outside the address is kept as its own token. The
     * concatenation of the returned tokens is therefore always equal to the
     * concatenation of the input tokens.
     *
     * @param list the tokens to process
     * @return {@code list} itself if it contains no e-mail address, otherwise a new
     *         list with each address as a single token
     */
    protected List<String> joinEMails(List<String> list) {
        StringBuilder sb = new StringBuilder();
        for (String item : list) {
            sb.append(item);
        }
        String text = sb.toString();
        if (!text.contains("@")) {
            return list;
        }
        Matcher matcher = E_MAIL.matcher(text);
        if (!matcher.find()) {
            return list;
        }
        List<String> result = new ArrayList<String>();
        int idx = 0;        // index of the first token that is not yet completely consumed
        int tokenStart = 0; // offset in 'text' at which list.get(idx) begins
        int pos = 0;        // offset in 'text' up to which output has been produced
        boolean hasMatch = true;
        while (true) {
            // Copy everything up to the next address, or up to the end of the text
            // once there are no more addresses.
            int limit = hasMatch ? matcher.start() : text.length();
            while (pos < limit) {
                int tokenEnd = tokenStart + list.get(idx).length();
                // Cut at 'limit' if the address starts inside this token; 'pos' may also
                // lie inside the token if the previous address ended inside it.
                int pieceEnd = Math.min(tokenEnd, limit);
                result.add(text.substring(pos, pieceEnd));
                pos = pieceEnd;
                if (pos == tokenEnd) {
                    ++idx;
                    tokenStart = tokenEnd;
                }
            }
            if (!hasMatch) {
                break;
            }
            result.add(matcher.group());
            pos = matcher.end();
            // Skip the tokens that are completely covered by the address.
            while (idx < list.size() && tokenStart + list.get(idx).length() <= pos) {
                tokenStart += list.get(idx).length();
                ++idx;
            }
            hasMatch = matcher.find();
        }
        return result;
    }

    /**
     * Merges the tokens that together form a URL into one token.
     *
     * <p>A URL begins where {@code urlStartsAt} reports a start and extends up to,
     * but not including, the first following token for which {@code urlEndsAt}
     * is true. The token directly preceding the URL is remembered as a possible
     * quote character so that the same token can close the URL again.
     *
     * @param l the tokens to process
     * @return a new list with each URL as a single token
     */
    protected List<String> joinUrls(List<String> l) {
        List<String> newList = new ArrayList<String>();
        boolean inUrl = false;
        StringBuilder url = new StringBuilder();
        String urlQuote = null;
        for (int i = 0; i < l.size(); ++i) {
            String token = l.get(i);
            if (!inUrl && this.urlStartsAt(i, l)) {
                inUrl = true;
                if (i > 0) {
                    urlQuote = l.get(i - 1);
                }
                url.append(token);
            } else if (inUrl && this.urlEndsAt(i, l, urlQuote)) {
                inUrl = false;
                urlQuote = null;
                newList.add(url.toString());
                url.setLength(0);
                // the terminating token is not part of the URL
                newList.add(token);
            } else if (inUrl) {
                url.append(token);
            } else {
                newList.add(token);
            }
        }
        // a URL that runs to the end of the input
        if (url.length() > 0) {
            newList.add(url.toString());
        }
        return newList;
    }

    /**
     * Checks whether a URL begins at token {@code i}: either a known protocol
     * followed by {@code : / /}, the token {@code www} followed by a dot, or two
     * or three dot-separated domain-like tokens followed by a slash.
     */
    private boolean urlStartsAt(int i, List<String> l) {
        String token = l.get(i);
        int size = l.size();
        // protocol://
        if (this.isProtocol(token) && size > i + 3
                && l.get(i + 1).equals(":") && l.get(i + 2).equals("/") && l.get(i + 3).equals("/")) {
            return true;
        }
        // www.
        if (size > i + 1 && token.equals("www") && l.get(i + 1).equals(".")) {
            return true;
        }
        // domain.tld/
        if (size > i + 3 && l.get(i + 1).equals(".") && l.get(i + 3).equals("/")
                && DOMAIN_CHARS.matcher(token).matches()
                && DOMAIN_CHARS.matcher(l.get(i + 2)).matches()) {
            return true;
        }
        // sub.domain.tld/
        return size > i + 5 && l.get(i + 1).equals(".") && l.get(i + 3).equals(".") && l.get(i + 5).equals("/")
                && DOMAIN_CHARS.matcher(token).matches()
                && DOMAIN_CHARS.matcher(l.get(i + 2)).matches()
                && DOMAIN_CHARS.matcher(l.get(i + 4)).matches();
    }

    /** Returns whether the token is one of the known protocol names. */
    private boolean isProtocol(String token) {
        return PROTOCOLS.contains(token);
    }

    /**
     * Checks whether the URL currently being collected ends before token {@code i}.
     *
     * @param urlQuote the token that directly preceded the URL, or {@code null}
     */
    private boolean urlEndsAt(int i, List<String> l, String urlQuote) {
        String token = l.get(i);
        if (StringTools.isWhitespace(token) || token.equals(")") || token.equals("]")) {
            return true;
        }
        if (l.size() > i + 1) {
            String nextToken = l.get(i + 1);
            // Punctuation (or the opening quote) directly followed by whitespace, a quote
            // or a dot is treated as sentence punctuation, not as part of the URL.
            boolean nextClosesUrl = StringTools.isWhitespace(nextToken)
                    || StringUtils.equalsAny(nextToken, "\"", "\u00bb", "\u00ab", "\u2018", "\u2019", "\u201c", "\u201d", "'", ".");
            if (nextClosesUrl && (StringUtils.equalsAny(token, ".", ",", ";", ":", "!", "?") || token.equals(urlQuote))) {
                return true;
            }
            return !URL_CHARS.matcher(token).matches();
        }
        // last token of the input
        return !URL_CHARS.matcher(token).matches() || token.equals(".") || token.equals(urlQuote);
    }
}

