/*
 * Decompiled with CFR 0.152.
 */
package kd.ai.gai.core.rag.split;

import java.util.LinkedList;
import java.util.List;
import java.util.regex.Pattern;
import kd.ai.gai.core.enuz.repo.RepoChunkDataPreRule;
import kd.ai.gai.core.rag.RepoChunkOperateConfig;
import kd.ai.gai.core.rag.split.ISplit;
import kd.bos.util.StringUtils;

/**
 * Shared, stateless text helpers for {@link ISplit} implementations: input
 * normalisation, URL/e-mail stripping, junk-fragment detection and lookup of
 * punctuation positions at which a text can be cut.
 *
 * <p>Every helper is static. Regular expressions used by the helpers are
 * compiled once as class constants instead of on every call.
 */
public abstract class AbstractSplitter
implements ISplit {
    /** Paragraph separator used by {@link #paragraphSimpleSplit(String)}: three consecutive line feeds. */
    public static final String PARAGREPH_SEPARATOR = "\n\n\n";
    /** Sentence terminators: ideographic full stop (U+3002) plus ASCII and full-width '?' and '!'. */
    public static final String SPLIT_SENTENCE_CHARS = "\u3002?\uff1f!\uff01";
    /** {@link #SPLIT_SENTENCE_CHARS} extended with ASCII and full-width comma and semicolon. */
    public static final String SPLIT_LOWER_SENTENCE_CHARS = "\u3002?\uff1f!\uff01,\uff0c;\uff1b";
    /** Matches a non-empty string made up solely of ASCII punctuation, whitespace and common full-width punctuation. */
    public static final Pattern punctuationPattern = Pattern.compile("^[\\p{Punct}\\s\uff0c\u3002\uff01\uff1f\uff1b\uff1a]+$");
    /** Regex source for http(s)/ftp/file URLs and bare {@code www.} hosts. */
    public static final String urlRegex = "(https?://|ftp://|file://|www\\.)(www\\.)?[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]";
    /** Regex source for e-mail addresses with one to three dot-separated alphabetic suffixes. */
    public static final String emailRegex = "[a-zA-Z0-9._%+-]+@[a-zA-Z0-9-]+(\\.[a-zA-Z]{2,6}){1,3}";
    /** Compiled form of {@link #urlRegex}. */
    public static final Pattern urlPattern = Pattern.compile(urlRegex);
    /** Compiled form of {@link #emailRegex}. */
    public static final Pattern emailPattern = Pattern.compile(emailRegex);

    // Pre-compiled patterns for the per-call helpers below.
    private static final Pattern PARAGRAPH_SEPARATOR_PATTERN = Pattern.compile(PARAGREPH_SEPARATOR);
    private static final Pattern SPACE_RUN = Pattern.compile(" +");
    private static final Pattern BLANK_LINE_RUN = Pattern.compile("(\\r?\\n(\\s*\\r?\\n)+)");
    private static final Pattern CR_OR_CRLF = Pattern.compile("\\r\\n?");
    private static final Pattern TAB_RUN = Pattern.compile("\\t+");
    private static final Pattern LF_RUN = Pattern.compile("\\n+");

    /**
     * Tells whether a text ends with one of the {@link #SPLIT_SENTENCE_CHARS}.
     *
     * @param str text to inspect
     * @return {@code true} if {@code str} passes {@code StringUtils.isNotEmpty} and its
     *         last character is a sentence terminator; {@code false} otherwise
     */
    public static boolean isComplete(String str) {
        if (!StringUtils.isNotEmpty(str)) {
            return false;
        }
        int last = str.length() - 1;
        return last >= 0 && SPLIT_SENTENCE_CHARS.indexOf(str.charAt(last)) >= 0;
    }

    /**
     * Splits a text on every occurrence of {@link #PARAGREPH_SEPARATOR}.
     *
     * @param input text to split; must not be {@code null}
     * @return the pieces between separators, with trailing empty pieces dropped
     *         (same contract as {@link String#split(String)})
     */
    public static String[] paragraphSimpleSplit(String input) {
        return PARAGRAPH_SEPARATOR_PATTERN.split(input);
    }

    /**
     * Finds the position just after the first character of {@code content} that
     * belongs to {@link #SPLIT_LOWER_SENTENCE_CHARS}.
     *
     * @param content text to scan; must not be {@code null}
     * @return index following the first such character, or {@code 0} when none is present
     */
    public static int findLowerSplitfirstIndex(String content) {
        for (int i = 0; i < content.length(); ++i) {
            if (SPLIT_LOWER_SENTENCE_CHARS.indexOf(content.charAt(i)) >= 0) {
                return i + 1;
            }
        }
        return 0;
    }

    /**
     * Finds the last split-worthy punctuation mark, preferring stronger marks:
     * sentence terminators first, then semicolons, then commas (ASCII and
     * full-width forms in each tier). A weaker tier is consulted only when no
     * character of a stronger tier occurs anywhere in {@code content}.
     *
     * @param content text to scan; must not be {@code null}
     * @return index of the chosen character, or {@code -1} when none is present
     */
    public static int findLastSymbolIndex(String content) {
        int sentenceEnd = AbstractSplitter.getMax(
                content.lastIndexOf('\u3002'),
                content.lastIndexOf('?'),
                content.lastIndexOf('\uff1f'),
                content.lastIndexOf('!'),
                content.lastIndexOf('\uff01'));
        if (sentenceEnd >= 0) {
            return sentenceEnd;
        }
        int semicolon = AbstractSplitter.getMax(content.lastIndexOf(';'), content.lastIndexOf('\uff1b'));
        if (semicolon >= 0) {
            return semicolon;
        }
        return AbstractSplitter.getMax(content.lastIndexOf(','), content.lastIndexOf('\uff0c'));
    }

    /**
     * Returns the largest of the supplied values.
     *
     * @param ints values to compare; at least one is required
     * @return the maximum value
     * @throws ArrayIndexOutOfBoundsException if called with no values
     */
    public static int getMax(int ... ints) {
        int max = ints[0];
        for (int i = 1; i < ints.length; ++i) {
            max = Math.max(max, ints[i]);
        }
        return max;
    }

    /**
     * Finds a cut position within the first {@code maxSplitLen} characters of
     * {@code content}, using {@link #findLastSymbolIndex(String)} on that prefix.
     *
     * @param content     text to scan; must not be {@code null}
     * @param maxSplitLen number of leading characters to consider
     * @return index just after the chosen punctuation mark, or {@code -1} when
     *         the considered prefix contains none
     */
    public static int findLowerLastSplitMaxlenIndex(String content, int maxSplitLen) {
        String window = content.length() > maxSplitLen ? content.substring(0, maxSplitLen) : content;
        int symbolIndex = AbstractSplitter.findLastSymbolIndex(window);
        // Keep the -1 "not found" sentinel; otherwise cut after the symbol.
        return symbolIndex >= 0 ? symbolIndex + 1 : symbolIndex;
    }

    /**
     * Normalises whitespace: trims the text, collapses runs of spaces and of
     * tabs to a single character, converts CR and CRLF to LF and collapses
     * blank lines / runs of LF to a single LF.
     *
     * @param input text to normalise; must not be {@code null}
     * @return the normalised text
     */
    public static String textDataPreprocess(String input) {
        String text = input.trim();
        text = SPACE_RUN.matcher(text).replaceAll(" ");
        text = BLANK_LINE_RUN.matcher(text).replaceAll("\n");
        // After this step no '\r' is left, so no further CR handling is needed.
        text = CR_OR_CRLF.matcher(text).replaceAll("\n");
        text = TAB_RUN.matcher(text).replaceAll("\t");
        return LF_RUN.matcher(text).replaceAll("\n");
    }

    /**
     * Trims the text and replaces every carriage return with a line feed.
     *
     * <p>NOTE(review): each CR is replaced individually, so a CRLF pair becomes
     * two line feeds — confirm this is intended before relying on it.
     *
     * @param input text to convert; must not be {@code null}
     * @return the converted text
     */
    public static String replaceLineBreak(String input) {
        return input.trim().replace('\r', '\n');
    }

    /**
     * Removes e-mail addresses and then URLs from the text. The text is trimmed
     * before each of the two removal passes; the final result is not trimmed again.
     *
     * @param input text to clean; must not be {@code null}
     * @return the text without matches of {@link #emailPattern} and {@link #urlPattern}
     */
    public static String urlPreprocess(String input) {
        // E-mails go first so their domain part is not partially consumed as a URL.
        String withoutEmails = emailPattern.matcher(input.trim()).replaceAll("");
        return urlPattern.matcher(withoutEmails.trim()).replaceAll("");
    }

    /**
     * Removes the element at {@code index} when it carries no useful content
     * according to {@link #uselessDataFilter(String)}.
     *
     * @param list  list to inspect and possibly modify
     * @param index position of the element to check
     * @return {@code true} if the element was removed, {@code false} if it was kept
     */
    public static boolean uselessDataFilter(LinkedList<String> list, int index) {
        if (!AbstractSplitter.uselessDataFilter(list.get(index))) {
            return false;
        }
        list.remove(index);
        return true;
    }

    /**
     * Tells whether a fragment carries no useful content: it is empty, consists
     * only of characters removed by {@link String#trim()} (which covers pure
     * line-break and pure tab strings), or consists only of punctuation and
     * whitespace as defined by {@link #punctuationPattern}.
     *
     * @param str fragment to check; must not be {@code null}
     * @return {@code true} if the fragment is useless
     */
    public static boolean uselessDataFilter(String str) {
        return str.trim().isEmpty() || punctuationPattern.matcher(str).matches();
    }

    /**
     * Applies the pre-processing steps selected in the configuration:
     * {@link #textDataPreprocess(String)} when the rule list contains
     * {@code REPLACE_CONTINUOUS}, otherwise {@link #replaceLineBreak(String)};
     * followed by {@link #urlPreprocess(String)} when it contains {@code DELETEURL}.
     *
     * @param content                text to process
     * @param repoChunkOperateConfig configuration supplying the rule list
     * @return the processed text
     */
    public static String dataPreprocessWithRule(String content, RepoChunkOperateConfig repoChunkOperateConfig) {
        List<String> rules = repoChunkOperateConfig.getPreDataRule();
        String processed;
        if (rules.contains(RepoChunkDataPreRule.REPLACE_CONTINUOUS.getVal())) {
            processed = AbstractSplitter.textDataPreprocess(content);
        } else {
            processed = AbstractSplitter.replaceLineBreak(content);
        }
        if (rules.contains(RepoChunkDataPreRule.DELETEURL.getVal())) {
            processed = AbstractSplitter.urlPreprocess(processed);
        }
        return processed;
    }

    /**
     * Strips leading and trailing whitespace from a chunk.
     *
     * @param input chunk text; must not be {@code null}
     * @return the trimmed chunk
     */
    public static String chunkTextPreprocess(String input) {
        // trim() already removes any leading line breaks, tabs and spaces, so
        // no additional leading-whitespace replacement is required.
        return input.trim();
    }
}

