/*
 * Decompiled with CFR 0.152.
 */
package kd.bos.gptas.kmbase.parser;

import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import kd.bos.gptas.api.km.split.SplitConfig;

/**
 * Normalises raw text before it is handed on for further processing.
 *
 * <p>Some steps always run (marker neutralisation, control-character removal,
 * U+FFFE / leading U+FEFF removal); the whitespace and e-mail/URL steps are
 * switched on by the {@link SplitConfig} supplied at construction time.
 */
public class Cleaner {
    /**
     * ASCII control characters that are removed unconditionally. Tab, line feed
     * and carriage return are deliberately left out of the class.
     *
     * <p>NOTE(review): an earlier version of this class also listed the escapes
     * xEF, xBF and xBE here, apparently intended as the UTF-8 bytes of U+FFFE.
     * In a Java regex those escapes match the characters U+00EF, U+00BF and
     * U+00BE, so ordinary letters/punctuation were being deleted. U+FFFE itself
     * is removed explicitly in {@link #clean(String)}.
     */
    private static final Pattern CONTROL_CHARS =
            Pattern.compile("[\\x00-\\x08\\x0B\\x0C\\x0E-\\x1F\\x7F]");

    /** Three or more consecutive CR/LF characters. */
    private static final Pattern EXCESS_LINE_BREAKS = Pattern.compile("[\\r\\n]{3,}");

    /** Two or more consecutive horizontal blanks (ASCII and Unicode spaces, tab, form feed, CR). */
    private static final Pattern EXCESS_BLANKS =
            Pattern.compile("[\\t\\f\\r\\x20\\u00a0\\u1680\\u180e\\u2000-\\u200a\\u202f\\u205f\\u3000]{2,}");

    /** E-mail addresses. */
    private static final Pattern EMAIL =
            Pattern.compile("([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\\.[a-zA-Z0-9-.]+)");

    /**
     * Markdown {@code [label](http...)} constructs; group 1 is the text between
     * the parentheses. The pattern does not require a leading {@code !}, so
     * plain links are matched as well as images.
     */
    private static final Pattern MARKDOWN_LINK = Pattern.compile(
            "\\[.*?\\]\\((https?://[-a-zA-Z0-9+&@#/%?=~_|!:,.;]*[-a-zA-Z0-9+&@#/%=~_|]+[\\s\\w]+)\\)");

    /** Bare http/https URLs. */
    private static final Pattern BARE_URL =
            Pattern.compile("https?://[-a-zA-Z0-9+&@#/%?=~_|!:,.;]*[-a-zA-Z0-9+&@#/%=~_|]+");

    /** Prefix/suffix of the temporary token that shields a Markdown URL from {@link #BARE_URL}. */
    private static final String PLACEHOLDER_PREFIX = "__MARKDOWN_IMAGE_URL_";
    private static final String PLACEHOLDER_SUFFIX = "__";

    private final SplitConfig splitConfig;

    /**
     * @param splitConfig configuration whose {@code isCleanSpecSym()} and
     *                    {@code isCleanEmailUrl()} flags select the optional steps
     */
    public Cleaner(SplitConfig splitConfig) {
        this.splitConfig = splitConfig;
    }

    /**
     * Cleans the given text.
     *
     * <ol>
     *   <li>Rewrites {@code <|} to {@code <} and {@code |>} to {@code >}
     *       (presumably to neutralise model special-token markers; confirm with callers).</li>
     *   <li>Removes ASCII control characters other than tab, LF and CR.</li>
     *   <li>Removes every U+FFFE and a single leading U+FEFF (byte-order mark).</li>
     *   <li>If {@code splitConfig.isCleanSpecSym()}: collapses runs of line breaks and blanks.</li>
     *   <li>If {@code splitConfig.isCleanEmailUrl()}: removes e-mail addresses and bare URLs,
     *       keeping URLs that sit inside a Markdown {@code [label](url)} construct.</li>
     * </ol>
     *
     * @param text the text to clean; may be {@code null}
     * @return the cleaned text, or {@code null} if {@code text} was {@code null}
     */
    public String clean(String text) {
        if (text == null) {
            return null;
        }
        // Literal replacements: no regex needed for fixed two-character markers.
        String cleaned = text.replace("<|", "<").replace("|>", ">");
        cleaned = CONTROL_CHARS.matcher(cleaned).replaceAll("");
        cleaned = cleaned.replace("\ufffe", "");
        if (cleaned.startsWith("\ufeff")) {
            cleaned = cleaned.substring(1);
        }
        if (this.splitConfig.isCleanSpecSym()) {
            cleaned = collapseWhitespace(cleaned);
        }
        if (this.splitConfig.isCleanEmailUrl()) {
            cleaned = stripEmailsAndUrls(cleaned);
        }
        return cleaned;
    }

    /** Reduces 3+ consecutive CR/LF characters to two LFs, then 2+ consecutive blanks to one space. */
    private static String collapseWhitespace(String text) {
        String collapsed = EXCESS_LINE_BREAKS.matcher(text).replaceAll("\n\n");
        return EXCESS_BLANKS.matcher(collapsed).replaceAll(" ");
    }

    /** Removes e-mail addresses and bare URLs while preserving URLs inside Markdown links. */
    private static String stripEmailsAndUrls(String text) {
        String withoutEmails = EMAIL.matcher(text).replaceAll("");

        // Step 1: swap each Markdown URL for a numbered placeholder so that the
        // bare-URL pass below cannot touch it.
        Matcher linkMatcher = MARKDOWN_LINK.matcher(withoutEmails);
        ArrayList<String> protectedUrls = new ArrayList<String>();
        StringBuffer shielded = new StringBuffer();
        while (linkMatcher.find()) {
            String url = linkMatcher.group(1);
            String placeholder = PLACEHOLDER_PREFIX + protectedUrls.size() + PLACEHOLDER_SUFFIX;
            protectedUrls.add(url);
            String rewritten = linkMatcher.group(0).replace(url, placeholder);
            // The label may contain '$' or '\', which appendReplacement would
            // otherwise interpret as group references / escapes.
            linkMatcher.appendReplacement(shielded, Matcher.quoteReplacement(rewritten));
        }
        linkMatcher.appendTail(shielded);

        // Step 2: drop every remaining (unprotected) URL.
        String result = BARE_URL.matcher(shielded.toString()).replaceAll("");

        // Step 3: put the protected URLs back.
        for (int i = 0; i < protectedUrls.size(); ++i) {
            result = result.replace(PLACEHOLDER_PREFIX + i + PLACEHOLDER_SUFFIX, protectedUrls.get(i));
        }
        return result;
    }
}

