/*
 * Decompiled with CFR 0.152.
 */
package kd.bos.gptas.common.splitter;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import kd.bos.gptas.common.splitter.MarkdownHeaderTextSplitter;

/**
 * Markdown splitter that first delegates to the header-based splitting of
 * {@link MarkdownHeaderTextSplitter} and then breaks chunks that are still too
 * long into smaller sub-chunks which overlap with their predecessor.
 *
 * <p>All sizes are measured in {@code char}s (UTF-16 code units), i.e. in the
 * unit of {@link String#length()}.
 *
 * <p>Sub-chunks are cut, in order of preference, after the last line break,
 * sentence terminator, soft punctuation mark or space inside the current
 * window; when none exists the text is cut at the window end, taking care not
 * to separate a surrogate pair.
 */
public class LargeMarkdownSplitter extends MarkdownHeaderTextSplitter {
    /** Window size used by the no-argument constructor. */
    private static final int DEFAULT_MAX_SIZE = 400;
    /** Overlap size used by the no-argument constructor. */
    private static final int DEFAULT_OVER_SIZE = 100;
    /** Metadata key holding the 1-based index of a sub-chunk within its header chunk. */
    private static final String CHUNK_SEQUENCE_KEY = "chunk_sequence";

    /** Window size of the first sub-chunk of an oversized header chunk. */
    private final int maxSize;
    /**
     * Number of trailing chars of the previous sub-chunk that may be repeated at
     * the start of the next one; later windows are {@code maxSize - overSize} wide.
     */
    private final int overSize;

    /**
     * Creates a splitter with a window of 400 chars and an overlap of 100 chars,
     * using the parent's no-argument configuration.
     */
    public LargeMarkdownSplitter() {
        this.maxSize = DEFAULT_MAX_SIZE;
        this.overSize = DEFAULT_OVER_SIZE;
    }

    /**
     * Creates a splitter with explicit header configuration and sizes.
     *
     * @param headersToSplitOn header configuration forwarded to the parent splitter
     * @param maxSize          window size (in chars) of the first sub-chunk
     * @param overSize         overlap size (in chars); windows after the first are
     *                         {@code maxSize - overSize} wide (at least one char)
     */
    public LargeMarkdownSplitter(List<MarkdownHeaderTextSplitter.HeaderConfig> headersToSplitOn, int maxSize, int overSize) {
        super(headersToSplitOn);
        this.maxSize = maxSize;
        this.overSize = overSize;
    }

    /**
     * Splits {@code text} by headers and refines every resulting chunk whose
     * content is longer than {@code maxSize - overSize} chars.
     *
     * @param text the markdown text to split
     * @return header chunks, with oversized ones replaced by their sub-chunks, in
     *         document order
     */
    @Override
    public List<MarkdownHeaderTextSplitter.Document> splitText(String text) {
        List<MarkdownHeaderTextSplitter.Document> headerBasedChunks = super.splitText(text);
        List<MarkdownHeaderTextSplitter.Document> refinedChunks = new ArrayList<MarkdownHeaderTextSplitter.Document>();
        int threshold = this.maxSize - this.overSize;
        for (MarkdownHeaderTextSplitter.Document chunk : headerBasedChunks) {
            String pageContent = chunk.getPageContent();
            // A chunk without content has nothing to refine; pass it through untouched.
            if (pageContent != null && pageContent.length() > threshold) {
                refinedChunks.addAll(this.splitLargeChunk(chunk));
            } else {
                refinedChunks.add(chunk);
            }
        }
        return refinedChunks;
    }

    /**
     * Cuts one header chunk into consecutive sub-chunks.
     *
     * <p>Each sub-chunk gets a copy of the source chunk's metadata plus a
     * {@code chunk_sequence} entry (1-based, as a string). Every sub-chunk after
     * the first is prefixed with the overlap taken from the end of the previous
     * sub-chunk, separated by a single space.
     *
     * @param chunk the header chunk to split; its page content must not be {@code null}
     * @return the sub-chunks in order; empty when the content is empty
     */
    List<MarkdownHeaderTextSplitter.Document> splitLargeChunk(MarkdownHeaderTextSplitter.Document chunk) {
        String content = chunk.getPageContent();
        // Do not presize with content.length(): the number of sub-chunks is far
        // smaller than the number of chars.
        List<MarkdownHeaderTextSplitter.Document> subChunks = new ArrayList<MarkdownHeaderTextSplitter.Document>();
        int position = 0;
        while (position < content.length()) {
            boolean isFirstChunk = subChunks.isEmpty();
            int window = isFirstChunk ? this.maxSize : this.maxSize - this.overSize;
            StandardChunk standardChunk = this.getStandardChunk(content, position, window);

            MarkdownHeaderTextSplitter.Document doc = new MarkdownHeaderTextSplitter.Document(
                    standardChunk.content, new HashMap<String, String>(chunk.getMetadata()));
            doc.getMetadata().put(CHUNK_SEQUENCE_KEY, String.valueOf(subChunks.size() + 1));

            if (!isFirstChunk) {
                MarkdownHeaderTextSplitter.Document prevDoc = subChunks.get(subChunks.size() - 1);
                String overlapContent = this.getOverlapContent(prevDoc.getPageContent());
                // An empty overlap would only add a stray leading space.
                if (overlapContent != null && !overlapContent.isEmpty()) {
                    doc.setPageContent(overlapContent + " " + doc.getPageContent());
                }
            }
            subChunks.add(doc);
            // getStandardChunk always returns a position beyond the current one,
            // so this loop terminates.
            position = standardChunk.nextPosition;
        }
        return subChunks;
    }

    /**
     * Extracts the next sub-chunk starting at {@code startPosition}.
     *
     * <p>The cut is placed after the best delimiter inside the window, or at the
     * window end when the window contains no delimiter. The delimiter search is
     * applied even when the remaining text fits into the window, so a tail may
     * still be cut at its last delimiter.
     *
     * @param content       the full text being split
     * @param startPosition index of the first char of the sub-chunk;
     *                      must be less than {@code content.length()}
     * @param maxLength     requested window size; values below 1 are treated as 1
     *                      so that the caller always makes progress
     * @return the trimmed sub-chunk text and the index at which the next one starts
     *         (always greater than {@code startPosition})
     */
    private StandardChunk getStandardChunk(String content, int startPosition, int maxLength) {
        int window = Math.max(1, maxLength);
        // long arithmetic: startPosition + window must not overflow for huge windows.
        int endPosition = (int)Math.min((long)content.length(), (long)startPosition + (long)window);
        int splitPoint = this.findLastDelimiterInOrder(content, startPosition, endPosition);
        if (splitPoint <= startPosition) {
            splitPoint = this.findSafeBreak(content, startPosition, endPosition);
        }
        return new StandardChunk(content.substring(startPosition, splitPoint).trim(), splitPoint);
    }

    /**
     * Finds the cut position for the highest-priority delimiter kind present in
     * {@code [startPos, endPos)}; within that kind the last occurrence wins.
     *
     * @return the index just after the chosen delimiter, or {@code -1} if the
     *         range contains no delimiter
     */
    private int findLastDelimiterInOrder(String content, int startPos, int endPos) {
        for (Delimiter delimiter : Delimiter.values()) {
            int splitPoint = this.findLastDelimiter(content, startPos, endPos, delimiter);
            if (splitPoint > startPos) {
                return splitPoint;
            }
        }
        return -1;
    }

    /**
     * Computes the text repeated at the start of the next sub-chunk.
     *
     * <p>Content no longer than {@code overSize} is returned whole. Otherwise the
     * last {@code overSize} chars are taken and, if they contain a delimiter that
     * is not their final char, everything up to and including the first
     * occurrence of the highest-priority delimiter kind is dropped so the overlap
     * starts on a natural boundary.
     *
     * @param content page content of the previous sub-chunk, may be {@code null}
     * @return the overlap text (possibly empty), or {@code null} when there is
     *         nothing to overlap
     */
    private String getOverlapContent(String content) {
        if (content == null || this.overSize <= 0) {
            return null;
        }
        if (content.length() <= this.overSize) {
            return content;
        }
        int from = content.length() - this.overSize;
        // Do not start the overlap in the middle of a surrogate pair.
        if (Character.isLowSurrogate(content.charAt(from)) && Character.isHighSurrogate(content.charAt(from - 1))) {
            ++from;
        }
        String searchContent = content.substring(from);
        int startPoint = this.findFirstDelimiterInOrder(searchContent);
        if (startPoint >= 0 && startPoint != searchContent.length()) {
            return searchContent.substring(startPoint).trim();
        }
        return searchContent.trim();
    }

    /**
     * Finds the first occurrence of the highest-priority delimiter kind present
     * in {@code content}.
     *
     * @return the index just after that delimiter, or {@code -1} if none exists
     */
    private int findFirstDelimiterInOrder(String content) {
        for (Delimiter delimiter : Delimiter.values()) {
            for (int i = 0; i < content.length(); ++i) {
                if (delimiter.matches(content.charAt(i))) {
                    return i + 1;
                }
            }
        }
        return -1;
    }

    /**
     * Scans {@code [startPos, endPos)} backwards for {@code delimiter}.
     *
     * @return the index just after the last match, or {@code -1} if there is none
     */
    private int findLastDelimiter(String content, int startPos, int endPos, Delimiter delimiter) {
        for (int i = Math.min(endPos, content.length()) - 1; i >= startPos; --i) {
            if (delimiter.matches(content.charAt(i))) {
                return i + 1;
            }
        }
        return -1;
    }

    /**
     * Picks a cut position at or next to {@code targetPos} that does not separate
     * a UTF-16 surrogate pair.
     *
     * <p>Java strings are sequences of UTF-16 code units, so the only unsafe cut
     * is between a high and a low surrogate. Such a cut is moved one char back,
     * or one char forward when moving back would return {@code startPos} and
     * stall the caller.
     *
     * @param content   the full text being split
     * @param startPos  start of the current sub-chunk
     * @param targetPos desired cut position, {@code startPos < targetPos}
     * @return a cut position in {@code (startPos, content.length()]}
     */
    private int findSafeBreak(String content, int startPos, int targetPos) {
        if (targetPos >= content.length()) {
            return content.length();
        }
        boolean splitsPair = Character.isLowSurrogate(content.charAt(targetPos))
                && Character.isHighSurrogate(content.charAt(targetPos - 1));
        if (!splitsPair) {
            return targetPos;
        }
        return targetPos - 1 > startPos ? targetPos - 1 : targetPos + 1;
    }

    /** Result of one cut: the trimmed sub-chunk text and where the next cut starts. */
    private static class StandardChunk {
        final String content;
        final int nextPosition;

        StandardChunk(String content, int nextPosition) {
            this.content = content;
            this.nextPosition = nextPosition;
        }
    }

    /**
     * Kinds of characters after which text may be cut, declared from most to
     * least preferred; the declaration order is the search priority.
     */
    private static enum Delimiter {
        /** Line feed. */
        LINE_BREAK("\n"),
        /** Full-width and ASCII sentence terminators. */
        SENTENCE_END("\u3002\uff01\uff1f.!?"),
        /** Full-width and ASCII commas and semicolons, plus the ideographic comma. */
        SOFT_BREAK("\uff0c\uff1b,;\u3001"),
        /** ASCII space. */
        SPACE(" ");

        /** The chars belonging to this delimiter kind. */
        private final String chars;

        private Delimiter(String chars) {
            this.chars = chars;
        }

        /** Returns whether {@code c} belongs to this delimiter kind. */
        boolean matches(char c) {
            return this.chars.indexOf(c) >= 0;
        }
    }
}

