/*
 * Decompiled with CFR 0.152.
 */
package kd.ai.gai.core.rag.parser;

import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.LinkedList;
import kd.ai.gai.core.domain.dto.Chunk;
import kd.ai.gai.core.enuz.repo.SpliteType;
import kd.ai.gai.core.rag.Sentence;
import kd.ai.gai.core.rag.SplitConfig;
import kd.ai.gai.core.rag.chunk.ChunkInput;
import kd.ai.gai.core.rag.parser.AbstractParser;
import kd.ai.gai.core.rag.split.AbstractSplitter;
import kd.ai.gai.core.rag.split.SplitFactory;
import kd.ai.gai.core.service.ChunkService;
import kd.bos.fileservice.FileServiceFactory;
import kd.bos.logging.Log;
import kd.bos.logging.LogFactory;
import kd.bos.util.StringUtils;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;

/**
 * Parser for PDF attachments.
 *
 * <p>Text is extracted one page at a time, split into sentences by the sentence splitter and
 * merged into {@link Chunk}s, which are then handed to {@link ChunkService} for insertion.
 */
public class PdfBaseParser extends AbstractParser {
    private static final Log LOGGER = LogFactory.getLog(PdfBaseParser.class);

    /** Splitter obtained for {@link SpliteType#SENTENCE}; used to split page text and to merge sentences. */
    private final AbstractSplitter splitter = SplitFactory.getSplit(SpliteType.SENTENCE);

    /**
     * Reads the PDF referenced by {@code file}, turns its text into chunks and inserts them.
     *
     * <p>For every page with non-empty text the page is split into sentences. If chunks from
     * earlier pages are pending and the last of them is either incomplete or short enough to be
     * combined with the first sentence of the current page, that chunk is taken back out of the
     * pending list and re-fed to the merge step as the leading sentence, so that content spanning
     * a page break can end up in one chunk.
     *
     * @param file describes the attachment path, repository/file ids and chunk configuration
     * @return always {@code true} when processing finishes without an exception
     * @throws IOException if the attachment cannot be read or parsed, or a resource fails to close
     */
    @Override
    public boolean opreate(ChunkInput file) throws IOException {
        LinkedList<Chunk> pdfAllChunks = new LinkedList<>();
        // The attachment stream is declared as its own resource: PDDocument.load(InputStream)
        // only reads from it, so it must be closed here to avoid leaking the handle.
        try (InputStream pdfStream = (InputStream) FileServiceFactory.getAttachmentFileService().getInputStream(file.getFilePath());
             PDDocument document = PDDocument.load(pdfStream)) {
            PDFTextStripper pdfStripper = new PDFTextStripper();
            pdfStripper.setSortByPosition(true);
            int maxChunkLen = file.getChunkConfig().getLength();
            long repositoryId = file.getRepositoryId();
            long fileId = file.getFileId();
            int pageCount = document.getNumberOfPages();
            for (int page = 1; page <= pageCount; ++page) {
                // Restrict the stripper to exactly one page per extraction.
                pdfStripper.setStartPage(page);
                pdfStripper.setEndPage(page);
                String pageContent = pdfStripper.getText(document);
                if (!StringUtils.isNotEmpty(pageContent)) {
                    LOGGER.info("file:{} ,\u7b2c{}\u9875\u975e\u53ef\u89e3\u6790\u5185\u5bb9\uff0c\u8bfb\u53d6\u4e3a\u7a7a", (Object)file.getFilePath(), (Object)page);
                    continue;
                }

                // NOTE(review): when earlier chunks exist but no carry-over happens below, the
                // start order stays 1 for this page; confirm against the splitter's merge
                // contract whether that is intended or should continue from the last order.
                int startChunkOrder = 1;
                ArrayList<Sentence> sentences = this.splitter.split(page, pageContent);
                if (!pdfAllChunks.isEmpty() && !sentences.isEmpty()) {
                    Chunk lastChunk = pdfAllChunks.getLast();
                    // Carry the previous chunk over when it is incomplete, or when it is complete
                    // but still fits together with the first sentence of this page.
                    boolean carryOver = !lastChunk.isChunkComplete()
                            || (lastChunk.getChunk() + sentences.get(0).getContent()).length() <= maxChunkLen;
                    if (carryOver) {
                        startChunkOrder = lastChunk.getOrder();
                        sentences.add(0, new Sentence(lastChunk.getPage(), lastChunk.getChunk(), lastChunk.isChunkComplete()));
                        pdfAllChunks.removeLast();
                        // NOTE(review): presumably a conditional flush of the pending chunks;
                        // verify against ChunkService.batchConditionInsert.
                        ChunkService.batchConditionInsert(pdfAllChunks);
                    }
                }
                if (sentences.isEmpty()) {
                    continue;
                }
                SplitConfig splitConfig = new SplitConfig(repositoryId, fileId, startChunkOrder, maxChunkLen);
                pdfAllChunks.addAll(this.splitter.merge(splitConfig, sentences));
            }
            ChunkService.batchInsert(pdfAllChunks);
        }
        LOGGER.info("file:{}\u6587\u4ef6chunk\u5904\u7406\u5b8c\u6210,\u5171{}chunk", (Object)file.getFilePath(), (Object)pdfAllChunks.size());
        return true;
    }
}

