package kd.ai.gai.core.rag.parser;

import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.LinkedList;
import kd.ai.gai.core.domain.dto.Chunk;
import kd.ai.gai.core.enuz.repo.SpliteType;
import kd.ai.gai.core.rag.Sentence;
import kd.ai.gai.core.rag.SplitConfig;
import kd.ai.gai.core.rag.chunk.ChunkInput;
import kd.ai.gai.core.rag.split.AbstractSplitter;
import kd.ai.gai.core.rag.split.SplitFactory;
import kd.ai.gai.core.service.ChunkService;
import kd.bos.fileservice.FileServiceFactory;
import kd.bos.logging.Log;
import kd.bos.logging.LogFactory;
import kd.bos.util.StringUtils;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;

/* loaded from: input_file:kd/ai/gai/core/rag/parser/PdfBaseParser.class */
public class PdfBaseParser extends AbstractParser {
    private static Log LOGGER = LogFactory.getLog(PdfBaseParser.class);
    private AbstractSplitter splitter = SplitFactory.getSplit(SpliteType.SENTENCE);

    @Override // kd.ai.gai.core.rag.parser.IParser
    public boolean opreate(ChunkInput chunkInput) throws IOException {
        LinkedList linkedList = new LinkedList();
        PDDocument load = PDDocument.load(FileServiceFactory.getAttachmentFileService().getInputStream(chunkInput.getFilePath()));
        Throwable th = null;
        try {
            try {
                PDFTextStripper pDFTextStripper = new PDFTextStripper();
                pDFTextStripper.setSortByPosition(true);
                int length = chunkInput.getChunkConfig().getLength();
                long repositoryId = chunkInput.getRepositoryId();
                long fileId = chunkInput.getFileId();
                for (int i = 1; i <= load.getNumberOfPages(); i++) {
                    pDFTextStripper.setStartPage(i);
                    pDFTextStripper.setEndPage(i);
                    String text = pDFTextStripper.getText(load);
                    if (StringUtils.isNotEmpty(text)) {
                        int i2 = 1;
                        ArrayList<Sentence> split = this.splitter.split(i, text);
                        if (!linkedList.isEmpty() && !split.isEmpty()) {
                            int size = linkedList.size() - 1;
                            Chunk chunk = (Chunk) linkedList.get(size);
                            if (!chunk.isChunkComplete() || (chunk.getChunk() + split.get(0).getContent()).length() <= length) {
                                i2 = chunk.getOrder();
                                split.add(0, new Sentence(chunk.getPage(), chunk.getChunk(), chunk.isChunkComplete()));
                                linkedList.remove(size);
                                ChunkService.batchConditionInsert(linkedList);
                            }
                        }
                        if (!split.isEmpty()) {
                            linkedList.addAll(this.splitter.merge(new SplitConfig(repositoryId, fileId, i2, length), split));
                        }
                    } else {
                        LOGGER.info("file:{} ,第{}页非可解析内容，读取为空", chunkInput.getFilePath(), Integer.valueOf(i));
                    }
                }
                ChunkService.batchInsert(linkedList);
                if (load != null) {
                    if (0 != 0) {
                        try {
                            load.close();
                        } catch (Throwable th2) {
                            th.addSuppressed(th2);
                        }
                    } else {
                        load.close();
                    }
                }
                LOGGER.info("file:{}文件chunk处理完成,共{}chunk", chunkInput.getFilePath(), Integer.valueOf(linkedList.size()));
                return true;
            } finally {
            }
        } catch (Throwable th3) {
            if (load != null) {
                if (th != null) {
                    try {
                        load.close();
                    } catch (Throwable th4) {
                        th.addSuppressed(th4);
                    }
                } else {
                    load.close();
                }
            }
            throw th3;
        }
    }
}
