/*
 * Decompiled with CFR 0.152.
 */
package kd.bos.gptas.common.splitter.example;

import java.util.Arrays;
import java.util.List;
import kd.bos.gptas.common.splitter.LargeMarkdownSplitter;
import kd.bos.gptas.common.splitter.MarkdownHeaderTextSplitter;

public class LargeMarkdownSplitterExample {
    public static void main(String[] args) {
        List<MarkdownHeaderTextSplitter.HeaderConfig> headers = Arrays.asList(new MarkdownHeaderTextSplitter.HeaderConfig("#", "header_1"), new MarkdownHeaderTextSplitter.HeaderConfig("##", "header_2"), new MarkdownHeaderTextSplitter.HeaderConfig("###", "header_3"));
        LargeMarkdownSplitter splitter = new LargeMarkdownSplitter(headers, 400, 100);
        String markdownText = LargeMarkdownSplitterExample.getText();
        List<MarkdownHeaderTextSplitter.Document> documents = ((MarkdownHeaderTextSplitter)splitter).splitText(markdownText);
        documents.forEach(doc -> {
            System.out.println("Content: " + doc.getPageContent());
            System.out.println("Metadata: " + doc.getMetadata());
            System.out.println("---");
        });
    }

    private static String getText() {
        return "# BOS-GPTAS-API\n\nBOS-GPTAS-API \u662f\u4e00\u4e2a\u4f01\u4e1a\u7ea7\u667a\u80fd\u670d\u52a1\u6846\u67b6\uff0c\u63d0\u4f9b\u5411\u91cf\u6570\u636e\u5904\u7406\u548c\u5927\u8bed\u8a00\u6a21\u578b\u96c6\u6210\u80fd\u529b\u3002\u672c\u6587\u6863\u4e3b\u8981\u4ecb\u7ecd\u5411\u91cf\u670d\u52a1\u90e8\u5206\uff0c\u5305\u62ec\u6587\u672c\u5411\u91cf\u5316\u5b58\u50a8\u3001\u76f8\u4f3c\u5185\u5bb9\u68c0\u7d22\u4ee5\u53ca\u6279\u91cf\u5904\u7406\u529f\u80fd\u3002\n\n\n\n## 1. \u5411\u91cf\u670d\u52a1 (Vector Service)\n\n\n\n### 1.1 \u529f\u80fd\u7279\u6027\n\n- **\u591a\u6a21\u578b\u652f\u6301**\uff1a\u96c6\u6210\u591a\u79cd\u4e3b\u6d41Embedding\u6a21\u578b\n  - Azure Embedding ADA 002 (1536\u7ef4)\n  - Baidu Embedding V1 (384\u7ef4)\n  - Baidu BGE Large ZH (1024\u7ef4)\n  - Baidu TAO 8K (1024\u7ef4)\n  - Kingdee Embedding (768\u7ef4)\n- **\u5411\u91cf\u5316\u5904\u7406**\uff1a\n  - \u652f\u6301\u5355\u6761\u548c\u6279\u91cf\u6587\u672c\u5411\u91cf\u5316\n  - \u5f02\u6b65\u5904\u7406\u673a\u5236\n  - \u4efb\u52a1\u8fdb\u5ea6\u8ddf\u8e2a\n- **\u76f8\u4f3c\u5185\u5bb9\u68c0\u7d22**\uff1a\n  - \u652f\u6301\u5168\u5c40\u548c\u6307\u5b9a\u8303\u56f4\u7684\u5411\u91cf\u68c0\u7d22\n  - \u652f\u6301TopK\u7ed3\u679c\u8fc7\u6ee4\n  - \u63d0\u4f9b\u76f8\u4f3c\u5ea6\u8bc4\u5206\n- **\u6570\u636e\u7ba1\u7406**\uff1a\u5b8c\u6574\u7684\u5411\u91cf\u6570\u636e\u751f\u547d\u5468\u671f\u7ba1\u7406\n\n\n\n\n\n### 1.2 API \u63a5\u53e3\n\n\u670d\u52a1\u63a5\u53e3\u901a\u8fc7 `VectorService` \u63d0\u4f9b\uff0c\u53ef\u901a\u8fc7 `VectorService.create(EmbeddingModel.xx)` \u83b7\u53d6\u5b9e\u4f8b\u3002\n\n#### 1.2.1 \u76f8\u4f3c\u5185\u5bb9\u68c0\u7d22\n```java\n// \u5168\u5c40\u8303\u56f4\u641c\u7d22\nList<VectorResult> search(\n    List<String> formIds,      // \u8868\u5355ID\u5217\u8868\n    String context,            // \u641c\u7d22\u6587\u672c\n    int topK                   // \u8fd4\u56de\u6570\u91cf\n);\n\n// \u6307\u5b9achunk\u8303\u56f4\u641c\u7d22\nList<VectorResult> search(\n    List<String> formIds,      // \u8868\u5355ID\u5217\u8868\n    List<String> chunkIds,     // chunk ID\u5217\u8868\n    String context,            // \u641c\u7d22\u6587\u672c\n    int topK                   // \u8fd4\u56de\u6570\u91cf\n);\n```\n\n#### 1.2.2 \u5411\u91cf\u5316\u5b58\u50a8\n```java\n// \u5355\u6761\u5b58\u50a8\nvoid save(\n    String formId,            // \u8868\u5355ID\n    Long chunkId,             // \u6570\u636e\u5757ID\n    String context,           // \u6587\u672c\u5185\u5bb9\n);\n\n// \u7ed3\u6784\u5316\u5b58\u50a8\nvoid save(\n    Chunk chunk,              // \u6570\u636e\u5757\u5bf9\u8c61\n);\n\n// \u63d0\u4ea4\u6279\u91cf\u5b58\u50a8\u4efb\u52a1\nString submitSaveTask(\n    List<Chunk> chunks,           // \u6570\u636e\u5757\u5217\u8868\n    Consumer<List<Chunk>> resultCallBack  // \u7ed3\u679c\u56de\u8c03\n);\n\n// \u4efb\u52a1\u8fdb\u5ea6\u67e5\u8be2\nVectorTask getSaveTask(String taskId);\n```\n\n#### 1.2.3 \u6570\u636e\u7ba1\u7406\n```java\n// \u5220\u9664\u5411\u91cf\u6570\u636e\nvoid delete(\n    List<Long> chunkIds,          // \u6570\u636e\u5757ID\u5217\u8868\n);\n```\n\n\n\n### 1.3 \u4f7f\u7528\u793a\u4f8b\n\n```java\n// \u83b7\u53d6\u670d\u52a1\u5b9e\u4f8b\nVectorService vectorService = VectorService.create(EmbeddingModel.BAIDU_EMBEDDING_V1);\n\n// \u5b58\u50a8\u5411\u91cf\u5316\u6587\u672c\nChunk chunk = new Chunk(1000L, \"FORM_001\", \"\u793a\u4f8b\u6587\u672c\u5185\u5bb9\");\nvectorService.save(chunk);\n\n// \u6279\u91cf\u5904\u7406\nString taskId = vectorService.submitSaveTask(\n    chunkList,\n    successChunks -> {\n        // \u5904\u7406\u6210\u529f\u7684\u6570\u636e\u5757\n        successChunks.forEach(c -> System.out.println(\"\u5904\u7406\u6210\u529f: \" + c.getId()));\n    }\n);\n\n// \u67e5\u8be2\u4efb\u52a1\u8fdb\u5ea6\nVectorTask task = vectorService.searchVectorTask(taskId);\nSystem.out.printf(\"\u5904\u7406\u8fdb\u5ea6: %d/%d%n\", \n    task.getCompletedChunkCount(), \n    task.getTotalChunkCount());\n\n// \u76f8\u4f3c\u5185\u5bb9\u641c\u7d22\nList<VectorResult> results = vectorService.search(\n    Arrays.asList(\"FORM_001\"),\n    \"\u67e5\u8be2\u6587\u672c\",\n    5  // topK\n);\n```\n\n\n\n### 1.4 \u5de5\u7a0b\u7ed3\u6784\n\n```mermaid\nclassDiagram\n    class VectorService {\n        <<interface>>\n        +search(formIds, context, topK)\n        +search(formIds, chunkIds, context, topK)\n        +save(formId, chunkId, context)\n        +save(chunk)\n        +submitSaveTask(chunks, resultCallBack)\n        +getSaveTask(taskId)\n        +delete(chunkIds)\n    }\n    \n    class VectorTask {\n        +String taskId\n        +VectorTaskStatus status\n        +int totalChunkCount\n        +int completedChunkCount\n        +int successChunkCount\n        +int failedChunkCount\n        +Set<Long> knowledgeIds\n        +long startTime\n        +long updateTime\n    }\n    \n    class VectorTaskItem {\n        +String chunkId\n        +VectorTaskItemStatus status\n        +String traceId\n        +long updateTime\n    }\n    \n    class Chunk {\n        +Long id\n        +String formId\n        +String context\n        +Long knowledgeId\n    }\n    \n    class VectorResult {\n        +Chunk chunk\n        +float score\n        +int rank\n    }\n    \n    class EmbeddingModel {\n        <<enumeration>>\n        +AZURE_EMBEDDING_ADA_002\n        +BAIDU_EMBEDDING_V1\n        +BAIDU_EMBEDDING_BGE_LARGE_ZH\n        +BAIDU_EMBEDDING_TAO_8K\n        +KINGDEE_EMBEDDING\n        -int dimension\n        -String modelName\n        -String desc\n    }\n    \n    VectorService ..> Chunk\n    VectorService ..> EmbeddingModel\n    VectorService ..> VectorTask\n    VectorResult --> Chunk\n    VectorService ..> VectorResult\n    VectorTask ..> VectorTaskItem\n```\n\n\n\n### 1.5 \u6838\u5fc3\u6a21\u578b\n\n#### 1.5.1 Chunk\uff08\u6570\u636e\u5757\uff09\n\n\u6587\u672c\u5757\u7684\u57fa\u7840\u6570\u636e\u7ed3\u6784\uff0c\u7528\u4e8e\u5411\u91cf\u5316\u5904\u7406\uff1a\n\n```java\npublic class Chunk {\n    Long id;          // \u5411\u91cf\u552f\u4e00\u6807\u8bc6\n    String formId;    // \u8868\u5355/\u4ed3\u5e93\u6807\u8bc6\n    String context;   // \u539f\u59cb\u6587\u672c\u5185\u5bb9\n    Long knowledgeId;     // \u77e5\u8bc6\u5e93ID\n}\n```\n\n#### 1.5.2 VectorTask\uff08\u5411\u91cf\u5316\u4efb\u52a1\uff09\n\n\u6279\u91cf\u5904\u7406\u4efb\u52a1\u7684\u72b6\u6001\u548c\u8fdb\u5ea6\u8ddf\u8e2a\uff1a\n\n```java\npublic class VectorTask {\n    String taskId;           // \u4efb\u52a1\u6807\u8bc6\n    VectorTaskStatus status; // \u4efb\u52a1\u72b6\u6001\n    int totalChunkCount;     // \u603b\u6570\u636e\u5757\u6570\n    int completedChunkCount; // \u5df2\u5b8c\u6210\u6570\n    int successChunkCount;   // \u6210\u529f\u6570\n    int failedChunkCount;    // \u5931\u8d25\u6570\n    long startTime;          // \u5f00\u59cb\u65f6\u95f4\n    long updateTime;         // \u66f4\u65b0\u65f6\u95f4\n}\n```\n\n#### 1.5.3 VectorResult\uff08\u68c0\u7d22\u7ed3\u679c\uff09\n\n\u5411\u91cf\u76f8\u4f3c\u5ea6\u67e5\u8be2\u7ed3\u679c\uff1a\n\n```java\npublic class VectorResult {\n    Chunk chunk;    // \u5339\u914d\u7684\u6570\u636e\u5757\n    float score;    // \u76f8\u4f3c\u5ea6\u8bc4\u5206\n    int rank;       // \u6392\u540d\n}\n```\n\n\n\n### 1.6 \u6ce8\u610f\u4e8b\u9879\n\n1. \u5411\u91cf\u6a21\u578b\u9009\u62e9\uff1a\n   - \u4e0d\u540c\u6a21\u578b\u7684\u5411\u91cf\u7ef4\u5ea6\u4e0d\u540c\uff0c\u9700\u8981\u6839\u636e\u5e94\u7528\u573a\u666f\u9009\u62e9\u5408\u9002\u7684\u6a21\u578b\n   - \u5efa\u8bae\u5728\u540c\u4e00\u5e94\u7528\u573a\u666f\u4e2d\u4fdd\u6301\u6a21\u578b\u4e00\u81f4\u6027\n\n2. \u6279\u91cf\u5904\u7406\uff1a\n   - \u652f\u6301\u5f02\u6b65\u5904\u7406\uff0c\u901a\u8fc7taskId\u8ddf\u8e2a\u8fdb\u5ea6\n   - \u56de\u8c03\u51fd\u6570\u5728\u5e76\u53d1\u73af\u5883\u4e0b\u8c03\u7528\uff0c\u9700\u786e\u4fdd\u7ebf\u7a0b\u5b89\u5168\n   - \u5efa\u8bae\u5b9e\u73b0\u9519\u8bef\u91cd\u8bd5\u673a\u5236\n\n3. \u670d\u52a1\u8bbf\u95ee\uff1a\n   - \u4f5c\u4e3a\u5185\u90e8\u5fae\u670d\u52a1\uff0c\u987b\u901a\u8fc7ServiceFactory\u83b7\u53d6\u5b9e\u4f8b\n   - \u5efa\u8bae\u5728\u5e94\u7528\u542f\u52a8\u65f6\u8fdb\u884c\u670d\u52a1\u8fde\u63a5\u6d4b\u8bd5\n\n\n\n## 2. LLM\u670d\u52a1 (Language Model Service)\n\n\u5927\u8bed\u8a00\u6a21\u578b\u670d\u52a1\u63d0\u4f9b\u7edf\u4e00\u7684\u6a21\u578b\u8c03\u7528\u63a5\u53e3\uff0c\u652f\u6301\u6d41\u5f0f\u8f93\u51fa\u548c\u6807\u8bb0\u8bc6\u522b\u7b49\u9ad8\u7ea7\u7279\u6027\u3002\n\n\n\n### 2.1 \u529f\u80fd\u7279\u6027\n\n- **\u7edf\u4e00\u8c03\u7528\u63a5\u53e3**\uff1a\u652f\u6301\u591a\u79cdLLM\u6a21\u578b\u7684\u7edf\u4e00\u8bbf\u95ee\n- **\u6d41\u5f0f\u8f93\u51fa**\uff1a\u652f\u6301\u5b9e\u65f6\u6d41\u5f0f\u8fd4\u56de\u6a21\u578b\u8f93\u51fa\n- **\u6807\u8bb0\u8bc6\u522b**\uff1a\n  - \u652f\u6301\u81ea\u5b9a\u4e49\u6807\u8bb0\u7684\u8bc6\u522b\u548c\u63d0\u53d6\n  - \u652f\u6301\u5d4c\u5957\u6807\u8bb0\u5904\u7406\n  - \u652f\u6301\u4f18\u5148\u7ea7\u914d\u7f6e\n- **\u7075\u6d3b\u6269\u5c55**\uff1a\n  - \u652f\u6301\u81ea\u5b9a\u4e49\u53c2\u6570\u4f20\u9012\n  - \u652f\u6301\u8f93\u51fa\u5185\u5bb9\u7684\u540e\u5904\u7406\n\n\n\n### 2.2 API \u63a5\u53e3\n\n\u670d\u52a1\u63a5\u53e3\u901a\u8fc7 `LLMService` \u63d0\u4f9b\uff0c\u53ef\u901a\u8fc7 `LLMService.create()` \u83b7\u53d6\u5b9e\u4f8b\u3002\n\n#### 2.2.1 \u57fa\u7840\u6a21\u578b\u8c03\u7528\n```java\nMessageStream llm(\n    String input,                // \u8f93\u5165\u6587\u672c\n    Map<String, Object> customParams  // \u81ea\u5b9a\u4e49\u53c2\u6570\n);\n```\n\n#### 2.2.2 \u6807\u8bb0\u611f\u77e5\u5904\u7406\n```java\nMarkedMessageStream markerAware(\n    MessageStream messageStream,  // \u539f\u59cb\u6d88\u606f\u6d41\n    List<Marker> markers         // \u6807\u8bb0\u5b9a\u4e49\u5217\u8868\n);\n```\n\n\n\n### 2.3 \u4f7f\u7528\u793a\u4f8b\n\n#### 2.3.1 \u57fa\u7840\u6d41\u5f0f\u8c03\u7528\n```java\n// \u83b7\u53d6\u670d\u52a1\u5b9e\u4f8b\nLLMService llmService = LLMService.create();\n\n// \u57fa\u7840\u8c03\u7528\nMap<String, Object> params = new HashMap<>();\nparams.put(\"promptId\", 1000); // promptId \u4e3a\u7528\u6237\u914d\u7f6e\u7684GPT\u63d0\u793a\u8bcdID (long)\nMessageStream stream = llmService.llm(\"\u8bf7\u4ecb\u7ecd\u4e00\u4e0b\u4f60\u81ea\u5df1\", params);\n\n// \u4f7f\u7528\u8fed\u4ee3\u5668\u5904\u7406\u8f93\u51fa\nwhile(stream.hasNext()) {\n    System.out.print(stream.next());\n}\n\n// \u6216\u8005\u76f4\u63a5\u6253\u5370\u5168\u90e8\u5185\u5bb9\nstream.print();\n```\n\n\n\n#### 2.3.2 \u6807\u8bb0\u8bc6\u522b\u5904\u7406\n\n```java\n// \u5b9a\u4e49\u6807\u8bb0\nList<Marker> markers = Arrays.asList(\n    new Marker(\"<kn>\", \"</kn>\"),  // \u77e5\u8bc6\u6807\u8bb0\n    new Marker(\"<code>\", \"</code>\")        // \u4ee3\u7801\u6807\u8bb0\n);\n\n// \u83b7\u53d6\u652f\u6301\u6807\u8bb0\u7684\u6d88\u606f\u6d41\nMarkedMessageStream markedStream = llmService.markerAware(stream, markers);\n\n// \u6d41\u5f0f\u5904\u7406\u5e26\u6807\u8bb0\u7684\u5185\u5bb9\nmarkedStream.stream().forEach(pair -> {\n    if (pair.maker()) {\n        System.out.println(\"\u53d1\u73b0\u6807\u8bb0\u5185\u5bb9: \" + pair.getContent());\n    } else {\n        System.out.print(pair.getContent());\n    }\n});\n\n// \u83b7\u53d6\u6240\u6709\u6807\u8bb0\u5185\u5bb9\nList<MarkerContent> markerContents = markedStream.getAllMarkerContents();\nmarkerContents.forEach(content -> {\n    System.out.println(\"\u6807\u8bb0: \" + content.getMarker().getStartTag());\n    System.out.println(\"\u5185\u5bb9: \" + content.getContent());\n});\n```\n\n\n\n### 2.4 \u5de5\u7a0b\u7ed3\u6784\n\n```mermaid\nclassDiagram\n    class LLMService {\n        <<interface>>\n        +llm(input, customParams)\n        +markerAware(messageStream, markers)\n    }\n    \n    class MessageStream {\n        <<interface>>\n        +fetch()\n        +hasNext()\n        +next()\n    }\n    \n    class MarkedMessageStream {\n        <<interface>>\n        +hasNext()\n        +next()\n        +stream()\n        +getAllMarkerContents()\n    }\n    \n    class Marker {\n        +String startTag\n        +String endTag\n        +boolean isStream\n    }\n    \n    class MarkerContent {\n        +Marker marker\n        +boolean end\n        +String content\n    }\n    \n    class MarkerPair~K,V~ {\n        +K content\n        +V markerContent\n    }\n    \n    LLMService ..> MessageStream\n    LLMService ..> MarkedMessageStream\n    MarkedMessageStream ..> MarkerPair\n    MarkerPair ..> MarkerContent\n    MarkerContent ..> Marker\n```\n\n\n\n### 2.5 \u6838\u5fc3\u6a21\u578b\n\n\n\n#### 2.5.1 MessageStream\uff08\u6d88\u606f\u6d41\uff09\n\n\u57fa\u7840\u6d88\u606f\u6d41\u63a5\u53e3\uff0c\u652f\u6301\u8fed\u4ee3\u548c\u6d41\u5f0f\u83b7\u53d6\uff1a\n\n```java\npublic interface MessageStream extends Iterator<String> {\n    String fetch();             // \u83b7\u53d6\u5b8c\u6574\u6d88\u606f\n}\n```\n\n\n\n#### 2.5.2 Marker\uff08\u6807\u8bb0\u5b9a\u4e49\uff09\n\n\u7528\u4e8e\u5b9a\u4e49\u9700\u8981\u8bc6\u522b\u7684\u6587\u672c\u6807\u8bb0\uff1a\n\n```java\npublic class Marker {\n    String startTag;    // \u8d77\u59cb\u6807\u8bb0\n    String endTag;      // \u7ed3\u675f\u6807\u8bb0\n    boolean isStream;   // \u662f\u5426\u6d41\u5f0f\u5904\u7406\n}\n```\n\n\n\n#### 2.5.3 MarkerContent\uff08\u6807\u8bb0\u5185\u5bb9\uff09\n\n\u6807\u8bb0\u5bf9\u5e94\u7684\u5185\u5bb9\u6570\u636e\uff1a\n\n```java\npublic class MarkerContent {\n    Marker marker;    // \u5173\u8054\u7684\u6807\u8bb0\n    boolean end;      // \u662f\u5426\u7ed3\u675f\u6807\u8bb0\n    String content;   // \u6807\u8bb0\u5185\u5bb9\n}\n```\n\n\n\n#### 2.5.4 MarkedMessageStream\uff08\u5e26\u6807\u8bb0\u7684\u6d88\u606f\u6d41\uff09\n\n\u652f\u6301\u6807\u8bb0\u8bc6\u522b\u7684\u589e\u5f3a\u6d88\u606f\u6d41\uff1a\n\n```java\npublic interface MarkedMessageStream extends Iterator<MarkerPair<String, MarkerContent>> {\n    Stream<MarkerPair<String, MarkerContent>> stream();  // \u6d41\u5f0f\u5904\u7406\n    List<MarkerContent> getAllMarkerContents();          // \u83b7\u53d6\u6240\u6709\u6807\u8bb0\u5185\u5bb9\n}\n```\n\n\n\n### 2.6 \u6ce8\u610f\u4e8b\u9879\n\n1. \u6807\u8bb0\u5904\u7406\uff1a\n   - \u6807\u8bb0\u5fc5\u987b\u914d\u5bf9\u4f7f\u7528\n   - **\u4e0d\u652f\u6301\u6807\u8bb0\u96c6\u5408\u4e2d\u6709\u5305\u542b\u5173\u7cfb** ( \"code\" \u4e0e \"co\" \u6709\u5305\u542b\u5173\u7cfb)\n   - **\u4e0d\u652f\u6301\u6807\u8bb0\u5d4c\u5957**\n\n2. \u6d41\u5f0f\u5904\u7406\uff1a\n   - **MessageStream\u662f\u4e00\u6b21\u6027\u7684\uff0c\u4e0d\u652f\u6301\u91cd\u590d\u8bfb\u53d6**\n   - \u5efa\u8bae\u6839\u636e\u5b9e\u9645\u9700\u6c42\u9009\u62e9\u5408\u9002\u7684\u5904\u7406\u65b9\u5f0f\uff08\u8fed\u4ee3/\u6d41\u5f0f\uff09\n\n3. \u81ea\u5b9a\u4e49\u53c2\u6570\uff1a\n   - **\u53c2\u6570\u540d\u79f0\u9700\u8981\u4e0e\u6a21\u578b\u652f\u6301\u7684\u53c2\u6570\u4fdd\u6301\u4e00\u81f4**\n   - \u5efa\u8bae\u53c2\u8003\u5177\u4f53\u6a21\u578b\u7684\u53c2\u6570\u8bf4\u660e\n\n4. \u9519\u8bef\u5904\u7406\uff1a\n   - \u5efa\u8bae\u5b9e\u73b0\u5f02\u5e38\u5904\u7406\u673a\u5236\n   - \u5bf9\u4e8e\u957f\u6587\u672c\u5904\u7406\uff0c\u5efa\u8bae\u5b9e\u73b0\u8d85\u65f6\u63a7\u5236\n\n";
    }
}

