diff --git a/yudao-module-ai/yudao-module-ai-biz/src/main/java/cn/iocoder/yudao/module/ai/controller/admin/knowledge/vo/knowledge/AiKnowledgeCreateMyReqVO.java b/yudao-module-ai/yudao-module-ai-biz/src/main/java/cn/iocoder/yudao/module/ai/controller/admin/knowledge/vo/knowledge/AiKnowledgeCreateMyReqVO.java index 44a5e87ee..58a89caee 100644 --- a/yudao-module-ai/yudao-module-ai-biz/src/main/java/cn/iocoder/yudao/module/ai/controller/admin/knowledge/vo/knowledge/AiKnowledgeCreateMyReqVO.java +++ b/yudao-module-ai/yudao-module-ai-biz/src/main/java/cn/iocoder/yudao/module/ai/controller/admin/knowledge/vo/knowledge/AiKnowledgeCreateMyReqVO.java @@ -25,4 +25,11 @@ public class AiKnowledgeCreateMyReqVO { @NotNull(message = "嵌入模型不能为空") private Long modelId; + @Schema(description = "相似性阈值", requiredMode = Schema.RequiredMode.REQUIRED, example = "0.5") + @NotNull(message = "相似性阈值不能为空") + private Double similarityThreshold; + + @Schema(description = "topK", requiredMode = Schema.RequiredMode.REQUIRED, example = "3") + @NotNull(message = "topK 不能为空") + private Integer topK; } diff --git a/yudao-module-ai/yudao-module-ai-biz/src/main/java/cn/iocoder/yudao/module/ai/controller/admin/knowledge/vo/knowledge/AiKnowledgeDocumentCreateReqVO.java b/yudao-module-ai/yudao-module-ai-biz/src/main/java/cn/iocoder/yudao/module/ai/controller/admin/knowledge/vo/knowledge/AiKnowledgeDocumentCreateReqVO.java index 9cc5290ab..651bdc0f7 100644 --- a/yudao-module-ai/yudao-module-ai-biz/src/main/java/cn/iocoder/yudao/module/ai/controller/admin/knowledge/vo/knowledge/AiKnowledgeDocumentCreateReqVO.java +++ b/yudao-module-ai/yudao-module-ai-biz/src/main/java/cn/iocoder/yudao/module/ai/controller/admin/knowledge/vo/knowledge/AiKnowledgeDocumentCreateReqVO.java @@ -23,4 +23,23 @@ public class AiKnowledgeDocumentCreateReqVO { @URL(message = "文档 URL 格式不正确") private String url; + @Schema(description = "每个文本块的目标 token 数", requiredMode = Schema.RequiredMode.REQUIRED, example = "800") + @NotNull(message = "每个文本块的目标 token 数不能为空") + private Integer defaultChunkSize; + + @Schema(description = "每个文本块的最小字符数", requiredMode = Schema.RequiredMode.REQUIRED, example = "350") + @NotNull(message = "每个文本块的最小字符数不能为空") + private Integer minChunkSizeChars; + + @Schema(description = "丢弃阈值", requiredMode = Schema.RequiredMode.REQUIRED, example = "5") + @NotNull(message = "丢弃阈值不能为空") + private Integer minChunkLengthToEmbed; + + @Schema(description = "最大块数", requiredMode = Schema.RequiredMode.REQUIRED, example = "10000") + @NotNull(message = "最大块数不能为空") + private Integer maxNumChunks; + + @Schema(description = "分块是否保留分隔符", requiredMode = Schema.RequiredMode.REQUIRED, example = "true") + @NotNull(message = "分块是否保留分隔符不能为空") + private Boolean keepSeparator; } diff --git a/yudao-module-ai/yudao-module-ai-biz/src/main/java/cn/iocoder/yudao/module/ai/dal/dataobject/knowledge/AiKnowledgeDO.java b/yudao-module-ai/yudao-module-ai-biz/src/main/java/cn/iocoder/yudao/module/ai/dal/dataobject/knowledge/AiKnowledgeDO.java index 756d8cdb3..5db631dd4 100644 --- a/yudao-module-ai/yudao-module-ai-biz/src/main/java/cn/iocoder/yudao/module/ai/dal/dataobject/knowledge/AiKnowledgeDO.java +++ b/yudao-module-ai/yudao-module-ai-biz/src/main/java/cn/iocoder/yudao/module/ai/dal/dataobject/knowledge/AiKnowledgeDO.java @@ -52,6 +52,18 @@ public class AiKnowledgeDO extends BaseDO { * 模型标识 */ private String model; + + /** + * topK + */ + private Integer topK; + + /** + * 相似度阈值 + */ + private Double similarityThreshold; + + /** * 状态 *
diff --git a/yudao-module-ai/yudao-module-ai-biz/src/main/java/cn/iocoder/yudao/module/ai/dal/dataobject/knowledge/AiKnowledgeDocumentDO.java b/yudao-module-ai/yudao-module-ai-biz/src/main/java/cn/iocoder/yudao/module/ai/dal/dataobject/knowledge/AiKnowledgeDocumentDO.java index c5e526cce..18fa46c3a 100644 --- a/yudao-module-ai/yudao-module-ai-biz/src/main/java/cn/iocoder/yudao/module/ai/dal/dataobject/knowledge/AiKnowledgeDocumentDO.java +++ b/yudao-module-ai/yudao-module-ai-biz/src/main/java/cn/iocoder/yudao/module/ai/dal/dataobject/knowledge/AiKnowledgeDocumentDO.java @@ -23,7 +23,7 @@ public class AiKnowledgeDocumentDO extends BaseDO { private Long id; /** * 知识库编号 - * + *
* 关联 {@link AiKnowledgeDO#getId()} */ private Long knowledgeId; @@ -47,6 +47,26 @@ public class AiKnowledgeDocumentDO extends BaseDO { * 字符数 */ private Integer wordCount; + /** + * 每个文本块的目标 token 数 + */ + private Integer defaultChunkSize; + /** + * 每个文本块的最小字符数 + */ + private Integer minChunkSizeChars; + /** + * 低于此值的块会被丢弃 + */ + private Integer minChunkLengthToEmbed; + /** + * 最大块数 + */ + private Integer maxNumChunks; + /** + * 分块是否保留分隔符 + */ + private Boolean keepSeparator; /** * 切片状态 *
diff --git a/yudao-module-ai/yudao-module-ai-biz/src/main/java/cn/iocoder/yudao/module/ai/dal/dataobject/knowledge/AiKnowledgeSegmentDO.java b/yudao-module-ai/yudao-module-ai-biz/src/main/java/cn/iocoder/yudao/module/ai/dal/dataobject/knowledge/AiKnowledgeSegmentDO.java index 84f7de654..be57265e1 100644 --- a/yudao-module-ai/yudao-module-ai-biz/src/main/java/cn/iocoder/yudao/module/ai/dal/dataobject/knowledge/AiKnowledgeSegmentDO.java +++ b/yudao-module-ai/yudao-module-ai-biz/src/main/java/cn/iocoder/yudao/module/ai/dal/dataobject/knowledge/AiKnowledgeSegmentDO.java @@ -2,6 +2,8 @@ package cn.iocoder.yudao.module.ai.dal.dataobject.knowledge; import cn.iocoder.yudao.framework.common.enums.CommonStatusEnum; import cn.iocoder.yudao.framework.mybatis.core.dataobject.BaseDO; +import com.baomidou.mybatisplus.annotation.FieldStrategy; +import com.baomidou.mybatisplus.annotation.TableField; import com.baomidou.mybatisplus.annotation.TableId; import com.baomidou.mybatisplus.annotation.TableName; import lombok.Data; @@ -25,16 +27,17 @@ public class AiKnowledgeSegmentDO extends BaseDO { /** * 向量库的编号 */ + @TableField(updateStrategy = FieldStrategy.ALWAYS) private String vectorId; /** * 知识库编号 - * + *
* 关联 {@link AiKnowledgeDO#getId()} */ private Long knowledgeId; /** * 文档编号 - * + *
* 关联 {@link AiKnowledgeDocumentDO#getId()} */ private Long documentId; @@ -52,7 +55,7 @@ public class AiKnowledgeSegmentDO extends BaseDO { private Integer tokens; /** * 状态 - * + *
* 枚举 {@link CommonStatusEnum}
*/
private Integer status;
diff --git a/yudao-module-ai/yudao-module-ai-biz/src/main/java/cn/iocoder/yudao/module/ai/service/knowledge/AiKnowledgeDocumentServiceImpl.java b/yudao-module-ai/yudao-module-ai-biz/src/main/java/cn/iocoder/yudao/module/ai/service/knowledge/AiKnowledgeDocumentServiceImpl.java
index 99f0621c8..05a9dce22 100644
--- a/yudao-module-ai/yudao-module-ai-biz/src/main/java/cn/iocoder/yudao/module/ai/service/knowledge/AiKnowledgeDocumentServiceImpl.java
+++ b/yudao-module-ai/yudao-module-ai-biz/src/main/java/cn/iocoder/yudao/module/ai/service/knowledge/AiKnowledgeDocumentServiceImpl.java
@@ -9,15 +9,11 @@ import cn.iocoder.yudao.framework.common.util.object.BeanUtils;
import cn.iocoder.yudao.module.ai.controller.admin.knowledge.vo.document.AiKnowledgeDocumentPageReqVO;
import cn.iocoder.yudao.module.ai.controller.admin.knowledge.vo.document.AiKnowledgeDocumentUpdateReqVO;
import cn.iocoder.yudao.module.ai.controller.admin.knowledge.vo.knowledge.AiKnowledgeDocumentCreateReqVO;
-import cn.iocoder.yudao.module.ai.dal.dataobject.knowledge.AiKnowledgeDO;
import cn.iocoder.yudao.module.ai.dal.dataobject.knowledge.AiKnowledgeDocumentDO;
import cn.iocoder.yudao.module.ai.dal.dataobject.knowledge.AiKnowledgeSegmentDO;
-import cn.iocoder.yudao.module.ai.dal.dataobject.model.AiChatModelDO;
import cn.iocoder.yudao.module.ai.dal.mysql.knowledge.AiKnowledgeDocumentMapper;
import cn.iocoder.yudao.module.ai.dal.mysql.knowledge.AiKnowledgeSegmentMapper;
import cn.iocoder.yudao.module.ai.enums.knowledge.AiKnowledgeDocumentStatusEnum;
-import cn.iocoder.yudao.module.ai.service.model.AiApiKeyService;
-import cn.iocoder.yudao.module.ai.service.model.AiChatModelService;
import jakarta.annotation.Resource;
import lombok.extern.slf4j.Slf4j;
import org.springframework.ai.document.Document;
@@ -48,24 +44,16 @@ public class AiKnowledgeDocumentServiceImpl implements AiKnowledgeDocumentServic
@Resource
private AiKnowledgeSegmentMapper segmentMapper;
- @Resource
- private TokenTextSplitter tokenTextSplitter;
@Resource
private TokenCountEstimator tokenCountEstimator;
-
- @Resource
- private AiApiKeyService apiKeyService;
@Resource
private AiKnowledgeService knowledgeService;
- @Resource
- private AiChatModelService chatModelService;
@Override
@Transactional(rollbackFor = Exception.class)
public Long createKnowledgeDocument(AiKnowledgeDocumentCreateReqVO createReqVO) {
- // 0. 校验
- AiKnowledgeDO knowledge = knowledgeService.validateKnowledgeExists(createReqVO.getKnowledgeId());
- AiChatModelDO model = chatModelService.validateChatModel(knowledge.getModelId());
+ // 0. 校验并获取向量存储实例
+ VectorStore vectorStore = knowledgeService.getVectorStoreById(createReqVO.getKnowledgeId());
// 1.1 下载文档
TikaDocumentReader loader = new TikaDocumentReader(downloadFile(createReqVO.getUrl()));
@@ -82,6 +70,9 @@ public class AiKnowledgeDocumentServiceImpl implements AiKnowledgeDocumentServic
return documentId;
}
+ // 2 构造文本分段器
+ TokenTextSplitter tokenTextSplitter = new TokenTextSplitter(createReqVO.getDefaultChunkSize(), createReqVO.getMinChunkSizeChars(), createReqVO.getMinChunkLengthToEmbed(),
+ createReqVO.getMaxNumChunks(), createReqVO.getKeepSeparator());
// 2.1 文档分段
List