diff --git a/yudao-module-ai/yudao-module-ai-biz/src/main/java/cn/iocoder/yudao/module/ai/controller/admin/knowledge/vo/knowledge/AiKnowledgeCreateMyReqVO.java b/yudao-module-ai/yudao-module-ai-biz/src/main/java/cn/iocoder/yudao/module/ai/controller/admin/knowledge/vo/knowledge/AiKnowledgeCreateMyReqVO.java index 44a5e87ee..58a89caee 100644 --- a/yudao-module-ai/yudao-module-ai-biz/src/main/java/cn/iocoder/yudao/module/ai/controller/admin/knowledge/vo/knowledge/AiKnowledgeCreateMyReqVO.java +++ b/yudao-module-ai/yudao-module-ai-biz/src/main/java/cn/iocoder/yudao/module/ai/controller/admin/knowledge/vo/knowledge/AiKnowledgeCreateMyReqVO.java @@ -25,4 +25,11 @@ public class AiKnowledgeCreateMyReqVO { @NotNull(message = "嵌入模型不能为空") private Long modelId; + @Schema(description = "相似性阈值", requiredMode = Schema.RequiredMode.REQUIRED, example = "0.5") + @NotNull(message = "相似性阈值不能为空") + private Double similarityThreshold; + + @Schema(description = "topK", requiredMode = Schema.RequiredMode.REQUIRED, example = "3") + @NotNull(message = "topK 不能为空") + private Integer topK; } diff --git a/yudao-module-ai/yudao-module-ai-biz/src/main/java/cn/iocoder/yudao/module/ai/controller/admin/knowledge/vo/knowledge/AiKnowledgeDocumentCreateReqVO.java b/yudao-module-ai/yudao-module-ai-biz/src/main/java/cn/iocoder/yudao/module/ai/controller/admin/knowledge/vo/knowledge/AiKnowledgeDocumentCreateReqVO.java index 9cc5290ab..651bdc0f7 100644 --- a/yudao-module-ai/yudao-module-ai-biz/src/main/java/cn/iocoder/yudao/module/ai/controller/admin/knowledge/vo/knowledge/AiKnowledgeDocumentCreateReqVO.java +++ b/yudao-module-ai/yudao-module-ai-biz/src/main/java/cn/iocoder/yudao/module/ai/controller/admin/knowledge/vo/knowledge/AiKnowledgeDocumentCreateReqVO.java @@ -23,4 +23,23 @@ public class AiKnowledgeDocumentCreateReqVO { @URL(message = "文档 URL 格式不正确") private String url; + @Schema(description = "每个文本块的目标 token 数", requiredMode = Schema.RequiredMode.REQUIRED, example = "800") + @NotNull(message = 
"每个文本块的目标 token 数不能为空") + private Integer defaultChunkSize; + + @Schema(description = "每个文本块的最小字符数", requiredMode = Schema.RequiredMode.REQUIRED, example = "350") + @NotNull(message = "每个文本块的最小字符数不能为空") + private Integer minChunkSizeChars; + + @Schema(description = "丢弃阈值", requiredMode = Schema.RequiredMode.REQUIRED, example = "5") + @NotNull(message = "丢弃阈值不能为空") + private Integer minChunkLengthToEmbed; + + @Schema(description = "最大块数", requiredMode = Schema.RequiredMode.REQUIRED, example = "10000") + @NotNull(message = "最大块数不能为空") + private Integer maxNumChunks; + + @Schema(description = "分块是否保留分隔符", requiredMode = Schema.RequiredMode.REQUIRED, example = "true") + @NotNull(message = "分块是否保留分隔符不能为空") + private Boolean keepSeparator; } diff --git a/yudao-module-ai/yudao-module-ai-biz/src/main/java/cn/iocoder/yudao/module/ai/dal/dataobject/knowledge/AiKnowledgeDO.java b/yudao-module-ai/yudao-module-ai-biz/src/main/java/cn/iocoder/yudao/module/ai/dal/dataobject/knowledge/AiKnowledgeDO.java index 756d8cdb3..5db631dd4 100644 --- a/yudao-module-ai/yudao-module-ai-biz/src/main/java/cn/iocoder/yudao/module/ai/dal/dataobject/knowledge/AiKnowledgeDO.java +++ b/yudao-module-ai/yudao-module-ai-biz/src/main/java/cn/iocoder/yudao/module/ai/dal/dataobject/knowledge/AiKnowledgeDO.java @@ -52,6 +52,18 @@ public class AiKnowledgeDO extends BaseDO { * 模型标识 */ private String model; + + /** + * topK + */ + private Integer topK; + + /** + * 相似度阈值 + */ + private Double similarityThreshold; + + /** * 状态 *

diff --git a/yudao-module-ai/yudao-module-ai-biz/src/main/java/cn/iocoder/yudao/module/ai/dal/dataobject/knowledge/AiKnowledgeDocumentDO.java b/yudao-module-ai/yudao-module-ai-biz/src/main/java/cn/iocoder/yudao/module/ai/dal/dataobject/knowledge/AiKnowledgeDocumentDO.java index c5e526cce..18fa46c3a 100644 --- a/yudao-module-ai/yudao-module-ai-biz/src/main/java/cn/iocoder/yudao/module/ai/dal/dataobject/knowledge/AiKnowledgeDocumentDO.java +++ b/yudao-module-ai/yudao-module-ai-biz/src/main/java/cn/iocoder/yudao/module/ai/dal/dataobject/knowledge/AiKnowledgeDocumentDO.java @@ -23,7 +23,7 @@ public class AiKnowledgeDocumentDO extends BaseDO { private Long id; /** * 知识库编号 - * + *

* 关联 {@link AiKnowledgeDO#getId()} */ private Long knowledgeId; @@ -47,6 +47,26 @@ public class AiKnowledgeDocumentDO extends BaseDO { * 字符数 */ private Integer wordCount; + /** + * 每个文本块的目标 token 数 + */ + private Integer defaultChunkSize; + /** + * 每个文本块的最小字符数 + */ + private Integer minChunkSizeChars; + /** + * 低于此值的块会被丢弃 + */ + private Integer minChunkLengthToEmbed; + /** + * 最大块数 + */ + private Integer maxNumChunks; + /** + * 分块是否保留分隔符 + */ + private Boolean keepSeparator; /** * 切片状态 *

diff --git a/yudao-module-ai/yudao-module-ai-biz/src/main/java/cn/iocoder/yudao/module/ai/dal/dataobject/knowledge/AiKnowledgeSegmentDO.java b/yudao-module-ai/yudao-module-ai-biz/src/main/java/cn/iocoder/yudao/module/ai/dal/dataobject/knowledge/AiKnowledgeSegmentDO.java index 84f7de654..be57265e1 100644 --- a/yudao-module-ai/yudao-module-ai-biz/src/main/java/cn/iocoder/yudao/module/ai/dal/dataobject/knowledge/AiKnowledgeSegmentDO.java +++ b/yudao-module-ai/yudao-module-ai-biz/src/main/java/cn/iocoder/yudao/module/ai/dal/dataobject/knowledge/AiKnowledgeSegmentDO.java @@ -2,6 +2,8 @@ package cn.iocoder.yudao.module.ai.dal.dataobject.knowledge; import cn.iocoder.yudao.framework.common.enums.CommonStatusEnum; import cn.iocoder.yudao.framework.mybatis.core.dataobject.BaseDO; +import com.baomidou.mybatisplus.annotation.FieldStrategy; +import com.baomidou.mybatisplus.annotation.TableField; import com.baomidou.mybatisplus.annotation.TableId; import com.baomidou.mybatisplus.annotation.TableName; import lombok.Data; @@ -25,16 +27,17 @@ public class AiKnowledgeSegmentDO extends BaseDO { /** * 向量库的编号 */ + @TableField(updateStrategy = FieldStrategy.ALWAYS) private String vectorId; /** * 知识库编号 - * + *

* 关联 {@link AiKnowledgeDO#getId()} */ private Long knowledgeId; /** * 文档编号 - * + *

* 关联 {@link AiKnowledgeDocumentDO#getId()} */ private Long documentId; @@ -52,7 +55,7 @@ public class AiKnowledgeSegmentDO extends BaseDO { private Integer tokens; /** * 状态 - * + *

* 枚举 {@link CommonStatusEnum} */ private Integer status; diff --git a/yudao-module-ai/yudao-module-ai-biz/src/main/java/cn/iocoder/yudao/module/ai/service/knowledge/AiKnowledgeDocumentServiceImpl.java b/yudao-module-ai/yudao-module-ai-biz/src/main/java/cn/iocoder/yudao/module/ai/service/knowledge/AiKnowledgeDocumentServiceImpl.java index 99f0621c8..05a9dce22 100644 --- a/yudao-module-ai/yudao-module-ai-biz/src/main/java/cn/iocoder/yudao/module/ai/service/knowledge/AiKnowledgeDocumentServiceImpl.java +++ b/yudao-module-ai/yudao-module-ai-biz/src/main/java/cn/iocoder/yudao/module/ai/service/knowledge/AiKnowledgeDocumentServiceImpl.java @@ -9,15 +9,11 @@ import cn.iocoder.yudao.framework.common.util.object.BeanUtils; import cn.iocoder.yudao.module.ai.controller.admin.knowledge.vo.document.AiKnowledgeDocumentPageReqVO; import cn.iocoder.yudao.module.ai.controller.admin.knowledge.vo.document.AiKnowledgeDocumentUpdateReqVO; import cn.iocoder.yudao.module.ai.controller.admin.knowledge.vo.knowledge.AiKnowledgeDocumentCreateReqVO; -import cn.iocoder.yudao.module.ai.dal.dataobject.knowledge.AiKnowledgeDO; import cn.iocoder.yudao.module.ai.dal.dataobject.knowledge.AiKnowledgeDocumentDO; import cn.iocoder.yudao.module.ai.dal.dataobject.knowledge.AiKnowledgeSegmentDO; -import cn.iocoder.yudao.module.ai.dal.dataobject.model.AiChatModelDO; import cn.iocoder.yudao.module.ai.dal.mysql.knowledge.AiKnowledgeDocumentMapper; import cn.iocoder.yudao.module.ai.dal.mysql.knowledge.AiKnowledgeSegmentMapper; import cn.iocoder.yudao.module.ai.enums.knowledge.AiKnowledgeDocumentStatusEnum; -import cn.iocoder.yudao.module.ai.service.model.AiApiKeyService; -import cn.iocoder.yudao.module.ai.service.model.AiChatModelService; import jakarta.annotation.Resource; import lombok.extern.slf4j.Slf4j; import org.springframework.ai.document.Document; @@ -48,24 +44,16 @@ public class AiKnowledgeDocumentServiceImpl implements AiKnowledgeDocumentServic @Resource private AiKnowledgeSegmentMapper 
segmentMapper; - @Resource - private TokenTextSplitter tokenTextSplitter; @Resource private TokenCountEstimator tokenCountEstimator; - - @Resource - private AiApiKeyService apiKeyService; @Resource private AiKnowledgeService knowledgeService; - @Resource - private AiChatModelService chatModelService; @Override @Transactional(rollbackFor = Exception.class) public Long createKnowledgeDocument(AiKnowledgeDocumentCreateReqVO createReqVO) { - // 0. 校验 - AiKnowledgeDO knowledge = knowledgeService.validateKnowledgeExists(createReqVO.getKnowledgeId()); - AiChatModelDO model = chatModelService.validateChatModel(knowledge.getModelId()); + // 0. 校验并获取向量存储实例 + VectorStore vectorStore = knowledgeService.getVectorStoreById(createReqVO.getKnowledgeId()); // 1.1 下载文档 TikaDocumentReader loader = new TikaDocumentReader(downloadFile(createReqVO.getUrl())); @@ -82,6 +70,9 @@ public class AiKnowledgeDocumentServiceImpl implements AiKnowledgeDocumentServic return documentId; } + // 2 构造文本分段器 + TokenTextSplitter tokenTextSplitter = new TokenTextSplitter(createReqVO.getDefaultChunkSize(), createReqVO.getMinChunkSizeChars(), createReqVO.getMinChunkLengthToEmbed(), + createReqVO.getMaxNumChunks(), createReqVO.getKeepSeparator()); // 2.1 文档分段 List segments = tokenTextSplitter.apply(documents); // 2.2 分段内容入库 @@ -92,8 +83,6 @@ public class AiKnowledgeDocumentServiceImpl implements AiKnowledgeDocumentServic .setStatus(CommonStatusEnum.ENABLE.getStatus())); segmentMapper.insertBatch(segmentDOList); - // 3.1 获取向量存储实例 - VectorStore vectorStore = apiKeyService.getOrCreateVectorStore(model.getKeyId()); // 3.2 向量化并存储 segments.forEach(segment -> segment.getMetadata().put(AiKnowledgeSegmentDO.FIELD_KNOWLEDGE_ID, createReqVO.getKnowledgeId())); vectorStore.add(segments); diff --git a/yudao-module-ai/yudao-module-ai-biz/src/main/java/cn/iocoder/yudao/module/ai/service/knowledge/AiKnowledgeSegmentServiceImpl.java 
b/yudao-module-ai/yudao-module-ai-biz/src/main/java/cn/iocoder/yudao/module/ai/service/knowledge/AiKnowledgeSegmentServiceImpl.java
index 3bcf5d692..1813d0b48 100644
--- a/yudao-module-ai/yudao-module-ai-biz/src/main/java/cn/iocoder/yudao/module/ai/service/knowledge/AiKnowledgeSegmentServiceImpl.java
+++ b/yudao-module-ai/yudao-module-ai-biz/src/main/java/cn/iocoder/yudao/module/ai/service/knowledge/AiKnowledgeSegmentServiceImpl.java
@@ -2,6 +2,7 @@ package cn.iocoder.yudao.module.ai.service.knowledge;
 
 import cn.hutool.core.collection.CollUtil;
 import cn.hutool.core.collection.ListUtil;
+import cn.iocoder.yudao.framework.common.enums.CommonStatusEnum;
 import cn.iocoder.yudao.framework.common.pojo.PageResult;
 import cn.iocoder.yudao.framework.common.util.object.BeanUtils;
 import cn.iocoder.yudao.module.ai.controller.admin.knowledge.vo.segment.AiKnowledgeSegmentPageReqVO;
@@ -23,6 +24,10 @@ import org.springframework.ai.vectorstore.filter.FilterExpressionBuilder;
 import org.springframework.stereotype.Service;
 
 import java.util.List;
+import java.util.Objects;
+
+import static cn.iocoder.yudao.framework.common.exception.util.ServiceExceptionUtil.exception;
+import static cn.iocoder.yudao.module.ai.enums.ErrorCodeConstants.KNOWLEDGE_SEGMENT_NOT_EXISTS;
 
 /**
  * AI 知识库分片 Service 实现类
@@ -50,14 +55,44 @@ public class AiKnowledgeSegmentServiceImpl implements AiKnowledgeSegmentService
 
     @Override
     public void updateKnowledgeSegment(AiKnowledgeSegmentUpdateReqVO reqVO) {
-        segmentMapper.updateById(BeanUtils.toBean(reqVO, AiKnowledgeSegmentDO.class));
-        // TODO @xin 重新向量化
+        // 0. Validate that the segment exists
+        AiKnowledgeSegmentDO oldKnowledgeSegment = validateKnowledgeSegmentExists(reqVO.getId());
+        // 1. Resolve the vector store backing the owning knowledge base
+        VectorStore vectorStore = knowledgeService.getVectorStoreById(oldKnowledgeSegment.getKnowledgeId());
+
+        // 2.1 Drop the stale vector, if any (a disabled segment has no vector)
+        deleteVector(vectorStore, oldKnowledgeSegment.getVectorId());
+        // 2.2 Re-embed the new content only while the segment is enabled, so that editing a
+        //     disabled segment does not make it searchable again
+        AiKnowledgeSegmentDO knowledgeSegment = BeanUtils.toBean(reqVO, AiKnowledgeSegmentDO.class);
+        if (Objects.equals(oldKnowledgeSegment.getStatus(), CommonStatusEnum.ENABLE.getStatus())) {
+            knowledgeSegment.setVectorId(addVector(vectorStore, oldKnowledgeSegment.getKnowledgeId(), reqVO.getContent()));
+        }
+
+        // 3. Persist the new content together with the vector id (null when nothing was embedded)
+        segmentMapper.updateById(knowledgeSegment);
     }
 
     @Override
     public void updateKnowledgeSegmentStatus(AiKnowledgeSegmentUpdateStatusReqVO reqVO) {
-        segmentMapper.updateById(BeanUtils.toBean(reqVO, AiKnowledgeSegmentDO.class));
-        // TODO @xin 1.禁用删除向量 2.启用重新向量化
+        // 0. Validate that the segment exists
+        AiKnowledgeSegmentDO oldKnowledgeSegment = validateKnowledgeSegmentExists(reqVO.getId());
+        // 1. Resolve the vector store backing the owning knowledge base
+        VectorStore vectorStore = knowledgeService.getVectorStoreById(oldKnowledgeSegment.getKnowledgeId());
+        AiKnowledgeSegmentDO knowledgeSegment = BeanUtils.toBean(reqVO, AiKnowledgeSegmentDO.class);
+
+        // 2. Drop the existing vector first: this is all that disabling needs, and it keeps
+        //    enabling an already-enabled segment from leaving a duplicate vector behind
+        deleteVector(vectorStore, oldKnowledgeSegment.getVectorId());
+        if (Objects.equals(reqVO.getStatus(), CommonStatusEnum.ENABLE.getStatus())) {
+            // 2.1 Enabling: embed the stored content again
+            knowledgeSegment.setVectorId(addVector(vectorStore, oldKnowledgeSegment.getKnowledgeId(), oldKnowledgeSegment.getContent()));
+        } else {
+            // 2.2 Disabling: no vector remains, so clear the reference
+            knowledgeSegment.setVectorId(null);
+        }
+        // 3. Persist the new status and vector id
+        segmentMapper.updateById(knowledgeSegment);
     }
 
     @Override
@@ -71,9 +106,8 @@ public class AiKnowledgeSegmentServiceImpl implements AiKnowledgeSegmentService
 
         // 1.2 向量检索
         List<Document> documentList = vectorStore.similaritySearch(SearchRequest.query(reqVO.getContent())
-                //TODO @xin 配置提取
-                .withTopK(5)
-                .withSimilarityThreshold(0.5d)
+                .withTopK(knowledge.getTopK())
+                .withSimilarityThreshold(knowledge.getSimilarityThreshold())
                 .withFilterExpression(new FilterExpressionBuilder().eq(AiKnowledgeSegmentDO.FIELD_KNOWLEDGE_ID, reqVO.getKnowledgeId()).build()));
         if (CollUtil.isEmpty(documentList)) {
             return ListUtil.empty();
@@ -81,4 +115,46 @@ public class AiKnowledgeSegmentServiceImpl implements AiKnowledgeSegmentService
         // 2.1 段落召回
         return segmentMapper.selectList(CollUtil.getFieldValues(documentList, "id", String.class));
     }
+
+    /**
+     * Validates that the segment exists.
+     *
+     * @param id segment id
+     * @return the segment
+     */
+    private AiKnowledgeSegmentDO validateKnowledgeSegmentExists(Long id) {
+        AiKnowledgeSegmentDO knowledgeSegment = segmentMapper.selectById(id);
+        if (knowledgeSegment == null) {
+            throw exception(KNOWLEDGE_SEGMENT_NOT_EXISTS);
+        }
+        return knowledgeSegment;
+    }
+
+    /**
+     * Embeds the content as a new vector document tagged with its knowledge base id.
+     *
+     * @param vectorStore vector store of the knowledge base
+     * @param knowledgeId knowledge base id, stored as metadata so searches can filter on it
+     * @param content     text to embed
+     * @return id of the vector document that was added
+     */
+    private String addVector(VectorStore vectorStore, Long knowledgeId, String content) {
+        Document document = new Document(content);
+        document.getMetadata().put(AiKnowledgeSegmentDO.FIELD_KNOWLEDGE_ID, knowledgeId);
+        vectorStore.add(List.of(document));
+        return document.getId();
+    }
+
+    /**
+     * Deletes the vector from the store; does nothing when the segment has no vector.
+     *
+     * @param vectorStore vector store of the knowledge base
+     * @param vectorId    id of the vector document, may be null
+     */
+    private void deleteVector(VectorStore vectorStore, String vectorId) {
+        // List.of rejects null elements, so a segment without a vector must be skipped explicitly
+        if (vectorId != null) {
+            vectorStore.delete(List.of(vectorId));
+        }
+    }
 }
diff --git a/yudao-module-ai/yudao-module-ai-biz/src/main/java/cn/iocoder/yudao/module/ai/service/knowledge/AiKnowledgeService.java b/yudao-module-ai/yudao-module-ai-biz/src/main/java/cn/iocoder/yudao/module/ai/service/knowledge/AiKnowledgeService.java
index 9f43c5328..d9770f452 100644
--- a/yudao-module-ai/yudao-module-ai-biz/src/main/java/cn/iocoder/yudao/module/ai/service/knowledge/AiKnowledgeService.java
+++ b/yudao-module-ai/yudao-module-ai-biz/src/main/java/cn/iocoder/yudao/module/ai/service/knowledge/AiKnowledgeService.java
@@ -5,6 +5,7 @@ import cn.iocoder.yudao.framework.common.pojo.PageResult;
 import cn.iocoder.yudao.module.ai.controller.admin.knowledge.vo.knowledge.AiKnowledgeCreateMyReqVO;
 import cn.iocoder.yudao.module.ai.controller.admin.knowledge.vo.knowledge.AiKnowledgeUpdateMyReqVO;
 import cn.iocoder.yudao.module.ai.dal.dataobject.knowledge.AiKnowledgeDO;
+import org.springframework.ai.vectorstore.VectorStore;
 
 /**
  * AI 知识库-基础信息 Service 接口
@@ -47,4 +48,12 @@ public interface AiKnowledgeService {
      * @return 知识库分页
      */
     PageResult<AiKnowledgeDO> getKnowledgePageMy(Long userId, PageParam pageReqVO);
+
+    /**
+     * Returns the vector store instance for the given knowledge base.
+     *
+     * @param knowledgeId knowledge base id
+     * @return the vector store instance
+     */
+    VectorStore getVectorStoreById(Long knowledgeId);
 }
diff --git a/yudao-module-ai/yudao-module-ai-biz/src/main/java/cn/iocoder/yudao/module/ai/service/knowledge/AiKnowledgeServiceImpl.java b/yudao-module-ai/yudao-module-ai-biz/src/main/java/cn/iocoder/yudao/module/ai/service/knowledge/AiKnowledgeServiceImpl.java
index 1948bb00e..7a145d734 100644
--- a/yudao-module-ai/yudao-module-ai-biz/src/main/java/cn/iocoder/yudao/module/ai/service/knowledge/AiKnowledgeServiceImpl.java
+++ b/yudao-module-ai/yudao-module-ai-biz/src/main/java/cn/iocoder/yudao/module/ai/service/knowledge/AiKnowledgeServiceImpl.java
@@ -10,9 +10,11 @@ import cn.iocoder.yudao.module.ai.controller.admin.knowledge.vo.knowledge.AiKnow
 import cn.iocoder.yudao.module.ai.dal.dataobject.knowledge.AiKnowledgeDO;
 import cn.iocoder.yudao.module.ai.dal.dataobject.model.AiChatModelDO;
 import cn.iocoder.yudao.module.ai.dal.mysql.knowledge.AiKnowledgeMapper;
+import cn.iocoder.yudao.module.ai.service.model.AiApiKeyService;
 import cn.iocoder.yudao.module.ai.service.model.AiChatModelService;
 import jakarta.annotation.Resource;
 import lombok.extern.slf4j.Slf4j;
+import org.springframework.ai.vectorstore.VectorStore;
 import org.springframework.stereotype.Service;
 
 import static cn.iocoder.yudao.framework.common.exception.util.ServiceExceptionUtil.exception;
@@ -32,6 +34,10 @@ public class AiKnowledgeServiceImpl implements AiKnowledgeService {
 
     @Resource
     private AiKnowledgeMapper knowledgeMapper;
+    @Resource
+    private AiChatModelService chatModelService;
+    @Resource
+    private AiApiKeyService apiKeyService;
 
     @Override
     public Long createKnowledgeMy(AiKnowledgeCreateMyReqVO createReqVO, Long userId) {
@@ -75,4 +81,11 @@ public class AiKnowledgeServiceImpl implements AiKnowledgeService {
         return knowledgeMapper.selectPageByMy(userId, pageReqVO);
     }
 
+    @Override
+    public VectorStore getVectorStoreById(Long knowledgeId) {
+        AiKnowledgeDO knowledge = validateKnowledgeExists(knowledgeId);
+        AiChatModelDO model = chatModelService.validateChatModel(knowledge.getModelId());
+        return apiKeyService.getOrCreateVectorStore(model.getKeyId());
+    }
+
 }