package com.iailab.module.ai.service.knowledge; import cn.hutool.core.collection.CollUtil; import cn.hutool.core.collection.ListUtil; import cn.hutool.core.util.ObjUtil; import cn.hutool.core.util.StrUtil; import com.iailab.framework.common.enums.CommonStatusEnum; import com.iailab.framework.common.pojo.PageResult; import com.iailab.framework.common.util.object.BeanUtils; import com.iailab.module.ai.controller.admin.knowledge.vo.segment.AiKnowledgeSegmentPageReqVO; import com.iailab.module.ai.controller.admin.knowledge.vo.segment.AiKnowledgeSegmentProcessRespVO; import com.iailab.module.ai.controller.admin.knowledge.vo.segment.AiKnowledgeSegmentSaveReqVO; import com.iailab.module.ai.controller.admin.knowledge.vo.segment.AiKnowledgeSegmentUpdateStatusReqVO; import com.iailab.module.ai.dal.dataobject.knowledge.AiKnowledgeDO; import com.iailab.module.ai.dal.dataobject.knowledge.AiKnowledgeDocumentDO; import com.iailab.module.ai.dal.dataobject.knowledge.AiKnowledgeSegmentDO; import com.iailab.module.ai.dal.mysql.knowledge.AiKnowledgeSegmentMapper; import com.iailab.module.ai.service.knowledge.bo.AiKnowledgeSegmentSearchReqBO; import com.iailab.module.ai.service.knowledge.bo.AiKnowledgeSegmentSearchRespBO; import com.iailab.module.ai.service.model.AiModelService; import jakarta.annotation.Resource; import lombok.extern.slf4j.Slf4j; import org.springframework.ai.document.Document; import org.springframework.ai.tokenizer.TokenCountEstimator; import org.springframework.ai.transformer.splitter.TextSplitter; import org.springframework.ai.transformer.splitter.TokenTextSplitter; import org.springframework.ai.vectorstore.SearchRequest; import org.springframework.ai.vectorstore.VectorStore; import org.springframework.ai.vectorstore.filter.FilterExpressionBuilder; import org.springframework.context.annotation.Lazy; import org.springframework.stereotype.Service; import java.util.*; import static com.iailab.framework.common.exception.util.ServiceExceptionUtil.exception; import static com.iailab.framework.common.util.collection.CollectionUtils.convertList; import static com.iailab.module.ai.enums.ErrorCodeConstants.KNOWLEDGE_SEGMENT_CONTENT_TOO_LONG; import static com.iailab.module.ai.enums.ErrorCodeConstants.KNOWLEDGE_SEGMENT_NOT_EXISTS; /** * AI 知识库分片 Service 实现类 * * @author xiaoxin */ @Service @Slf4j public class AiKnowledgeSegmentServiceImpl implements AiKnowledgeSegmentService { private static final String VECTOR_STORE_METADATA_KNOWLEDGE_ID = "knowledgeId"; private static final String VECTOR_STORE_METADATA_DOCUMENT_ID = "documentId"; private static final String VECTOR_STORE_METADATA_SEGMENT_ID = "segmentId"; private static final Map> VECTOR_STORE_METADATA_TYPES = Map.of( VECTOR_STORE_METADATA_KNOWLEDGE_ID, String.class, VECTOR_STORE_METADATA_DOCUMENT_ID, String.class, VECTOR_STORE_METADATA_SEGMENT_ID, String.class); @Resource private AiKnowledgeSegmentMapper segmentMapper; @Resource private AiKnowledgeService knowledgeService; @Resource @Lazy // 延迟加载,避免循环依赖 private AiKnowledgeDocumentService knowledgeDocumentService; @Resource private AiModelService modelService; @Resource private TokenCountEstimator tokenCountEstimator; @Override public PageResult getKnowledgeSegmentPage(AiKnowledgeSegmentPageReqVO pageReqVO) { return segmentMapper.selectPage(pageReqVO); } @Override public void createKnowledgeSegmentBySplitContent(Long documentId, String content) { // 1. 校验 AiKnowledgeDocumentDO documentDO = knowledgeDocumentService.validateKnowledgeDocumentExists(documentId); AiKnowledgeDO knowledgeDO = knowledgeService.validateKnowledgeExists(documentDO.getKnowledgeId()); VectorStore vectorStore = getVectorStoreById(knowledgeDO); // 2. 文档切片 List documentSegments = splitContentByToken(content, documentDO.getSegmentMaxTokens()); // 3.1 存储切片 List segmentDOs = convertList(documentSegments, segment -> { if (StrUtil.isEmpty(segment.getText())) { return null; } return new AiKnowledgeSegmentDO().setKnowledgeId(documentDO.getKnowledgeId()).setDocumentId(documentId) .setContent(segment.getText()).setContentLength(segment.getText().length()) .setVectorId(AiKnowledgeSegmentDO.VECTOR_ID_EMPTY) .setTokens(tokenCountEstimator.estimate(segment.getText())) .setStatus(CommonStatusEnum.ENABLE.getStatus()); }); segmentMapper.insertBatch(segmentDOs); // 3.2 切片向量化 for (int i = 0; i < documentSegments.size(); i++) { Document segment = documentSegments.get(i); AiKnowledgeSegmentDO segmentDO = segmentDOs.get(i); writeVectorStore(vectorStore, segmentDO, segment); } } @Override public void updateKnowledgeSegment(AiKnowledgeSegmentSaveReqVO reqVO) { // 1. 校验 AiKnowledgeSegmentDO oldSegment = validateKnowledgeSegmentExists(reqVO.getId()); // 2. 删除向量 VectorStore vectorStore = getVectorStoreById(oldSegment.getKnowledgeId()); deleteVectorStore(vectorStore, oldSegment); // 3.1 更新切片 AiKnowledgeSegmentDO newSegment = BeanUtils.toBean(reqVO, AiKnowledgeSegmentDO.class); segmentMapper.updateById(newSegment); // 3.2 重新向量化,必须开启状态 if (CommonStatusEnum.isEnable(oldSegment.getStatus())) { newSegment.setKnowledgeId(oldSegment.getKnowledgeId()).setDocumentId(oldSegment.getDocumentId()); writeVectorStore(vectorStore, newSegment, new Document(newSegment.getContent())); } } @Override public void deleteKnowledgeSegmentByDocumentId(Long documentId) { // 1. 查询需要删除的段落 List segments = segmentMapper.selectListByDocumentId(documentId); if (CollUtil.isEmpty(segments)) { return; } // 2. 批量删除段落记录 segmentMapper.deleteByIds(convertList(segments, AiKnowledgeSegmentDO::getId)); // 3. 删除向量存储中的段落 VectorStore vectorStore = getVectorStoreById(segments.get(0).getKnowledgeId()); vectorStore.delete(convertList(segments, AiKnowledgeSegmentDO::getVectorId)); } @Override public void updateKnowledgeSegmentStatus(AiKnowledgeSegmentUpdateStatusReqVO reqVO) { // 1. 校验 AiKnowledgeSegmentDO segment = validateKnowledgeSegmentExists(reqVO.getId()); // 2. 获取知识库向量实例 VectorStore vectorStore = getVectorStoreById(segment.getKnowledgeId()); // 3. 更新状态 segmentMapper.updateById(new AiKnowledgeSegmentDO().setId(reqVO.getId()).setStatus(reqVO.getStatus())); // 4. 更新向量 if (CommonStatusEnum.isEnable(reqVO.getStatus())) { writeVectorStore(vectorStore, segment, new Document(segment.getContent())); } else { deleteVectorStore(vectorStore, segment); } } @Override public void reindexKnowledgeSegmentByKnowledgeId(Long knowledgeId) { // 1.1 校验知识库存在 AiKnowledgeDO knowledge = knowledgeService.validateKnowledgeExists(knowledgeId); // 1.2 获取知识库向量实例 VectorStore vectorStore = getVectorStoreById(knowledge); // 2.1 查询知识库下的所有启用状态的段落 List segments = segmentMapper.selectListByKnowledgeIdAndStatus( knowledgeId, CommonStatusEnum.ENABLE.getStatus()); if (CollUtil.isEmpty(segments)) { return; } // 2.2 遍历所有段落,重新索引 for (AiKnowledgeSegmentDO segment : segments) { // 删除旧的向量 deleteVectorStore(vectorStore, segment); // 重新创建向量 writeVectorStore(vectorStore, segment, new Document(segment.getContent())); } log.info("[reindexKnowledgeSegmentByKnowledgeId][知识库({}) 重新索引完成,共处理 {} 个段落]", knowledgeId, segments.size()); } private void writeVectorStore(VectorStore vectorStore, AiKnowledgeSegmentDO segmentDO, Document segment) { // 1. 向量存储 // 为什么要 toString 呢?因为部分 VectorStore 实现,不支持 Long 类型,例如说 QdrantVectorStore segment.getMetadata().put(VECTOR_STORE_METADATA_KNOWLEDGE_ID, segmentDO.getKnowledgeId().toString()); segment.getMetadata().put(VECTOR_STORE_METADATA_DOCUMENT_ID, segmentDO.getDocumentId().toString()); segment.getMetadata().put(VECTOR_STORE_METADATA_SEGMENT_ID, segmentDO.getId().toString()); vectorStore.add(List.of(segment)); // 2. 更新向量 ID segmentMapper.updateById(new AiKnowledgeSegmentDO().setId(segmentDO.getId()).setVectorId(segment.getId())); } private void deleteVectorStore(VectorStore vectorStore, AiKnowledgeSegmentDO segmentDO) { // 1. 更新向量 ID if (StrUtil.isEmpty(segmentDO.getVectorId())) { return; } segmentMapper.updateById(new AiKnowledgeSegmentDO().setId(segmentDO.getId()) .setVectorId(AiKnowledgeSegmentDO.VECTOR_ID_EMPTY)); // 2. 删除向量 vectorStore.delete(List.of(segmentDO.getVectorId())); } @Override public List searchKnowledgeSegment(AiKnowledgeSegmentSearchReqBO reqBO) { // 1. 校验 AiKnowledgeDO knowledge = knowledgeService.validateKnowledgeExists(reqBO.getKnowledgeId()); // 2.1 向量检索 VectorStore vectorStore = getVectorStoreById(knowledge); List documents = vectorStore.similaritySearch(SearchRequest.builder() .query(reqBO.getContent()) .topK(ObjUtil.defaultIfNull(reqBO.getTopK(), knowledge.getTopK())) .similarityThreshold( ObjUtil.defaultIfNull(reqBO.getSimilarityThreshold(), knowledge.getSimilarityThreshold())) .filterExpression(new FilterExpressionBuilder() .eq(VECTOR_STORE_METADATA_KNOWLEDGE_ID, reqBO.getKnowledgeId().toString()) .build()) .build()); if (CollUtil.isEmpty(documents)) { return ListUtil.empty(); } // 2.2 段落召回 List segments = segmentMapper .selectListByVectorIds(convertList(documents, Document::getId)); if (CollUtil.isEmpty(segments)) { return ListUtil.empty(); } // 3. 增加召回次数 segmentMapper.updateRetrievalCountIncrByIds(convertList(segments, AiKnowledgeSegmentDO::getId)); // 4. 构建结果 List result = convertList(segments, segment -> { Document document = CollUtil.findOne(documents, // 找到对应的文档 doc -> Objects.equals(doc.getId(), segment.getVectorId())); if (document == null) { return null; } return BeanUtils.toBean(segment, AiKnowledgeSegmentSearchRespBO.class) .setScore(document.getScore()); }); result.sort((o1, o2) -> Double.compare(o2.getScore(), o1.getScore())); // 按照分数降序排序 return result; } @Override public List splitContent(String url, Integer segmentMaxTokens) { // 1. 读取 URL 内容 String content = knowledgeDocumentService.readUrl(url); // 2. 文档切片 List documentSegments = splitContentByToken(content, segmentMaxTokens); // 3. 转换为段落对象 return convertList(documentSegments, segment -> { if (StrUtil.isEmpty(segment.getText())) { return null; } return new AiKnowledgeSegmentDO() .setContent(segment.getText()) .setContentLength(segment.getText().length()) .setTokens(tokenCountEstimator.estimate(segment.getText())); }); } /** * 校验段落是否存在 * * @param id 文档编号 * @return 段落信息 */ private AiKnowledgeSegmentDO validateKnowledgeSegmentExists(Long id) { AiKnowledgeSegmentDO knowledgeSegment = segmentMapper.selectById(id); if (knowledgeSegment == null) { throw exception(KNOWLEDGE_SEGMENT_NOT_EXISTS); } return knowledgeSegment; } private VectorStore getVectorStoreById(AiKnowledgeDO knowledge) { return modelService.getOrCreateVectorStore(knowledge.getEmbeddingModelId(), VECTOR_STORE_METADATA_TYPES); } private VectorStore getVectorStoreById(Long knowledgeId) { AiKnowledgeDO knowledge = knowledgeService.validateKnowledgeExists(knowledgeId); return getVectorStoreById(knowledge); } private static List splitContentByToken(String content, Integer segmentMaxTokens) { TextSplitter textSplitter = buildTokenTextSplitter(segmentMaxTokens); return textSplitter.apply(Collections.singletonList(new Document(content))); } private static TextSplitter buildTokenTextSplitter(Integer segmentMaxTokens) { return TokenTextSplitter.builder() .withChunkSize(segmentMaxTokens) .withMinChunkSizeChars(Integer.MAX_VALUE) // 忽略字符的截断 .withMinChunkLengthToEmbed(1) // 允许的最小有效分段长度 .withMaxNumChunks(Integer.MAX_VALUE) .withKeepSeparator(true) // 保留分隔符 .build(); } @Override public List getKnowledgeSegmentProcessList(List documentIds) { if (CollUtil.isEmpty(documentIds)) { return Collections.emptyList(); } return segmentMapper.selectProcessList(documentIds); } @Override public Long createKnowledgeSegment(AiKnowledgeSegmentSaveReqVO createReqVO) { // 1.1 校验文档是否存在 AiKnowledgeDocumentDO document = knowledgeDocumentService .validateKnowledgeDocumentExists(createReqVO.getDocumentId()); // 1.2 获取知识库信息 AiKnowledgeDO knowledge = knowledgeService.validateKnowledgeExists(document.getKnowledgeId()); // 1.3 校验 token 熟练 Integer tokens = tokenCountEstimator.estimate(createReqVO.getContent()); if (tokens > document.getSegmentMaxTokens()) { throw exception(KNOWLEDGE_SEGMENT_CONTENT_TOO_LONG, tokens, document.getSegmentMaxTokens()); } // 2. 保存段落 AiKnowledgeSegmentDO segment = BeanUtils.toBean(createReqVO, AiKnowledgeSegmentDO.class) .setKnowledgeId(knowledge.getId()).setDocumentId(document.getId()) .setContentLength(createReqVO.getContent().length()).setTokens(tokens) .setVectorId(AiKnowledgeSegmentDO.VECTOR_ID_EMPTY) .setRetrievalCount(0).setStatus(CommonStatusEnum.ENABLE.getStatus()); segmentMapper.insert(segment); // 3. 向量化 writeVectorStore(getVectorStoreById(knowledge), segment, new Document(segment.getContent())); return segment.getId(); } @Override public AiKnowledgeSegmentDO getKnowledgeSegment(Long id) { return segmentMapper.selectById(id); } @Override public List getKnowledgeSegmentList(Collection ids) { if (CollUtil.isEmpty(ids)) { return Collections.emptyList(); } return segmentMapper.selectBatchIds(ids); } }