// js/chatbot/agents/semantic-vector-search.js
// Semantic-group vector search integration layer
(function(window) {
  'use strict';

  /**
   * Semantic-group vector search engine.
   *
   * Glue layer between `window.EmbeddingClient` (text -> vector) and
   * `window.VectorStore` (vector persistence and similarity search), with an
   * optional rerank pass through `window.RerankClient`.
   *
   * A single instance is exported as `window.SemanticVectorSearch`.
   */
  class SemanticVectorSearch {
    constructor() {
      // VectorStore of the most recently used document (its `namespace` is the docId).
      this.vectorStore = null;
      this.initialized = false;
      // IDs of documents indexed during this session.
      this.indexedDocs = new Set();
      // In-flight promise of the lazy RerankClient load; lets concurrent callers share it.
      this._rerankLoading = null;
    }

    /**
     * Check that the dependencies are loaded and the embedding API is configured.
     *
     * @returns {Promise<boolean>} true when vector search can be used.
     */
    async init() {
      if (this.initialized) return true;

      // Dependency check.
      if (!window.EmbeddingClient || !window.VectorStore) {
        console.warn('[SemanticVectorSearch] 依赖未加载');
        return false;
      }

      // Embedding configuration check. A missing `config` object is treated as
      // "not configured" rather than raising a TypeError.
      const config = window.EmbeddingClient.config;
      if (!config?.enabled || !config?.apiKey) {
        console.warn('[SemanticVectorSearch] Embedding API未配置');
        return false;
      }

      this.initialized = true;
      return true;
    }

    /**
     * Return the VectorStore for `docId`, creating and initialising it when the
     * cached store belongs to another document.
     *
     * The store is cached only after `init()` resolves, so a failed
     * initialisation is retried on the next call instead of leaving a
     * half-initialised store behind.
     *
     * @param {string} docId - Document ID, used as the store namespace.
     * @returns {Promise<Object>} The ready store.
     */
    async _ensureStore(docId) {
      if (!this.vectorStore || this.vectorStore.namespace !== docId) {
        const store = new window.VectorStore(docId);
        await store.init();
        this.vectorStore = store;
      }
      return this.vectorStore;
    }

    /** Show a toast through ChatbotUtils when progress display is enabled and the helper exists. */
    _toast(enabled, message, type, duration) {
      if (enabled && window.ChatbotUtils?.showToast) {
        window.ChatbotUtils.showToast(message, type, duration);
      }
    }

    /** Record `docId` as indexed and publish the ready flag and timestamp on `window.data`. */
    _markIndexReady(docId) {
      this.indexedDocs.add(docId);
      if (window.data) {
        window.data.vectorIndexReady = true;
        window.data.vectorIndexTimestamp = Date.now();
      }
    }

    /**
     * Store the successfully embedded records and refresh the in-memory index.
     *
     * Records whose vector is not an array (failed embeddings) are skipped.
     *
     * @param {Array<Object>} records - Source records (groups or chunks).
     * @param {Array} vectors - Embedding results, index-aligned with `records`.
     * @param {Function} toItem - Maps `(record, vector)` to a store item.
     * @returns {Promise<{stored: number, skipped: number}>}
     * @throws {Error} When records were supplied but no vector was produced.
     */
    async _persistVectors(records, vectors, toItem) {
      const items = [];
      records.forEach((record, idx) => {
        const vector = vectors?.[idx];
        if (Array.isArray(vector)) items.push(toItem(record, vector));
      });
      const skipped = records.length - items.length;

      // Nothing usable came back: reporting success here would mark the
      // document as indexed while every later search returns nothing.
      if (records.length > 0 && items.length === 0) {
        throw new Error('向量生成全部失败,未建立索引');
      }

      await this.vectorStore.batchUpsert(items);
      // Load into the in-memory index.
      await this.vectorStore.loadMemoryIndex();
      return { stored: items.length, skipped };
    }

    /**
     * Build the vector index for semantic groups.
     *
     * @param {Array<Object>} groups - Semantic groups (`groupId`, `keywords`, `summary`, `digest`, ...).
     * @param {string} docId - Document ID.
     * @param {Object} [options]
     * @param {boolean} [options.showProgress=true] - Show toasts.
     * @param {boolean} [options.forceRebuild=false] - Re-index even if already indexed this session.
     * @returns {Promise<void>}
     * @throws {Error} When vector search is unavailable or indexing fails.
     */
    async indexGroups(groups, docId, options = {}) {
      const { showProgress = true, forceRebuild = false } = options;

      if (!await this.init()) {
        throw new Error('向量搜索未初始化');
      }

      // Already indexed during this session.
      if (this.indexedDocs.has(docId) && !forceRebuild) {
        console.log(`[SemanticVectorSearch] 文档 ${docId} 已建立索引,跳过`);
        return;
      }

      await this._ensureStore(docId);
      this._toast(showProgress, '正在建立向量索引...', 'info', 3000);

      try {
        // Text to embed: keywords + summary + full (untruncated) digest.
        const texts = groups.map((g) => {
          const keywords = (g.keywords || []).join(' ');
          const summary = g.summary || '';
          const digest = g.digest || '';
          return `${keywords}\n${summary}\n${digest}`.trim();
        });

        console.log(`[SemanticVectorSearch] 开始生成 ${texts.length} 个向量...`);

        const vectors = await window.EmbeddingClient.batchEmbed(texts);

        const { stored, skipped } = await this._persistVectors(groups, vectors, (g, vector) => ({
          id: g.groupId,
          vector,
          metadata: {
            docId,
            groupId: g.groupId,
            charCount: g.charCount,
            keywords: g.keywords,
            summary: g.summary,
            segments: g.segments,
          },
        }));

        this._markIndexReady(docId);

        console.log(`[SemanticVectorSearch] 向量索引建立完成,共 ${stored} 个意群${skipped > 0 ? `(跳过 ${skipped} 个失败向量)` : ''}`);
        this._toast(showProgress, '向量索引建立完成', 'success', 2000);
      } catch (error) {
        console.error('[SemanticVectorSearch] 建立索引失败:', error);
        this._toast(showProgress, '向量索引建立失败', 'error', 3000);
        throw error;
      }
    }

    /**
     * Build the vector index for chunks (newer pipeline).
     *
     * Vectors already persisted for the document are reused when their count
     * equals `chunks.length`.
     *
     * @param {Array<Object>} chunks - enrichedChunks (`chunkId`, `text`, `belongsToGroup`, `position`, `charCount`).
     * @param {string} docId - Document ID.
     * @param {Object} [options]
     * @param {boolean} [options.showProgress=true] - Show toasts / progress toast.
     * @param {boolean} [options.forceRebuild=false] - Ignore cached vectors.
     * @returns {Promise<void>}
     * @throws {Error} When vector search is unavailable or indexing fails.
     */
    async indexChunks(chunks, docId, options = {}) {
      const { showProgress = true, forceRebuild = false } = options;

      if (!await this.init()) {
        throw new Error('向量搜索未初始化');
      }

      await this._ensureStore(docId);

      // Check the persisted vectors, not only the in-session `indexedDocs` set.
      if (!forceRebuild) {
        try {
          await this.vectorStore.loadMemoryIndex();
          const existingCount = this.vectorStore.memoryIndex?.length || 0;

          if (existingCount === chunks.length) {
            // Counts match: treat the document as already indexed.
            console.log(`[SemanticVectorSearch] 文档 ${docId} 已有 ${existingCount} 个向量缓存,直接使用`);
            this._markIndexReady(docId);
            this._toast(showProgress, '向量索引已就绪(从缓存加载)', 'success', 2000);
            return;
          }
          if (existingCount > 0) {
            console.warn(`[SemanticVectorSearch] 向量数量不匹配(缓存${existingCount}个,当前${chunks.length}个),重新生成`);
          }
        } catch (err) {
          console.warn('[SemanticVectorSearch] 加载向量缓存失败,将重新生成:', err);
        }
      }

      // Progress toast (optional helper).
      let progressToast = null;
      if (showProgress && window.ChatbotUtils && typeof window.ChatbotUtils.showProgressToast === 'function') {
        progressToast = window.ChatbotUtils.showProgressToast('开始生成向量索引...', 0);
      }
      const closeProgressToast = () => {
        if (progressToast && typeof progressToast.close === 'function') {
          progressToast.close();
        }
      };

      try {
        // The chunk text is embedded as-is.
        const texts = chunks.map((c) => c.text);

        console.log(`[SemanticVectorSearch] 开始生成 ${texts.length} 个chunk向量...`);

        const vectors = await window.EmbeddingClient.batchEmbed(texts, {
          onProgress: (current, total, message) => {
            // Guard against total === 0, which would otherwise yield NaN.
            const percent = total > 0 ? Math.round((current / total) * 100) : 0;
            if (progressToast && typeof progressToast.update === 'function') {
              progressToast.update(`${message} (${percent}%)`, percent);
            }
            console.log(`[SemanticVectorSearch] 向量生成进度: ${current}/${total} (${percent}%)`);
          },
        });

        const { stored, skipped } = await this._persistVectors(chunks, vectors, (chunk, vector) => ({
          id: chunk.chunkId,
          vector,
          metadata: {
            docId,
            chunkId: chunk.chunkId,
            belongsToGroup: chunk.belongsToGroup,
            position: chunk.position,
            charCount: chunk.charCount,
            // Only the first 200 characters are stored, as a preview.
            text: (chunk.text || '').substring(0, 200),
          },
        }));

        this._markIndexReady(docId);

        console.log(`[SemanticVectorSearch] 向量索引建立完成,共 ${stored} 个chunks${skipped > 0 ? `(跳过 ${skipped} 个失败向量)` : ''}`);

        // Replace the progress toast with the success toast.
        closeProgressToast();
        this._toast(showProgress, '向量索引建立完成', 'success', 2000);
      } catch (error) {
        console.error('[SemanticVectorSearch] 建立索引失败:', error);
        closeProgressToast();
        this._toast(showProgress, '向量索引建立失败', 'error', 3000);
        throw error;
      }
    }

    /**
     * Vector search over chunks (pure vector search, no keyword fallback).
     *
     * Failures are logged and reported as an empty result; this method does
     * not reject.
     *
     * @param {string} query - User query.
     * @param {Array<Object>} chunks - enrichedChunks, used to return the full chunk objects.
     * @param {Object} [options]
     * @param {number} [options.topK=10] - Maximum number of results.
     * @param {number} [options.threshold=0.3] - Minimum similarity score.
     * @returns {Promise<Array<Object>>} Matching chunks with a `score` field; when
     *   reranked they also carry `rerankScore` and `originalScore`.
     */
    async search(query, chunks = [], options = {}) {
      const { topK = 10, threshold = 0.3 } = options;

      if (!await this.init()) {
        console.warn('[SemanticVectorSearch] 向量搜索未初始化');
        return [];
      }

      try {
        // A local reference keeps this search on one store even if another call
        // swaps `this.vectorStore` while we await.
        const store = await this._ensureStore(this.getCurrentDocId());

        const stats = await store.stats();
        if (stats.count === 0) {
          console.warn('[SemanticVectorSearch] 索引为空');
          return [];
        }

        const queryVector = await window.EmbeddingClient.embed(query);
        const results = await store.search(queryVector, topK);

        // Drop low-score hits.
        const filtered = results.filter((r) => r.score >= threshold);

        console.log(`[SemanticVectorSearch] 向量检索匹配 ${filtered.length} 个chunks,分数范围: ${filtered[0]?.score.toFixed(3)} - ${filtered[filtered.length - 1]?.score.toFixed(3)}`);

        // Map hits back to the full chunk objects supplied by the caller. Hits
        // without a matching chunk (e.g. group-index entries) are dropped.
        const chunkMap = new Map((chunks ?? []).map((c) => [c.chunkId, c]));
        const matchedChunks = filtered
          .map((r) => {
            const chunk = chunkMap.get(r.metadata?.chunkId);
            return chunk ? { ...chunk, score: r.score } : null;
          })
          .filter(Boolean)
          .slice(0, topK);

        // Nothing to rerank: skip the lazy load and the rerank request.
        if (matchedChunks.length === 0) {
          return matchedChunks;
        }

        // Lazy-load RerankClient once if the page did not include it.
        if (!window.RerankClient) {
          await this._ensureRerankClientLoaded();
        }
        this._logRerankDiagnostics();

        if (!(window.RerankClient && window.RerankClient.shouldRerank('vector'))) {
          console.log('[SemanticVectorSearch] 跳过重排(shouldRerank=false 或 RerankClient 未加载)');
          return matchedChunks;
        }

        try {
          console.log(`[SemanticVectorSearch] 对 ${matchedChunks.length} 个结果进行重排...`);

          const docs = matchedChunks.map((c) => c.text || '');
          const rerankResults = await window.RerankClient.rerank(query, docs, {
            topN: topK,
            searchType: 'vector',
          });

          // Reorder by rerank result; entries pointing outside `matchedChunks`
          // are ignored instead of discarding the whole rerank.
          const rerankedChunks = rerankResults
            .filter((r) => matchedChunks[r.index] !== undefined)
            .map((r) => ({
              ...matchedChunks[r.index],
              rerankScore: r.relevance_score,
              originalScore: matchedChunks[r.index].score,
              score: r.relevance_score, // The rerank score becomes the final score.
            }));

          console.log(`[SemanticVectorSearch] 重排完成,返回 ${rerankedChunks.length} 个结果`);
          return rerankedChunks;
        } catch (error) {
          // Rerank is best-effort: fall back to the vector ordering.
          console.warn('[SemanticVectorSearch] 重排失败,使用原始结果:', error);
          return matchedChunks;
        }
      } catch (error) {
        console.error('[SemanticVectorSearch] 检索失败:', error);
        return [];
      }
    }

    /** Diagnostic log: whether RerankClient is loaded, whether it would rerank, and a trimmed config. */
    _logRerankDiagnostics() {
      try {
        const hasRerank = !!window.RerankClient;
        const should = hasRerank ? window.RerankClient.shouldRerank('vector') : false;
        const cfg = hasRerank ? window.RerankClient.config || {} : {};
        console.log('[SemanticVectorSearch][diag] Rerank loaded:', hasRerank, '| shouldRerank(vector):', should, '| cfg:', {
          enabled: cfg.enabled,
          scope: cfg.scope,
          provider: cfg.provider,
          endpoint: cfg.endpoint,
          model: cfg.model,
          topN: cfg.topN,
        });
      } catch (e) { /* diagnostics must never break a search */ }
    }

    /**
     * Candidate URLs for rerank-client.js, in priority order:
     * 1) next to the loaded semantic-vector-search.js,
     * 2) next to the loaded embedding-client.js,
     * 3) a path relative to the document base URI (fallback; may not fit
     *    pages served from sub-directories).
     *
     * @returns {Array<string>} De-duplicated URLs.
     */
    _rerankScriptCandidates() {
      const scripts = Array.from(document.getElementsByTagName('script'));
      const candidates = [];

      for (const sibling of ['semantic-vector-search.js', 'embedding-client.js']) {
        const tag = scripts.find((s) => (s.src || '').includes(sibling));
        if (tag && tag.src) {
          candidates.push(tag.src.replace(sibling, 'rerank-client.js'));
        }
      }

      try {
        candidates.push(new URL('js/chatbot/agents/rerank-client.js', document.baseURI).toString());
      } catch { /* unusable base URI: skip the fallback */ }

      return Array.from(new Set(candidates.filter(Boolean)));
    }

    /**
     * Lazy-load the RerankClient script so that pages which did not include
     * it can still rerank.
     *
     * @returns {Promise<boolean>} true when `window.RerankClient` is available afterwards.
     */
    async _ensureRerankClientLoaded() {
      if (window.RerankClient) return true;
      if (this._rerankLoading) return this._rerankLoading;
      // No DOM (worker / non-browser host): nothing can be injected.
      if (typeof document === 'undefined') return false;

      const candidates = this._rerankScriptCandidates();
      console.log('[SemanticVectorSearch][diag] 试图动态加载 RerankClient,候选URL:', candidates);

      this._rerankLoading = new Promise((resolve) => {
        try {
          // A rerank-client script tag already exists: give it a moment to finish.
          const existingTag = Array.from(document.getElementsByTagName('script'))
            .find((s) => (s.src || '').includes('rerank-client.js'));
          if (existingTag) {
            setTimeout(() => resolve(!!window.RerankClient), 150);
            return;
          }

          // Try each candidate URL in turn until one loads.
          const tryLoad = (idx) => {
            if (idx >= candidates.length) {
              console.warn('[SemanticVectorSearch][diag] 动态加载 RerankClient 失败(无可用URL)');
              resolve(false);
              return;
            }
            const url = candidates[idx];
            const s = document.createElement('script');
            s.src = url;
            s.async = true;
            s.onload = () => {
              console.log('[SemanticVectorSearch][diag] 动态加载 RerankClient 成功:', url);
              resolve(!!window.RerankClient);
            };
            s.onerror = () => {
              console.warn('[SemanticVectorSearch][diag] 加载失败,尝试下一个URL:', url);
              tryLoad(idx + 1);
            };
            document.head.appendChild(s);
          };

          tryLoad(0);
        } catch (e) {
          console.warn('[SemanticVectorSearch][diag] 懒加载 RerankClient 异常:', e);
          resolve(false);
        }
      });

      try {
        return await this._rerankLoading;
      } finally {
        this._rerankLoading = null;
      }
    }

    /**
     * Resolve the current document ID: `ChatbotCore.getCurrentDocId()` first,
     * then `window.data.id`, then the literal 'default'.
     *
     * @returns {string}
     */
    getCurrentDocId() {
      if (window.ChatbotCore?.getCurrentDocId) {
        return window.ChatbotCore.getCurrentDocId();
      }
      if (window.data?.id) {
        return window.data.id;
      }
      return 'default';
    }

    /**
     * Delete the vector index of a document.
     *
     * The deletion runs on a store whose namespace is `docId`. The active
     * store is left untouched when it belongs to another document, so
     * deleting one document never redirects searches of the current one.
     *
     * @param {string} docId - Document ID.
     * @returns {Promise<void>}
     */
    async deleteIndex(docId) {
      let store = this.vectorStore;
      if (!store || store.namespace !== docId) {
        store = new window.VectorStore(docId);
        await store.init();
        // Keep the previous behaviour of caching the store when none existed.
        if (!this.vectorStore) this.vectorStore = store;
      }

      await store.deleteByDocId(docId);
      this.indexedDocs.delete(docId);

      console.log(`[SemanticVectorSearch] 已删除文档 ${docId} 的向量索引`);
    }

    /**
     * Report the index status of a document.
     *
     * @param {string} docId - Document ID.
     * @returns {Promise<{indexed: boolean, count: number, dimensions: *, size: string}>}
     *   `size` is `stats.size / 1024` formatted as "<n> KB" (0 when absent).
     */
    async getIndexStatus(docId) {
      const store = await this._ensureStore(docId);
      const stats = await store.stats();
      return {
        indexed: stats.count > 0,
        count: stats.count,
        dimensions: stats.dimensions,
        size: `${((stats.size ?? 0) / 1024).toFixed(2)} KB`,
      };
    }
  }

  // Export the shared singleton instance.
  window.SemanticVectorSearch = new SemanticVectorSearch();

  console.log('[SemanticVectorSearch] 意群向量搜索已加载');

// Fall back to globalThis when no `window` global exists.
})(typeof window !== 'undefined' ? window : globalThis);