// js/chatbot/agents/bm25-search.js // BM25 检索算法实现(向量搜索的降级方案) (function(window) { 'use strict'; /** * BM25 (Best Matching 25) 检索算法 * 基于概率信息检索模型,考虑词频(TF)和逆文档频率(IDF) * 适合作为向量搜索失败时的降级方案 */ class BM25Search { constructor() { this.index = null; this.documents = []; this.avgDocLength = 0; // BM25参数(经验最优值) this.k1 = 1.5; // 词频饱和参数(1.2-2.0) this.b = 0.75; // 文档长度归一化参数(0.5-0.8) } /** * 中文分词(n-gram + 完整词保留) * @param {string} text - 待分词文本 * @returns {Array} 词语数组 */ tokenize(text) { if (!text) return []; // 移除标点符号 const cleaned = text.replace(/[,。!?;:、""''()《》【】\s]+/g, ' '); const tokens = []; // 处理中文:生成2-gram, 3-gram, 和单字 const chineseChars = cleaned.match(/[\u4e00-\u9fa5]/g) || []; // 2-gram(如"雷曼") for (let i = 0; i < chineseChars.length - 1; i++) { tokens.push(chineseChars[i] + chineseChars[i + 1]); } // 3-gram(如"雷曼公"、"曼公司") for (let i = 0; i < chineseChars.length - 2; i++) { tokens.push(chineseChars[i] + chineseChars[i + 1] + chineseChars[i + 2]); } // 单字(兜底) tokens.push(...chineseChars); // 提取英文单词(转小写) const englishWords = cleaned.match(/[a-zA-Z]+/g) || []; tokens.push(...englishWords.map(w => w.toLowerCase())); // 提取数字 const numbers = cleaned.match(/\d+/g) || []; tokens.push(...numbers); // 不去重:保留重复以反映真实词频(TF) // 若需限制内存,可在此处做频次上限截断(例如每词最多计数 N 次) return tokens; } /** * 构建BM25索引 * @param {Array} groups - 意群数组 */ buildIndex(groups) { if (!groups || groups.length === 0) { console.warn('[BM25Search] 输入意群为空'); return; } this.documents = groups.map(g => ({ id: g.groupId, text: this.prepareDocumentText(g), tokens: [], length: 0, metadata: { summary: g.summary, keywords: g.keywords, charCount: g.charCount } })); // 分词 this.documents.forEach(doc => { doc.tokens = this.tokenize(doc.text); doc.length = doc.tokens.length; }); // 计算平均文档长度 this.avgDocLength = this.documents.reduce((sum, doc) => sum + doc.length, 0) / this.documents.length; // 构建倒排索引 this.index = this.buildInvertedIndex(); console.log(`[BM25Search] 索引构建完成,文档数: ${this.documents.length},平均长度: ${this.avgDocLength.toFixed(1)}`); } /** * 准备文档文本(用于索引) */ prepareDocumentText(group) { const parts = []; // 关键词(权重最高,重复3次) if (group.keywords && group.keywords.length > 0) { const keywordText = group.keywords.join(' '); parts.push(keywordText, keywordText, keywordText); } // 摘要(权重次之,重复2次) if (group.summary) { parts.push(group.summary, group.summary); } // digest(权重正常) if (group.digest) { parts.push(group.digest.slice(0, 1000)); // 取前1000字 } // 正文兜底:优先使用 text,其次 fullText(较低权重,单次加入) // 在 chunks 索引中,text 即为 chunk 正文;在意群中 fullText 为整段内容 if (group.text && typeof group.text === 'string' && group.text.length > 0) { parts.push(group.text.slice(0, 1200)); } else if (group.fullText && typeof group.fullText === 'string' && group.fullText.length > 0) { parts.push(group.fullText.slice(0, 1200)); } return parts.join(' '); } /** * 构建倒排索引 * @returns {Map} 倒排索引 { term: [{ docIndex, freq }, ...] } */ buildInvertedIndex() { const index = new Map(); this.documents.forEach((doc, docIndex) => { // 计算词频 const termFreq = new Map(); doc.tokens.forEach(token => { termFreq.set(token, (termFreq.get(token) || 0) + 1); }); // 更新倒排索引 termFreq.forEach((freq, term) => { if (!index.has(term)) { index.set(term, []); } index.get(term).push({ docIndex, freq }); }); }); return index; } /** * 计算IDF (Inverse Document Frequency) * IDF(q) = log((N - df(q) + 0.5) / (df(q) + 0.5) + 1) */ calculateIDF(term) { const N = this.documents.length; const df = this.index.has(term) ? this.index.get(term).length : 0; return Math.log((N - df + 0.5) / (df + 0.5) + 1); } /** * 计算BM25分数 * BM25(D,Q) = Σ IDF(q) * (f(q,D) * (k1 + 1)) / (f(q,D) + k1 * (1 - b + b * |D| / avgdl)) */ calculateBM25Score(docIndex, queryTerms) { const doc = this.documents[docIndex]; let score = 0; queryTerms.forEach(term => { if (!this.index.has(term)) return; const idf = this.calculateIDF(term); // 查找该词在文档中的频率 const postings = this.index.get(term); const posting = postings.find(p => p.docIndex === docIndex); if (!posting) return; const freq = posting.freq; const docLength = doc.length; // BM25公式 const numerator = freq * (this.k1 + 1); const denominator = freq + this.k1 * (1 - this.b + this.b * (docLength / this.avgDocLength)); score += idf * (numerator / denominator); }); return score; } /** * 搜索 * @param {string} query - 查询文本 * @param {number} topK - 返回top K结果 * @param {number} threshold - 最低分数阈值(可选) * @returns {Array<{id: string, score: number, metadata: Object}>} */ search(query, topK = 5, threshold = 0) { if (!this.index || this.documents.length === 0) { console.warn('[BM25Search] 索引未构建'); return []; } // 分词 const queryTerms = this.tokenize(query); if (queryTerms.length === 0) { console.warn('[BM25Search] 查询为空'); return []; } console.log(`[BM25Search] 查询词: ${queryTerms.slice(0, 10).join(', ')}${queryTerms.length > 10 ? '...' : ''}`); // 计算所有文档的BM25分数 const scores = this.documents.map((doc, docIndex) => ({ id: doc.id, score: this.calculateBM25Score(docIndex, queryTerms), metadata: doc.metadata })); // 过滤低分结果 const filtered = scores.filter(s => s.score > threshold); // 排序并返回topK filtered.sort((a, b) => b.score - a.score); const results = filtered.slice(0, topK); console.log(`[BM25Search] 返回 ${results.length} 个结果,分数范围: ${results[0]?.score.toFixed(3)} - ${results[results.length - 1]?.score.toFixed(3)}`); return results; } /** * 关键词精确搜索(使用n-gram分词+短语匹配加权) * @param {Array} keywords - 关键词数组 * @param {number} topK - 返回结果数 * @param {number} threshold - 最低分数阈值 * @returns {Array<{id: string, score: number, metadata: Object}>} */ searchKeywords(keywords, topK = 5, threshold = 0) { if (!this.index || this.documents.length === 0) { console.warn('[BM25Search] 索引未构建'); return []; } if (!Array.isArray(keywords) || keywords.length === 0) { console.warn('[BM25Search] 关键词为空'); return []; } const originalKeywords = keywords.filter(kw => kw && typeof kw === 'string' && kw.trim()); // 对每个关键词生成n-gram查询词 const queryTerms = keywords.flatMap(kw => { if (!kw || typeof kw !== 'string') return []; const cleaned = kw.trim(); if (!cleaned) return []; const terms = []; // 提取中文部分,生成2-gram和3-gram const chineseChars = cleaned.match(/[\u4e00-\u9fa5]/g) || []; if (chineseChars.length > 0) { // 2-gram for (let i = 0; i < chineseChars.length - 1; i++) { terms.push(chineseChars[i] + chineseChars[i + 1]); } // 3-gram for (let i = 0; i < chineseChars.length - 2; i++) { terms.push(chineseChars[i] + chineseChars[i + 1] + chineseChars[i + 2]); } // 单字 terms.push(...chineseChars); } // 提取英文部分(转小写) const englishMatches = cleaned.match(/[a-zA-Z]+/g) || []; terms.push(...englishMatches.map(w => w.toLowerCase())); // 提取数字部分 const numberMatches = cleaned.match(/\d+/g) || []; terms.push(...numberMatches); return terms; }); if (queryTerms.length === 0) { console.warn('[BM25Search] 关键词处理后为空'); return []; } console.log(`[BM25Search-Keywords] 原始关键词: ${originalKeywords.join(', ')}`); console.log(`[BM25Search-Keywords] 分词后查询词(前10个): ${[...new Set(queryTerms)].slice(0, 10).join(', ')}${queryTerms.length > 10 ? '...' : ''}`); // 计算所有文档的BM25分数 const scores = this.documents.map((doc, docIndex) => { const bm25Score = this.calculateBM25Score(docIndex, queryTerms); // 短语匹配加权:检查原文是否包含完整关键词 let phraseBoost = 1.0; for (const keyword of originalKeywords) { if (doc.text.includes(keyword)) { phraseBoost *= 3.0; // 包含完整短语,分数×3.0 } } return { id: doc.id, score: bm25Score * phraseBoost, metadata: doc.metadata, _phraseBoost: phraseBoost // 用于debug }; }); // 过滤低分结果 const filtered = scores.filter(s => s.score > threshold); // 排序并返回topK filtered.sort((a, b) => b.score - a.score); const results = filtered.slice(0, topK); if (results.length > 0) { const boostedCount = results.filter(r => r._phraseBoost > 1).length; console.log(`[BM25Search-Keywords] 返回 ${results.length} 个结果,分数范围: ${results[0]?.score.toFixed(3)} - ${results[results.length - 1]?.score.toFixed(3)}${boostedCount > 0 ? ` (${boostedCount}个短语加权)` : ''}`); } else { console.log(`[BM25Search-Keywords] 未找到匹配结果`); } // 移除debug字段 results.forEach(r => delete r._phraseBoost); return results; } /** * 获取索引统计信息 */ getStats() { if (!this.index) return null; return { documentCount: this.documents.length, termCount: this.index.size, avgDocLength: this.avgDocLength.toFixed(1), totalTokens: this.documents.reduce((sum, doc) => sum + doc.length, 0) }; } /** * 清空索引 */ clear() { this.index = null; this.documents = []; this.avgDocLength = 0; console.log('[BM25Search] 索引已清空'); } } /** * 意群BM25搜索引擎(集成到现有系统) */ class SemanticBM25Search { constructor() { this.bm25 = new BM25Search(); this.indexedDocId = null; } /** * 为意群建立BM25索引 */ indexGroups(groups, docId) { if (!groups || groups.length === 0) { console.warn('[SemanticBM25Search] 意群为空'); return; } this.bm25.buildIndex(groups); this.indexedDocId = docId; console.log('[SemanticBM25Search] BM25索引建立完成'); } /** * 为chunks建立BM25索引 */ indexChunks(chunks, docId) { if (!chunks || chunks.length === 0) { console.warn('[SemanticBM25Search] chunks为空'); return; } // 将chunks转换为类似意群的结构 const chunkDocs = chunks.map(c => ({ groupId: c.chunkId, // 使用chunkId作为id text: c.text, summary: c.text.substring(0, 150), // 前150字作为摘要 keywords: [], // chunks没有keywords charCount: c.charCount })); this.bm25.buildIndex(chunkDocs); this.indexedDocId = docId; console.log(`[SemanticBM25Search] BM25索引建立完成,共 ${chunks.length} 个chunks`); } /** * 搜索chunks(返回完整chunk对象) */ searchChunks(query, chunks, options = {}) { const { topK = 10, threshold = 0.1 } = options; // 检查索引是否存在 if (!this.bm25.index || this.bm25.documents.length === 0) { console.warn('[SemanticBM25Search] 索引未建立,现在建立...'); const docId = this.getCurrentDocId(); this.indexChunks(chunks, docId); } // BM25搜索 const results = this.bm25.search(query, topK, threshold); // 从chunks中找出对应的完整对象 const chunkMap = new Map(chunks.map(c => [c.chunkId, c])); const matchedChunks = results .map(r => { const chunk = chunkMap.get(r.id); if (chunk) { return { ...chunk, score: r.score }; } return null; }) .filter(Boolean); return matchedChunks; } /** * 关键词精确搜索chunks(返回完整chunk对象,不做n-gram拆分) */ searchChunksKeywords(keywords, chunks, options = {}) { const { topK = 10, threshold = 0.0 } = options; // 放宽阈值,尽量召回 // 即时短语优先匹配(无需索引):直接在 chunk 文本里找完整关键词 const normalizedKeywords = (Array.isArray(keywords) ? keywords : [String(keywords || '')]) .map(k => (k || '').trim()) .filter(Boolean); const phraseHits = []; if (normalizedKeywords.length > 0 && Array.isArray(chunks)) { for (const chunk of chunks) { const text = String(chunk.text || ''); if (!text) continue; const matched = normalizedKeywords.filter(kw => kw && text.includes(kw)); if (matched.length > 0) { // 简单打分:完整短语优先,按匹配个数和最早出现位置加权 const firstPos = Math.min(...matched.map(kw => Math.max(0, text.indexOf(kw)))); const score = 1000 + matched.length * 10 - Math.floor(firstPos / 50); phraseHits.push({ ...chunk, score, _matchedKeywords: matched }); } } phraseHits.sort((a, b) => b.score - a.score); } // 若短语命中已有足量结果,优先返回 if (phraseHits.length > 0) { return phraseHits.slice(0, topK).map(hit => ({ ...hit, matchedKeywords: hit._matchedKeywords })); } // 短语无命中则走 BM25(需要索引) if (!this.bm25.index || this.bm25.documents.length === 0) { console.warn('[SemanticBM25Search] 索引未建立,现在建立...'); const docId = this.getCurrentDocId(); this.indexChunks(chunks, docId); } const results = this.bm25.searchKeywords(normalizedKeywords, topK, threshold); const chunkMap = new Map(chunks.map(c => [c.chunkId, c])); const matchedChunks = results .map(r => { const chunk = chunkMap.get(r.id); if (chunk) { // 标注匹配关键词(便于UI展示) const text = String(chunk.text || ''); const matched = normalizedKeywords.filter(kw => kw && text.includes(kw)); return { ...chunk, score: r.score, matchedKeywords: matched }; } return null; }) .filter(Boolean); return matchedChunks; } /** * 搜索意群(返回完整意群对象) */ search(query, groups, options = {}) { const { topK = 8, threshold = 0.1 } = options; // 检查索引是否存在 if (!this.bm25.index || this.bm25.documents.length === 0) { console.warn('[SemanticBM25Search] 索引未建立,现在建立...'); const docId = this.getCurrentDocId(); this.indexGroups(groups, docId); } // BM25搜索 const results = this.bm25.search(query, topK, threshold); // 从groups中找出对应的完整对象 const groupMap = new Map(groups.map(g => [g.groupId, g])); const matchedGroups = results .map(r => groupMap.get(r.id)) .filter(Boolean); return matchedGroups; } /** * 获取当前文档ID */ getCurrentDocId() { if (window.ChatbotCore?.getCurrentDocId) { return window.ChatbotCore.getCurrentDocId(); } if (window.data?.id) { return window.data.id; } return 'default'; } /** * 清空索引 */ clear() { this.bm25.clear(); this.indexedDocId = null; } /** * 获取统计信息 */ getStats() { return this.bm25.getStats(); } } // 导出 window.BM25Search = BM25Search; window.SemanticBM25Search = new SemanticBM25Search(); console.log('[BM25Search] BM25检索引擎已加载'); })(window);