paper-burner/js/chatbot/agents/bm25-search.js

588 lines
18 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// js/chatbot/agents/bm25-search.js
// BM25 检索算法实现(向量搜索的降级方案)
(function(window) {
'use strict';
/**
 * BM25 (Best Matching 25) lexical ranking.
 *
 * A probabilistic retrieval model that combines term frequency (TF) with
 * inverse document frequency (IDF). Intended as the fallback when vector
 * search is unavailable.
 */
class BM25Search {
  constructor() {
    // Inverted index: Map<term, Array<{ docIndex, freq }>>; null until built.
    this.index = null;
    this.documents = [];
    this.avgDocLength = 0;
    // Empirical BM25 defaults.
    this.k1 = 1.5; // term-frequency saturation, usually 1.2-2.0
    this.b = 0.75; // document-length normalisation, usually 0.5-0.8
  }

  /**
   * Split text into index/query terms.
   *
   * Chinese text is emitted as 2-grams, 3-grams and single characters; Latin
   * words are lower-cased; digit runs are kept whole. N-grams are built inside
   * each contiguous run of Chinese characters only, so punctuation, spaces or
   * Latin text between two characters never produces an artificial term.
   * Duplicates are kept on purpose: they carry the term frequency.
   *
   * @param {string} text - Text to tokenize.
   * @returns {Array<string>} Terms in the order 2-grams, 3-grams, single
   *   characters, Latin words, numbers.
   */
  tokenize(text) {
    if (!text) return [];
    const source = String(text);
    const tokens = [];

    const chineseRuns = source.match(/[\u4e00-\u9fa5]+/g) || [];
    for (const size of [2, 3]) {
      for (const run of chineseRuns) {
        for (let start = 0; start + size <= run.length; start++) {
          tokens.push(run.slice(start, start + size));
        }
      }
    }
    // Single characters act as the last-resort match.
    for (const run of chineseRuns) {
      for (const char of run) tokens.push(char);
    }

    for (const word of source.match(/[a-zA-Z]+/g) || []) {
      tokens.push(word.toLowerCase());
    }
    for (const digits of source.match(/\d+/g) || []) {
      tokens.push(digits);
    }
    return tokens;
  }

  /**
   * Build the index from a list of groups. An empty or missing list only
   * logs a warning and leaves any existing index untouched.
   *
   * @param {Array<Object>} groups - Objects read via `groupId`, `summary`,
   *   `keywords`, `charCount` and the text fields used by
   *   `prepareDocumentText`.
   */
  buildIndex(groups) {
    if (!Array.isArray(groups) || groups.length === 0) {
      console.warn('[BM25Search] 输入意群为空');
      return;
    }

    this.documents = groups.map((group) => {
      const text = this.prepareDocumentText(group);
      const tokens = this.tokenize(text);
      return {
        id: group.groupId,
        text,
        tokens,
        length: tokens.length,
        metadata: {
          summary: group.summary,
          keywords: group.keywords,
          charCount: group.charCount
        }
      };
    });

    const totalLength = this.documents.reduce((sum, doc) => sum + doc.length, 0);
    this.avgDocLength = totalLength / this.documents.length;
    this.index = this.buildInvertedIndex();
    console.log(`[BM25Search] 索引构建完成,文档数: ${this.documents.length},平均长度: ${this.avgDocLength.toFixed(1)}`);
  }

  /**
   * Assemble the text that gets indexed for one group. Fields are weighted by
   * repetition: keywords x3, summary x2, then the first 1000 characters of
   * `digest`, then the first 1200 characters of `text` (or `fullText` when
   * `text` is absent).
   *
   * @param {Object} group
   * @returns {string}
   */
  prepareDocumentText(group) {
    const parts = [];

    if (Array.isArray(group.keywords) && group.keywords.length > 0) {
      const keywordText = group.keywords.join(' ');
      parts.push(keywordText, keywordText, keywordText);
    }
    if (group.summary) {
      parts.push(group.summary, group.summary);
    }
    if (typeof group.digest === 'string' && group.digest.length > 0) {
      parts.push(group.digest.slice(0, 1000));
    }

    const body = [group.text, group.fullText].find(
      (value) => typeof value === 'string' && value.length > 0
    );
    if (body) {
      parts.push(body.slice(0, 1200));
    }
    return parts.join(' ');
  }

  /**
   * Build the inverted index from `this.documents`.
   *
   * @returns {Map<string, Array<{docIndex: number, freq: number}>>} Postings
   *   per term, in ascending `docIndex` order.
   */
  buildInvertedIndex() {
    const index = new Map();
    this.documents.forEach((doc, docIndex) => {
      const termFreq = new Map();
      for (const token of doc.tokens) {
        termFreq.set(token, (termFreq.get(token) || 0) + 1);
      }
      for (const [term, freq] of termFreq) {
        let postings = index.get(term);
        if (!postings) {
          postings = [];
          index.set(term, postings);
        }
        postings.push({ docIndex, freq });
      }
    });
    return index;
  }

  /**
   * Inverse document frequency:
   * IDF(q) = ln((N - df(q) + 0.5) / (df(q) + 0.5) + 1)
   *
   * @param {string} term
   * @returns {number}
   */
  calculateIDF(term) {
    const N = this.documents.length;
    const df = this.index?.get(term)?.length ?? 0;
    return Math.log((N - df + 0.5) / (df + 0.5) + 1);
  }

  /**
   * TF part of the BM25 formula for one term in one document:
   * f * (k1 + 1) / (f + k1 * (1 - b + b * |D| / avgdl))
   *
   * @private
   * @param {number} freq - Occurrences of the term in the document.
   * @param {number} docLength - Token count of the document.
   * @returns {number}
   */
  _termWeight(freq, docLength) {
    const numerator = freq * (this.k1 + 1);
    const denominator = freq + this.k1 * (1 - this.b + this.b * (docLength / this.avgDocLength));
    return numerator / denominator;
  }

  /**
   * BM25 score of a single document:
   * BM25(D,Q) = sum over q of IDF(q) * termWeight(f(q,D), |D|)
   * Repeated query terms contribute once per occurrence.
   *
   * @param {number} docIndex - Position in `this.documents`.
   * @param {Array<string>} queryTerms
   * @returns {number} 0 when the document or the index does not exist.
   */
  calculateBM25Score(docIndex, queryTerms) {
    const doc = this.documents[docIndex];
    if (!doc || !this.index) return 0;

    let score = 0;
    for (const term of queryTerms) {
      const postings = this.index.get(term);
      if (!postings) continue;
      const posting = postings.find((p) => p.docIndex === docIndex);
      if (!posting) continue;
      score += this.calculateIDF(term) * this._termWeight(posting.freq, doc.length);
    }
    return score;
  }

  /**
   * Score every document in one pass over the posting lists of the query
   * terms. Per document, contributions are added in query-term order, so the
   * result equals calling `calculateBM25Score` for each document, without
   * scanning each posting list once per document.
   *
   * @private
   * @param {Array<string>} queryTerms
   * @returns {Array<number>} Scores aligned with `this.documents`.
   */
  _scoreAllDocuments(queryTerms) {
    const scores = new Array(this.documents.length).fill(0);
    for (const term of queryTerms) {
      const postings = this.index.get(term);
      if (!postings) continue;
      const idf = this.calculateIDF(term);
      for (const { docIndex, freq } of postings) {
        scores[docIndex] += idf * this._termWeight(freq, this.documents[docIndex].length);
      }
    }
    return scores;
  }

  /**
   * Free-text search.
   *
   * @param {string} query - Query text.
   * @param {number} topK - Maximum number of results.
   * @param {number} threshold - Results must score strictly above this value.
   * @returns {Array<{id: string, score: number, metadata: Object}>} Best
   *   matches, highest score first.
   */
  search(query, topK = 5, threshold = 0) {
    if (!this.index || this.documents.length === 0) {
      console.warn('[BM25Search] 索引未构建');
      return [];
    }
    const queryTerms = this.tokenize(query);
    if (queryTerms.length === 0) {
      console.warn('[BM25Search] 查询为空');
      return [];
    }
    console.log(`[BM25Search] 查询词: ${queryTerms.slice(0, 10).join(', ')}${queryTerms.length > 10 ? '...' : ''}`);

    const scores = this._scoreAllDocuments(queryTerms);
    const results = this.documents
      .map((doc, docIndex) => ({ id: doc.id, score: scores[docIndex], metadata: doc.metadata }))
      .filter((entry) => entry.score > threshold)
      .sort((a, b) => b.score - a.score)
      .slice(0, topK);

    if (results.length > 0) {
      console.log(`[BM25Search] 返回 ${results.length} 个结果,分数范围: ${results[0].score.toFixed(3)} - ${results[results.length - 1].score.toFixed(3)}`);
    } else {
      // Avoid printing "undefined - undefined" for an empty result list.
      console.log('[BM25Search] 未找到匹配结果');
    }
    return results;
  }

  /**
   * Keyword search: every keyword is tokenized exactly like indexed text, and
   * a document's score is multiplied by 3 for each keyword that occurs
   * verbatim in its indexed text. Keywords are trimmed before both steps, so
   * surrounding whitespace cannot defeat the phrase match.
   *
   * @param {Array<string>} keywords - Keywords; non-string and blank entries
   *   are ignored.
   * @param {number} topK - Maximum number of results.
   * @param {number} threshold - Results must score strictly above this value.
   * @returns {Array<{id: string, score: number, metadata: Object}>}
   */
  searchKeywords(keywords, topK = 5, threshold = 0) {
    if (!this.index || this.documents.length === 0) {
      console.warn('[BM25Search] 索引未构建');
      return [];
    }
    if (!Array.isArray(keywords) || keywords.length === 0) {
      console.warn('[BM25Search] 关键词为空');
      return [];
    }

    const PHRASE_BOOST = 3.0;
    const originalKeywords = keywords
      .filter((kw) => typeof kw === 'string')
      .map((kw) => kw.trim())
      .filter(Boolean);
    const queryTerms = originalKeywords.flatMap((kw) => this.tokenize(kw));
    if (queryTerms.length === 0) {
      console.warn('[BM25Search] 关键词处理后为空');
      return [];
    }
    console.log(`[BM25Search-Keywords] 原始关键词: ${originalKeywords.join(', ')}`);
    console.log(`[BM25Search-Keywords] 分词后查询词(前10个): ${[...new Set(queryTerms)].slice(0, 10).join(', ')}${queryTerms.length > 10 ? '...' : ''}`);

    const scores = this._scoreAllDocuments(queryTerms);
    const ranked = this.documents
      .map((doc, docIndex) => {
        let phraseBoost = 1.0;
        for (const keyword of originalKeywords) {
          if (doc.text.includes(keyword)) phraseBoost *= PHRASE_BOOST;
        }
        return {
          phraseBoost,
          result: { id: doc.id, score: scores[docIndex] * phraseBoost, metadata: doc.metadata }
        };
      })
      .filter((entry) => entry.result.score > threshold)
      .sort((a, b) => b.result.score - a.result.score)
      .slice(0, topK);

    const results = ranked.map((entry) => entry.result);
    if (results.length > 0) {
      const boostedCount = ranked.filter((entry) => entry.phraseBoost > 1).length;
      console.log(`[BM25Search-Keywords] 返回 ${results.length} 个结果,分数范围: ${results[0]?.score.toFixed(3)} - ${results[results.length - 1]?.score.toFixed(3)}${boostedCount > 0 ? ` (${boostedCount}个短语加权)` : ''}`);
    } else {
      console.log(`[BM25Search-Keywords] 未找到匹配结果`);
    }
    return results;
  }

  /**
   * Index statistics.
   *
   * @returns {{documentCount: number, termCount: number, avgDocLength: string, totalTokens: number}|null}
   *   null when no index has been built; `avgDocLength` is formatted with one
   *   decimal.
   */
  getStats() {
    if (!this.index) return null;
    return {
      documentCount: this.documents.length,
      termCount: this.index.size,
      avgDocLength: this.avgDocLength.toFixed(1),
      totalTokens: this.documents.reduce((sum, doc) => sum + doc.length, 0)
    };
  }

  /**
   * Drop the index and all indexed documents.
   */
  clear() {
    this.index = null;
    this.documents = [];
    this.avgDocLength = 0;
    console.log('[BM25Search] 索引已清空');
  }
}
/**
 * Adapter that plugs BM25Search into the chatbot: it indexes either semantic
 * groups or chunks and maps BM25 hits back to the caller's full objects.
 *
 * A single BM25 index is shared by both kinds of collection, so the adapter
 * remembers which kind it indexed last and rebuilds when a search asks for
 * the other kind.
 */
class SemanticBM25Search {
  constructor() {
    this.bm25 = new BM25Search();
    this.indexedDocId = null;
    // 'groups' | 'chunks' | null - what the current index was built from.
    this.indexedKind = null;
  }

  /**
   * Build the BM25 index from semantic groups.
   *
   * @param {Array<Object>} groups - Group objects keyed by `groupId`.
   * @param {string} docId - Identifier stored in `indexedDocId`.
   */
  indexGroups(groups, docId) {
    if (!Array.isArray(groups) || groups.length === 0) {
      console.warn('[SemanticBM25Search] 意群为空');
      return;
    }
    this.bm25.buildIndex(groups);
    this.indexedDocId = docId;
    this.indexedKind = 'groups';
    console.log('[SemanticBM25Search] BM25索引建立完成');
  }

  /**
   * Build the BM25 index from chunks. Each chunk is converted to a
   * group-shaped object: `chunkId` becomes the id, the first 150 characters
   * of the text serve as the summary, and there are no keywords. A chunk
   * without string text is indexed as empty text.
   *
   * @param {Array<Object>} chunks - Chunk objects read via `chunkId`, `text`
   *   and `charCount`.
   * @param {string} docId - Identifier stored in `indexedDocId`.
   */
  indexChunks(chunks, docId) {
    if (!Array.isArray(chunks) || chunks.length === 0) {
      console.warn('[SemanticBM25Search] chunks为空');
      return;
    }
    const chunkDocs = chunks.map((c) => {
      const text = typeof c?.text === 'string' ? c.text : String(c?.text ?? '');
      return {
        groupId: c?.chunkId,
        text,
        summary: text.substring(0, 150),
        keywords: [],
        charCount: c?.charCount
      };
    });
    this.bm25.buildIndex(chunkDocs);
    this.indexedDocId = docId;
    this.indexedKind = 'chunks';
    console.log(`[SemanticBM25Search] BM25索引建立完成${chunks.length} 个chunks`);
  }

  /**
   * Make sure an index of the requested kind exists, building it from
   * `items` when the index is missing or was built by this adapter from the
   * other kind of collection (whose ids could never resolve).
   *
   * @private
   * @param {'groups'|'chunks'} kind
   * @param {Array<Object>} items
   */
  _ensureIndex(kind, items) {
    const missing = !this.bm25.index || this.bm25.documents.length === 0;
    const otherKind = this.indexedKind !== null && this.indexedKind !== kind;
    if (!missing && !otherKind) return;

    console.warn('[SemanticBM25Search] 索引未建立,现在建立...');
    const docId = this.getCurrentDocId();
    if (kind === 'chunks') {
      this.indexChunks(items, docId);
    } else {
      this.indexGroups(items, docId);
    }
  }

  /**
   * Free-text search over chunks.
   *
   * @param {string} query
   * @param {Array<Object>} chunks - Candidate chunks; also used to build the
   *   index lazily.
   * @param {{topK?: number, threshold?: number}} [options] - Defaults:
   *   topK 10, threshold 0.1.
   * @returns {Array<Object>} Copies of the matching chunks with an added
   *   `score`, best first; empty when `chunks` is not a non-empty array.
   */
  searchChunks(query, chunks, options = {}) {
    const { topK = 10, threshold = 0.1 } = options ?? {};
    if (!Array.isArray(chunks) || chunks.length === 0) return [];

    this._ensureIndex('chunks', chunks);
    const results = this.bm25.search(query, topK, threshold);

    const chunkMap = new Map(chunks.map((c) => [c?.chunkId, c]));
    return results.flatMap((r) => {
      const chunk = chunkMap.get(r.id);
      return chunk ? [{ ...chunk, score: r.score }] : [];
    });
  }

  /**
   * Keyword search over chunks.
   *
   * First looks for chunks whose text contains a keyword verbatim (no index
   * needed). If any chunk matches, only those phrase hits are returned,
   * scored as 1000 + 10 per matched keyword - floor(earliest match
   * position / 50); `threshold` is not applied to them. Otherwise falls back
   * to `BM25Search.searchKeywords`, which does n-gram matching.
   *
   * @param {Array<string>|string} keywords - Keywords; a single value is
   *   treated as a one-element list. Entries are stringified and trimmed.
   * @param {Array<Object>} chunks
   * @param {{topK?: number, threshold?: number}} [options] - Defaults:
   *   topK 10, threshold 0.
   * @returns {Array<Object>} Copies of the matching chunks with `score` and
   *   `matchedKeywords`; phrase hits additionally carry `_matchedKeywords`.
   */
  searchChunksKeywords(keywords, chunks, options = {}) {
    const { topK = 10, threshold = 0.0 } = options ?? {};
    if (!Array.isArray(chunks) || chunks.length === 0) return [];

    const normalizedKeywords = (Array.isArray(keywords) ? keywords : [keywords])
      .map((k) => (k == null ? '' : String(k)).trim())
      .filter(Boolean);

    const phraseHits = [];
    for (const chunk of chunks) {
      const text = String(chunk?.text || '');
      if (!text) continue;

      const matched = [];
      let firstPos = Infinity;
      for (const kw of normalizedKeywords) {
        const pos = text.indexOf(kw);
        if (pos === -1) continue;
        matched.push(kw);
        firstPos = Math.min(firstPos, pos);
      }
      if (matched.length === 0) continue;

      phraseHits.push({
        ...chunk,
        score: 1000 + matched.length * 10 - Math.floor(firstPos / 50),
        // `_matchedKeywords` duplicates `matchedKeywords`; kept so the
        // returned shape stays what existing callers receive.
        _matchedKeywords: matched,
        matchedKeywords: matched
      });
    }
    if (phraseHits.length > 0) {
      phraseHits.sort((a, b) => b.score - a.score);
      return phraseHits.slice(0, topK);
    }

    // No verbatim hit anywhere: rank by BM25 instead.
    this._ensureIndex('chunks', chunks);
    const results = this.bm25.searchKeywords(normalizedKeywords, topK, threshold);

    const chunkMap = new Map(chunks.map((c) => [c?.chunkId, c]));
    return results.flatMap((r) => {
      const chunk = chunkMap.get(r.id);
      // This path is only reached when no chunk contains a keyword verbatim,
      // so the list of verbatim matches is necessarily empty.
      return chunk ? [{ ...chunk, score: r.score, matchedKeywords: [] }] : [];
    });
  }

  /**
   * Free-text search over semantic groups.
   *
   * @param {string} query
   * @param {Array<Object>} groups - Candidate groups; also used to build the
   *   index lazily.
   * @param {{topK?: number, threshold?: number}} [options] - Defaults:
   *   topK 8, threshold 0.1.
   * @returns {Array<Object>} The matching group objects themselves (not
   *   copies), best first; empty when `groups` is not a non-empty array.
   */
  search(query, groups, options = {}) {
    const { topK = 8, threshold = 0.1 } = options ?? {};
    if (!Array.isArray(groups) || groups.length === 0) return [];

    this._ensureIndex('groups', groups);
    const results = this.bm25.search(query, topK, threshold);

    const groupMap = new Map(groups.map((g) => [g?.groupId, g]));
    return results.map((r) => groupMap.get(r.id)).filter(Boolean);
  }

  /**
   * Resolve the id of the current document: `window.ChatbotCore`'s
   * `getCurrentDocId()` when available, else `window.data.id`, else
   * 'default'.
   *
   * @returns {string}
   */
  getCurrentDocId() {
    if (window.ChatbotCore?.getCurrentDocId) {
      return window.ChatbotCore.getCurrentDocId();
    }
    if (window.data?.id) {
      return window.data.id;
    }
    return 'default';
  }

  /**
   * Drop the index and forget what it was built from.
   */
  clear() {
    this.bm25.clear();
    this.indexedDocId = null;
    this.indexedKind = null;
  }

  /**
   * Statistics of the underlying BM25 index.
   *
   * @returns {Object|null} Whatever `BM25Search.getStats()` returns.
   */
  getStats() {
    return this.bm25.getStats();
  }
}
// Publish the class and one shared search engine instance on the global object.
Object.assign(window, {
  BM25Search,
  SemanticBM25Search: new SemanticBM25Search()
});
console.log('[BM25Search] BM25检索引擎已加载');
})(window);