paper-burner/js/chatbot/agents/advanced-search-tools.js

382 lines
10 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// js/chatbot/agents/advanced-search-tools.js
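// Advanced search tools: regular-expression search and boolean-logic search.
// NOTE: the original header also lists fuzzy search, but this file only exports regexSearch and booleanSearch.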
(function(window) {
'use strict';
if (window.AdvancedSearchTools) return;
/**
 * Runs a global regular-expression search over `text` and returns each hit
 * together with a slice of surrounding text.
 *
 * @param {string} pattern - Regular-expression source to compile.
 * @param {string} text - Text to search.
 * @param {Object} [options] - Search options.
 * @param {number} [options.limit=20] - Maximum number of hits to return.
 * @param {number} [options.context=2000] - Characters of context kept on each side of a hit.
 * @param {boolean} [options.caseInsensitive=true] - Adds the `i` flag when truthy.
 * @param {boolean} [options.multiline=true] - Adds the `m` flag when truthy.
 * @returns {Array<Object>} One entry per hit: `match` (matched text), `matchOffset`,
 *   `matchLength`, `preview` (hit plus context) and `groups` (capture groups).
 *   Empty when `pattern` or `text` is falsy.
 * @throws {Error} When `pattern` cannot be compiled.
 */
function regexSearch(pattern, text, options = {}) {
  const { limit = 20, context = 2000, caseInsensitive = true, multiline = true } = options;

  if (!pattern || !text) {
    return [];
  }

  // The search is always global; the other flags are opt-out.
  const flags = ['g', caseInsensitive ? 'i' : '', multiline ? 'm' : ''].join('');

  let matcher;
  try {
    matcher = new RegExp(pattern, flags);
  } catch (err) {
    console.error('[AdvancedSearchTools] 正则表达式语法错误:', err.message);
    throw new Error(`正则表达式语法错误: ${err.message}`);
  }

  const hits = [];
  for (
    let found = matcher.exec(text);
    found !== null && hits.length < limit;
    found = matcher.exec(text)
  ) {
    const [matched, ...captured] = found;
    const offset = found.index;
    const previewFrom = Math.max(0, offset - context);
    const previewTo = Math.min(text.length, offset + matched.length + context);

    hits.push({
      match: matched,
      matchOffset: offset,
      matchLength: matched.length,
      preview: text.slice(previewFrom, previewTo),
      groups: captured
    });

    // A zero-width match leaves lastIndex where it was; step forward so the
    // next exec() call cannot return the same position forever.
    if (found.index === matcher.lastIndex) {
      matcher.lastIndex += 1;
    }
  }

  return hits;
}
/**
 * Boolean-logic search entry point.
 * Accepts queries built from AND, OR, NOT and parentheses,
 * e.g. "(CNN OR RNN) AND 对比 NOT 图像".
 *
 * @param {string} query - Boolean query string.
 * @param {string} text - Text to search.
 * @param {Object} [options] - Search options.
 * @param {number} [options.limit=20] - Maximum number of results.
 * @param {number} [options.context=2000] - Characters of context kept on each side of a match.
 * @param {boolean} [options.caseInsensitive=true] - Forwarded to the parser and matcher.
 * @returns {Array<Object>} One entry per match: `matchOffset`, `matchLength`, `preview`,
 *   `matchedTerms` and `relevanceScore`. Empty when `query` or `text` is falsy.
 * @throws {Error} Wraps any error raised while parsing or matching.
 */
function booleanSearch(query, text, options = {}) {
  const { limit = 20, context = 2000, caseInsensitive = true } = options;

  if (!query || !text) {
    return [];
  }

  try {
    const parsed = parseBooleanQuery(query, caseInsensitive);
    const found = findBooleanMatches(parsed, text, caseInsensitive);

    // Cap the result count first, then attach a context preview to each hit.
    const output = [];
    for (const hit of found.slice(0, limit)) {
      const previewFrom = Math.max(0, hit.position - context);
      const previewTo = Math.min(text.length, hit.position + hit.length + context);
      output.push({
        matchOffset: hit.position,
        matchLength: hit.length,
        preview: text.slice(previewFrom, previewTo),
        matchedTerms: hit.matchedTerms,
        relevanceScore: hit.score
      });
    }
    return output;
  } catch (err) {
    console.error('[AdvancedSearchTools] 布尔查询解析错误:', err.message);
    throw new Error(`布尔查询语法错误: ${err.message}`);
  }
}
/**
 * Normalises a boolean query string and splits it into tokens.
 *
 * Operator keywords (AND / OR / NOT, any letter case) that are surrounded by
 * whitespace are rewritten to upper case with single spaces around them.
 * Text inside double quotes is left untouched, so a quoted phrase such as
 * "rock and roll" keeps its wording instead of being rewritten to
 * "rock AND roll".
 *
 * @param {string} query - Raw boolean query.
 * @param {boolean} [caseInsensitive=true] - Carried through unchanged on the result.
 * @returns {{tokens: Array, caseInsensitive: boolean}} Token list from
 *   tokenizeBooleanQuery plus the case-sensitivity flag.
 */
function parseBooleanQuery(query, caseInsensitive = true) {
  const canonicalizeOperators = segment =>
    segment
      .replace(/\s+AND\s+/gi, ' AND ')
      .replace(/\s+OR\s+/gi, ' OR ')
      .replace(/\s+NOT\s+/gi, ' NOT ');

  // Splitting on the quote character puts quoted text at the odd indexes;
  // only the even (unquoted) segments may have their operators rewritten.
  const normalized = query
    .split('"')
    .map((segment, index) => (index % 2 === 1 ? segment : canonicalizeOperators(segment)))
    .join('"')
    .trim();

  return {
    tokens: tokenizeBooleanQuery(normalized),
    caseInsensitive
  };
}
/**
 * Splits a normalised boolean query into tokens.
 *
 * Token shapes:
 *   { type: 'term',     value: string }
 *   { type: 'operator', value: 'AND' | 'OR' | 'NOT' }
 *   { type: 'lparen',   value: '(' }
 *   { type: 'rparen',   value: ')' }
 *
 * Double quotes group characters (spaces and parentheses included) into a
 * single term; the quote characters themselves are dropped. Words are split
 * on the space character only.
 *
 * Every pending word goes through the same classification, so an operator
 * written directly before a parenthesis ("a AND(b)") is still reported as an
 * operator. A word that involved quotes ("AND") is always a literal term.
 *
 * @param {string} query - Query string; operators are expected in upper case.
 * @returns {Array<{type: string, value: string}>} Tokens in query order.
 */
function tokenizeBooleanQuery(query) {
  const OPERATORS = new Set(['AND', 'OR', 'NOT']);
  const tokens = [];
  let current = '';
  let inQuotes = false;
  // True once a quote has contributed to the pending word.
  let currentIsQuoted = false;

  // Emits the pending word (if any) as an operator or term and resets state.
  const flushCurrent = () => {
    const word = current.trim();
    if (word) {
      const isOperator = !currentIsQuoted && OPERATORS.has(word);
      tokens.push({ type: isOperator ? 'operator' : 'term', value: word });
    }
    current = '';
    currentIsQuoted = false;
  };

  for (const char of query) {
    if (char === '"') {
      inQuotes = !inQuotes;
      currentIsQuoted = true;
      continue;
    }
    if (inQuotes) {
      // Inside quotes every character is literal.
      current += char;
      continue;
    }
    if (char === '(' || char === ')') {
      flushCurrent();
      tokens.push({ type: char === '(' ? 'lparen' : 'rparen', value: char });
      continue;
    }
    if (char === ' ') {
      flushCurrent();
      continue;
    }
    current += char;
  }
  flushCurrent();

  return tokens;
}
/**
 * Finds every region of `text` that satisfies the parsed boolean query.
 *
 * Steps: look up the occurrences of each term token, evaluate the expression
 * against those occurrences, drop duplicate regions, then order the result by
 * start position.
 *
 * @param {{tokens: Array}} parsedQuery - Result of parseBooleanQuery.
 * @param {string} text - Text to search.
 * @param {boolean} caseInsensitive - Forwarded to the term lookup and the evaluator.
 * @returns {Array<Object>} De-duplicated matches sorted by ascending `position`.
 */
function findBooleanMatches(parsedQuery, text, caseInsensitive) {
  const queryTokens = parsedQuery.tokens;

  // Index term -> occurrence list; operator and parenthesis tokens are skipped.
  const occurrencesByTerm = new Map();
  for (const token of queryTokens) {
    if (token.type !== 'term') {
      continue;
    }
    occurrencesByTerm.set(token.value, findTermPositions(token.value, text, caseInsensitive));
  }

  const candidates = evaluateBooleanExpression(queryTokens, occurrencesByTerm, text, caseInsensitive);

  return deduplicateMatches(candidates).sort((left, right) => left.position - right.position);
}
/**
 * Lists every occurrence of `term` in `text`, overlapping occurrences included.
 *
 * @param {string} term - Literal text to look for.
 * @param {string} text - Text to search.
 * @param {boolean} caseInsensitive - When truthy, both strings are lower-cased before comparing.
 * @returns {Array<{start: number, end: number, term: string}>} Occurrences in
 *   ascending order; `end` is `start + term.length`. Empty when `term` or
 *   `text` is empty or missing.
 */
function findTermPositions(term, text, caseInsensitive) {
  const positions = [];

  // An empty needle must be rejected up front: indexOf('', pos) clamps its
  // result to the haystack length, so the scan below would never reach -1.
  if (!term || !text) {
    return positions;
  }

  const haystack = caseInsensitive ? text.toLowerCase() : text;
  const needle = caseInsensitive ? term.toLowerCase() : term;

  // NOTE(review): offsets are measured on the lower-cased copy. toLowerCase()
  // can change a string's length for a few characters (e.g. U+0130), in which
  // case offsets would drift relative to `text` - confirm whether that matters.

  // Advance by one (not by the needle length) so overlapping hits are reported.
  for (let at = haystack.indexOf(needle); at !== -1; at = haystack.indexOf(needle, at + 1)) {
    positions.push({
      start: at,
      end: at + term.length,
      term: term
    });
  }

  return positions;
}
/**
 * Evaluates a tokenised boolean query with a simplified proximity model.
 *
 * Terms are sorted into three lists according to the operator in effect when
 * the term is reached (AND initially):
 *   - NOT -> excluded; applies to one term, after which AND is in effect again
 *   - OR  -> optional; stays in effect until another operator token appears
 *   - anything else -> required
 * Parenthesis tokens are ignored. If no term ended up required, the first
 * optional term is promoted to required.
 *
 * Every occurrence of the first required term is an anchor. An anchor yields a
 * match when each other required term has an occurrence starting within 500
 * characters of the anchor start and no excluded term does. Optional terms
 * found in the same window widen the span and add to the score.
 *
 * NOTE(review): with this model the operand written before an OR is required,
 * not optional, and grouping is not honoured - confirm this is intended.
 *
 * @param {Array<{type: string, value: string}>} tokens - Tokenised query.
 * @param {Map<string, Array<{start: number, end: number}>>} termPositions - Occurrences per term.
 * @param {string} text - Not read by this function.
 * @param {boolean} caseInsensitive - Not read by this function.
 * @returns {Array<{position: number, length: number, matchedTerms: string[], score: number}>}
 *   One entry per valid anchor. Score is 1 per required term plus 0.5 per optional term found.
 */
function evaluateBooleanExpression(tokens, termPositions, text, caseInsensitive) {
  const PROXIMITY_WINDOW = 500; // characters, measured between start offsets

  // Sort terms into required / optional / excluded lists.
  const required = [];
  const optional = [];
  const excluded = [];
  let activeOperator = 'AND';
  for (const { type, value } of tokens) {
    if (type === 'operator') {
      activeOperator = value;
      continue;
    }
    if (type !== 'term') {
      continue;
    }
    switch (activeOperator) {
      case 'NOT':
        excluded.push(value);
        activeOperator = 'AND';
        break;
      case 'OR':
        optional.push(value);
        break;
      default:
        required.push(value);
    }
  }

  // A query made only of OR terms still needs something to anchor on.
  if (required.length === 0 && optional.length > 0) {
    required.push(optional.shift());
  }
  if (required.length === 0) {
    return [];
  }

  const occurrencesOf = term => termPositions.get(term) || [];
  const [anchorTerm, ...remainingRequired] = required;
  const found = [];

  for (const anchor of occurrencesOf(anchorTerm)) {
    // First listed occurrence of `term` whose start lies inside the window.
    const firstNearby = term =>
      occurrencesOf(term).find(p => Math.abs(p.start - anchor.start) <= PROXIMITY_WINDOW);

    const labels = [anchorTerm];
    let spanStart = anchor.start;
    let spanEnd = anchor.end;
    let score = 1;
    const absorb = (term, hit, weight) => {
      labels.push(term);
      spanStart = Math.min(spanStart, hit.start);
      spanEnd = Math.max(spanEnd, hit.end);
      score += weight;
    };

    let allRequiredNearby = true;
    for (const term of remainingRequired) {
      const hit = firstNearby(term);
      if (!hit) {
        allRequiredNearby = false;
        break;
      }
      absorb(term, hit, 1);
    }
    if (!allRequiredNearby) {
      continue;
    }

    for (const term of optional) {
      const hit = firstNearby(term);
      if (hit) {
        absorb(term, hit, 0.5);
      }
    }

    if (excluded.some(term => firstNearby(term))) {
      continue;
    }

    found.push({
      position: spanStart,
      length: spanEnd - spanStart,
      matchedTerms: [...new Set(labels)],
      score
    });
  }

  return found;
}
/**
 * Removes matches that cover the same region as an earlier match.
 * Two matches count as duplicates when both `position` and `length` are equal;
 * the first one encountered is kept and the input order is preserved.
 *
 * @param {Array<{position: number, length: number}>} matches - Candidate matches.
 * @returns {Array<Object>} New array without duplicate regions.
 */
function deduplicateMatches(matches) {
  // A Map iterates in insertion order, so the first match per region wins
  // and the survivors come back in their original order.
  const firstByRegion = new Map();
  for (const candidate of matches) {
    const regionKey = `${candidate.position}-${candidate.length}`;
    if (!firstByRegion.has(regionKey)) {
      firstByRegion.set(regionKey, candidate);
    }
  }
  return [...firstByRegion.values()];
}
// Export the tools: publish the two public entry points on the global object.
// Only regexSearch and booleanSearch are exposed; the other functions in this
// file stay private to the enclosing function scope.
window.AdvancedSearchTools = {
regexSearch,
booleanSearch
};
// Log a one-time load notice to the console.
console.log('[AdvancedSearchTools] 高级搜索工具已加载');
})(window);