paper-burner/js/process/glossary-matcher.js

233 lines
6.2 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// process/glossary-matcher.js
/**
* 高效多模式术语匹配器
* 使用改进的 Aho-Corasick 算法,时间复杂度 O(L + m)
* L = 文本长度m = 匹配数
*/
class GlossaryMatcher {
constructor() {
this.root = { children: new Map(), entries: [] };
this.caseInsensitiveRoot = { children: new Map(), entries: [] };
this.hasWholeWordEntries = false;
}
/**
* 构建 Trie 树
* @param {Array} entries - 术语条目数组
*/
build(entries) {
this.root = { children: new Map(), entries: [] };
this.caseInsensitiveRoot = { children: new Map(), entries: [] };
this.hasWholeWordEntries = false;
for (const entry of entries) {
if (!entry.term || !entry.translation) continue;
const root = entry.caseSensitive ? this.root : this.caseInsensitiveRoot;
const term = entry.caseSensitive ? entry.term : entry.term.toLowerCase();
let node = root;
for (const char of term) {
if (!node.children.has(char)) {
node.children.set(char, { children: new Map(), entries: [] });
}
node = node.children.get(char);
}
node.entries.push({
term: entry.term,
translation: entry.translation,
wholeWord: !!entry.wholeWord,
caseSensitive: !!entry.caseSensitive
});
if (entry.wholeWord) {
this.hasWholeWordEntries = true;
}
}
}
/**
* 检查是否为单词边界
*/
_isWordBoundary(text, index) {
if (index < 0 || index >= text.length) return true;
const char = text[index];
return !/[A-Za-z0-9_]/.test(char);
}
/**
* 检查位置是否有 ASCII 字符(用于判断是否需要边界检查)
*/
_hasAsciiWord(str) {
return /[A-Za-z0-9_]/.test(str);
}
/**
* 在文本中查找所有匹配
* @param {string} text - 要搜索的文本
* @returns {Array} 匹配的术语列表
*/
findMatches(text) {
if (!text) return [];
const matches = [];
const seen = new Set();
const textLower = text.toLowerCase();
// 遍历文本的每个位置
for (let i = 0; i < text.length; i++) {
// 尝试大小写敏感匹配
this._searchFromPosition(text, i, this.root, true, matches, seen);
// 尝试大小写不敏感匹配
this._searchFromPosition(textLower, i, this.caseInsensitiveRoot, false, matches, seen);
}
return matches;
}
/**
* 从指定位置开始搜索
*/
_searchFromPosition(text, startPos, root, caseSensitive, matches, seen) {
let node = root;
let pos = startPos;
while (pos < text.length) {
const char = text[pos];
if (!node.children.has(char)) {
break;
}
node = node.children.get(char);
pos++;
// 检查当前节点是否有完整术语
if (node.entries.length > 0) {
const matchedText = text.substring(startPos, pos);
for (const entry of node.entries) {
// 检查大小写是否匹配
if (entry.caseSensitive !== caseSensitive) continue;
// 如果需要全词匹配,检查边界
if (entry.wholeWord && this._hasAsciiWord(entry.term)) {
if (!this._isWordBoundary(text, startPos - 1) ||
!this._isWordBoundary(text, pos)) {
continue;
}
}
const key = `${entry.term}=>${entry.translation}`;
if (!seen.has(key)) {
matches.push({
term: entry.term,
translation: entry.translation
});
seen.add(key);
}
}
}
}
}
/**
* 快速检查文本是否包含任何术语(用于预过滤)
*/
hasAnyMatch(text) {
if (!text) return false;
const textLower = text.toLowerCase();
// 快速检查:遍历到第一个匹配就返回
for (let i = 0; i < text.length; i++) {
if (this._hasMatchAtPosition(text, i, this.root, true)) return true;
if (this._hasMatchAtPosition(textLower, i, this.caseInsensitiveRoot, false)) return true;
}
return false;
}
_hasMatchAtPosition(text, startPos, root, caseSensitive) {
let node = root;
let pos = startPos;
while (pos < text.length) {
const char = text[pos];
if (!node.children.has(char)) break;
node = node.children.get(char);
pos++;
if (node.entries.length > 0) {
// 找到至少一个匹配(不检查全词边界,快速返回)
for (const entry of node.entries) {
if (entry.caseSensitive === caseSensitive) {
if (!entry.wholeWord) return true;
// 检查全词边界
if (this._hasAsciiWord(entry.term)) {
if (this._isWordBoundary(text, startPos - 1) &&
this._isWordBoundary(text, pos)) {
return true;
}
} else {
return true;
}
}
}
}
}
return false;
}
}
// 全局实例和缓存
let _globalMatcher = null;
let _matcherVersion = 0;
/**
* 获取或创建全局匹配器实例
*/
function getOrCreateMatcher() {
if (typeof loadGlossaryEntries !== 'function') {
return new GlossaryMatcher();
}
const allEntries = loadGlossaryEntries();
const enabledEntries = allEntries.filter(e => e && e.enabled && e.term && e.translation);
// 简单版本控制:长度 + 前几个术语的哈希
const versionHash = enabledEntries.length +
enabledEntries.slice(0, 10).map(e => e.term).join('|');
if (_globalMatcher && _matcherVersion === versionHash) {
return _globalMatcher;
}
const matcher = new GlossaryMatcher();
matcher.build(enabledEntries);
_globalMatcher = matcher;
_matcherVersion = versionHash;
return matcher;
}
// 导出函数供旧 API 兼容
function findGlossaryMatchesFast(text) {
const matcher = getOrCreateMatcher();
return matcher.findMatches(text);
}
// 挂载到全局
if (typeof window !== 'undefined') {
window.GlossaryMatcher = GlossaryMatcher;
window.getGlossaryMatcher = getOrCreateMatcher;
window.findGlossaryMatchesFast = findGlossaryMatchesFast;
}