310 lines
10 KiB
JavaScript
310 lines
10 KiB
JavaScript
// process/glossary.js
|
||
|
||
/**
|
||
* 翻译备择库(术语库)匹配与注入工具。
|
||
* 提供:
|
||
* - 获取启用的术语条目
|
||
* - 在给定文本中匹配命中条目
|
||
* - 构建注入到系统提示词中的术语翻译指引
|
||
*/
|
||
|
||
// 缓存编译好的正则表达式
|
||
let _glossaryRegexCache = null;
|
||
let _glossaryRegexVersion = 0;
|
||
|
||
function _loadEnabledGlossaryEntries() {
|
||
if (typeof loadGlossaryEntries !== 'function') return [];
|
||
const all = loadGlossaryEntries();
|
||
return all.filter(e => e && e.enabled && e.term && e.translation);
|
||
}
|
||
|
||
function _buildRegexMapForEntries(entries) {
|
||
const map = new Map();
|
||
for (const e of entries) {
|
||
const re = _buildRegexForEntry(e.term, !!e.wholeWord, !!e.caseSensitive);
|
||
if (re) {
|
||
map.set(e, re);
|
||
}
|
||
}
|
||
return map;
|
||
}
|
||
|
||
function _getOrBuildRegexMap() {
|
||
const entries = _loadEnabledGlossaryEntries();
|
||
const currentVersion = entries.length + entries.map(e => e.term).join('|');
|
||
|
||
if (_glossaryRegexCache && _glossaryRegexVersion === currentVersion) {
|
||
return _glossaryRegexCache;
|
||
}
|
||
|
||
_glossaryRegexCache = _buildRegexMapForEntries(entries);
|
||
_glossaryRegexVersion = currentVersion;
|
||
return _glossaryRegexCache;
|
||
}
|
||
|
||
function _buildRegexForEntry(term, wholeWord, caseSensitive) {
|
||
// 基本转义
|
||
const esc = (s) => s.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
|
||
const hasAsciiWord = /[A-Za-z0-9_]/.test(term);
|
||
const source = esc(term);
|
||
let pattern = source;
|
||
if (wholeWord && hasAsciiWord) {
|
||
// 英文等使用 \b 边界;对中文不加边界
|
||
pattern = `\\b${source}\\b`;
|
||
}
|
||
const flags = caseSensitive ? 'g' : 'gi';
|
||
try {
|
||
return new RegExp(pattern, flags);
|
||
} catch (e) {
|
||
console.warn('Invalid glossary regex from term:', term, e);
|
||
return null;
|
||
}
|
||
}
|
||
|
||
/**
|
||
* 在文本中查找命中的术语条目。
|
||
* 使用高效的 Trie 树多模式匹配算法(O(L+m) 复杂度)
|
||
* @param {string} text
|
||
* @returns {Array<{term:string, translation:string}>}
|
||
*/
|
||
function getGlossaryMatchesForText(text) {
|
||
if (!text) return [];
|
||
|
||
// 优先使用新的高效匹配器
|
||
if (typeof findGlossaryMatchesFast === 'function') {
|
||
const matches = findGlossaryMatchesFast(text);
|
||
return _filterAndSortMatches(matches, text);
|
||
}
|
||
|
||
// 降级到正则表达式方案
|
||
const regexMap = _getOrBuildRegexMap();
|
||
const matched = [];
|
||
const seen = new Set();
|
||
|
||
for (const [entry, regex] of regexMap) {
|
||
if (regex.test(text)) {
|
||
const key = `${entry.term}=>${entry.translation}`;
|
||
if (!seen.has(key)) {
|
||
matched.push({ term: entry.term, translation: entry.translation });
|
||
seen.add(key);
|
||
}
|
||
}
|
||
}
|
||
return _filterAndSortMatches(matched, text);
|
||
}
|
||
|
||
/**
|
||
* 检查是否应该应用智能过滤
|
||
* 如果至少有一个启用的术语库开启了智能过滤,则返回 true
|
||
* @returns {boolean}
|
||
*/
|
||
function _shouldApplySmartFilter() {
|
||
if (typeof loadGlossarySets !== 'function') return true; // 默认启用
|
||
|
||
const sets = loadGlossarySets();
|
||
const setIds = Object.keys(sets || {});
|
||
|
||
// 如果没有术语库,默认启用过滤
|
||
if (setIds.length === 0) return true;
|
||
|
||
// 检查是否至少有一个启用的术语库开启了智能过滤
|
||
for (const id of setIds) {
|
||
const set = sets[id];
|
||
if (set && set.enabled) {
|
||
// enableSmartFilter 默认为 true(未定义时)
|
||
if (set.enableSmartFilter !== false) {
|
||
return true;
|
||
}
|
||
}
|
||
}
|
||
|
||
// 所有启用的术语库都禁用了智能过滤
|
||
return false;
|
||
}
|
||
|
||
/**
|
||
* 过滤和排序匹配结果,优先保留重要术语
|
||
* @param {Array} matches - 原始匹配结果
|
||
* @returns {Array} 过滤和排序后的匹配结果
|
||
*/
|
||
function _filterAndSortMatches(matches) {
|
||
if (!matches || matches.length === 0) return [];
|
||
|
||
// 检查是否需要智能过滤
|
||
const shouldApplySmartFilter = _shouldApplySmartFilter();
|
||
|
||
if (!shouldApplySmartFilter) {
|
||
// 不应用智能过滤,只做简单排序
|
||
return matches.slice().sort((a, b) => {
|
||
return (b.term || '').length - (a.term || '').length;
|
||
});
|
||
}
|
||
|
||
// 应用智能过滤
|
||
// 常见通用词黑名单(这些词不应该作为专业术语)
|
||
const COMMON_WORDS_BLACKLIST = new Set([
|
||
'a', 'an', 'the', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
|
||
'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'should', 'could',
|
||
'may', 'might', 'must', 'can', 'of', 'in', 'on', 'at', 'to', 'for', 'with',
|
||
'by', 'from', 'as', 'that', 'this', 'these', 'those', 'it', 'its',
|
||
'and', 'or', 'but', 'if', 'so', 'not', 'no', 'yes', 'all', 'any', 'some',
|
||
'i', 'you', 'he', 'she', 'we', 'they', 'me', 'him', 'her', 'us', 'them',
|
||
'my', 'your', 'his', 'her', 'our', 'their', 'one', 'two', 'three',
|
||
'first', 'second', 'more', 'most', 'other', 'such', 'only', 'own', 'same',
|
||
'up', 'out', 'over', 'under', 'above', 'below', 'between', 'through',
|
||
'during', 'before', 'after', 'since', 'until', 'while', 'about', 'than',
|
||
'also', 'very', 'just', 'here', 'there', 'where', 'when', 'why', 'how',
|
||
'what', 'which', 'who', 'whom', 'whose', 'each', 'every', 'both', 'few',
|
||
'many', 'much', 'several', 'now', 'then', 'always', 'never', 'often',
|
||
'sometimes', 'usually', 'really', 'actually', 'basically', 'generally'
|
||
]);
|
||
|
||
// 常见通用短语黑名单(多词组合但不是专业术语)
|
||
const COMMON_PHRASES_BLACKLIST = new Set([
|
||
'with regard to', 'with respect to', 'in terms of', 'in relation to',
|
||
'with reference to', 'in connection with', 'in accordance with',
|
||
'as a result', 'as a result of', 'in addition to', 'in spite of',
|
||
'as well as', 'as long as', 'as soon as', 'in order to', 'in case of',
|
||
'by means of', 'on behalf of', 'because of', 'instead of', 'in front of',
|
||
'in place of', 'by way of', 'for the sake of', 'at the same time',
|
||
'in the meantime', 'in the end', 'at the end', 'in the beginning',
|
||
'for example', 'for instance', 'such as', 'and so on', 'and so forth',
|
||
'in other words', 'in fact', 'in practice', 'in theory', 'in general',
|
||
'in particular', 'in detail', 'in brief', 'in short', 'in conclusion',
|
||
'on the other hand', 'on the contrary', 'by the way', 'in the way',
|
||
'the situation', 'the case', 'the fact', 'the point', 'the problem',
|
||
'the question', 'the answer', 'the reason', 'the result', 'the purpose',
|
||
'at least', 'at most', 'at first', 'at last', 'at all', 'not at all',
|
||
'as usual', 'as follows', 'as mentioned', 'as noted', 'as shown'
|
||
]);
|
||
|
||
// 过滤逻辑
|
||
const MIN_TERM_LENGTH = 3; // 最小长度提高到 3
|
||
const filtered = matches.filter(m => {
|
||
const term = (m.term || '').toLowerCase().trim();
|
||
|
||
// 1. 过滤掉单词黑名单中的通用词
|
||
if (COMMON_WORDS_BLACKLIST.has(term)) {
|
||
return false;
|
||
}
|
||
|
||
// 2. 过滤掉短语黑名单中的通用短语
|
||
if (COMMON_PHRASES_BLACKLIST.has(term)) {
|
||
return false;
|
||
}
|
||
|
||
// 3. 保留长度 >= 3 的术语
|
||
if (term.length < MIN_TERM_LENGTH) {
|
||
// 除非是中文术语(中文术语可以更短)
|
||
if (!/[\u4e00-\u9fa5]/.test(term)) {
|
||
return false;
|
||
}
|
||
}
|
||
|
||
// 4. 过滤掉纯数字
|
||
if (/^\d+$/.test(term)) {
|
||
return false;
|
||
}
|
||
|
||
// 5. 保留包含大写字母的术语(可能是缩写或专有名词,如 API, NASA)
|
||
if (/[A-Z]/.test(m.term)) {
|
||
return true;
|
||
}
|
||
|
||
// 6. 保留多个单词组成的短语(更可能是专业术语)
|
||
// 但需要检查是否包含过多常用词
|
||
if (/\s/.test(term)) {
|
||
const words = term.split(/\s+/);
|
||
const commonWordCount = words.filter(w => COMMON_WORDS_BLACKLIST.has(w)).length;
|
||
// 如果超过一半是常用词,则过滤掉
|
||
if (commonWordCount > words.length / 2) {
|
||
return false;
|
||
}
|
||
return true;
|
||
}
|
||
|
||
// 7. 保留包含连字符或特殊字符的术语(如 machine-learning, AI/ML)
|
||
if (/[-_\/]/.test(term)) {
|
||
return true;
|
||
}
|
||
|
||
return true;
|
||
});
|
||
|
||
// 计算术语的"重要性得分"并排序
|
||
const scored = filtered.map(m => {
|
||
let score = 0;
|
||
const term = m.term || '';
|
||
|
||
// 长度越长,得分越高(长术语通常更专业)
|
||
score += term.length * 2;
|
||
|
||
// 包含大写字母(缩写、专有名词)+20
|
||
if (/[A-Z]/.test(term)) {
|
||
score += 20;
|
||
}
|
||
|
||
// 多词短语 +30
|
||
if (/\s/.test(term)) {
|
||
score += 30;
|
||
}
|
||
|
||
// 包含特殊字符 +10
|
||
if (/[-_\/]/.test(term)) {
|
||
score += 10;
|
||
}
|
||
|
||
// 中文术语 +15
|
||
if (/[\u4e00-\u9fa5]/.test(term)) {
|
||
score += 15;
|
||
}
|
||
|
||
return { ...m, _score: score };
|
||
});
|
||
|
||
// 按得分降序排序
|
||
scored.sort((a, b) => b._score - a._score);
|
||
|
||
// 移除得分属性并返回
|
||
return scored.map(({ _score, ...rest }) => rest);
|
||
}
|
||
|
||
/**
|
||
* 构建注入系统提示词的术语指引段落。
|
||
* 仅对当前分块/表格生效,控制模型按指定译法统一翻译。
|
||
* @param {Array<{term:string, translation:string}>} matches
|
||
* @param {string} targetLangName - 目标语言展示名(可直接使用 settings 的 targetLanguage 值或用户自定义)
|
||
*/
|
||
function buildGlossaryInstruction(matches, targetLangName) {
|
||
if (!matches || matches.length === 0) return '';
|
||
|
||
// 获取数量限制配置(检查所有启用的术语库,取最小值)
|
||
let maxTerms = 50; // 默认值
|
||
if (typeof loadGlossarySets === 'function') {
|
||
const sets = loadGlossarySets();
|
||
const setIds = Object.keys(sets || {});
|
||
for (const id of setIds) {
|
||
const set = sets[id];
|
||
if (set && set.enabled && set.maxTermsInPrompt) {
|
||
maxTerms = Math.max(1, Math.min(500, set.maxTermsInPrompt));
|
||
break; // 使用第一个启用的术语库的配置
|
||
}
|
||
}
|
||
}
|
||
|
||
// 限制术语数量
|
||
const limitedMatches = matches.slice(0, maxTerms);
|
||
|
||
const items = limitedMatches.map(m => `${m.term}→${m.translation}`).join('、');
|
||
|
||
return `[术语参考] ${items}
|
||
|
||
注:以上为本段可能涉及的专业术语建议译法,仅供参考。请以自然流畅的翻译为主,适当参考术语建议即可。`;
|
||
}
|
||
|
||
// 挂载到 processModule
|
||
if (typeof processModule !== 'undefined') {
|
||
processModule.getGlossaryMatchesForText = getGlossaryMatchesForText;
|
||
processModule.buildGlossaryInstruction = buildGlossaryInstruction;
|
||
}
|