// Source path: paper-burner/js/chatbot/core/smart-granularity-selector.js
//
// Repository web-viewer metadata captured with this file: 312 lines, 9.7 KiB, JavaScript
// (viewer actions at capture time: Raw / Blame / History).
//
// Viewer notice: this file contains Unicode characters that might be confused with
// other characters. If that is intentional, the warning can safely be ignored; the
// viewer's Escape button reveals the characters in question.

// js/chatbot/core/smart-granularity-selector.js
// 智能粒度选择器 - 根据问题类型和意群特征自动选择最佳粒度
(function(window) {
'use strict';
/**
 * Keyword patterns used to classify a user query into a query type.
 *
 * Intended granularity per type:
 * - overview:   high-level questions, answered from summaries
 * - extraction: information-extraction questions, answered from the full text
 * - analytical: analysis questions, answered from digests
 * A query that matches none of these patterns is classified as "specific"
 * (digest or full text) by analyzeQueryType.
 *
 * Key order is significant: analyzeQueryType walks the types in insertion
 * order and returns the first one that has a matching pattern, so a keyword
 * listed under two types (e.g. "作用") resolves to the earlier type.
 *
 * Every pattern that contains Latin keywords carries the `i` flag so that
 * capitalised input ("Formula", "Code", "Impact") is classified the same way
 * as lower-case input. No pattern uses the `g` or `y` flag, so `.test()` is
 * stateless and the shared RegExp objects are safe to reuse.
 */
const QUERY_PATTERNS = {
  overview: [
    /总结|概括|概述|简述|大意|主要内容|主题|讲.*什么|关于什么/,
    /整体|全文|全部|所有|overall|summary|general/i,
    /介绍|背景|目的|意义|作用/,
    /有哪些|包括.*什么|涉及.*什么/
  ],
  extraction: [
    /具体|详细|准确|精确|原文|exact|specific|detail/i,
    /数据|数值|数字|结果|table|figure|chart/i,
    /步骤|流程|过程|方法|algorithm|procedure/i,
    /公式|方程|equation|formula/i,
    /引用|citation|reference|出处/i,
    /代码|code|实现|implementation/i
  ],
  analytical: [
    /分析|解释|说明|explain|analyze|why|how/i,
    /原因|理由|依据|根据|原理|机制/,
    /比较|对比|区别|差异|联系|关系|compare/i,
    /优缺点|利弊|advantage|disadvantage/i,
    /影响|作用|效果|impact|effect/i
  ]
};
/**
 * Granularity rule per query type.
 *
 * Granularity levels, as described by the original author:
 * - summary: short abstract (about 80 characters) for quick scanning / index matching
 * - digest:  condensed version (about 1000 characters) for general analysis and Q&A
 * - full:    the complete text, for exact lookup and detailed analysis
 *
 * Each rule holds:
 * - default:     granularity used when nothing overrides it
 * - maxGroups:   upper bound on how many semantic groups to return
 * - description: human-readable rationale (returned to callers as `reasoning`)
 */
const GRANULARITY_RULES = (() => {
  // One-line factory so every rule reads as (granularity, group cap, rationale).
  const rule = (granularity, maxGroups, description) => ({
    default: granularity,
    maxGroups,
    description
  });

  return {
    // Overview questions may return more groups because summaries are short.
    overview: rule('summary', 10, '概览性查询:使用摘要快速扫描'),
    analytical: rule('digest', 5, '分析性查询:使用精要提供足够细节'),
    // Exact look-ups keep the group count low so the context does not grow too long.
    extraction: rule('full', 3, '提取性查询:使用全文确保信息完整'),
    specific: rule('digest', 5, '具体性查询:使用精要平衡细节与长度')
  };
})();
/**
 * Classify a user query by matching it against QUERY_PATTERNS.
 *
 * The query is coerced to a string and trimmed. Types are tried in the
 * insertion order of QUERY_PATTERNS and the first type owning a matching
 * pattern wins. Empty input and queries that match nothing are 'specific'.
 *
 * @param {string} query - The user's query.
 * @returns {string} Query type: a key of QUERY_PATTERNS, or 'specific'.
 */
function analyzeQueryType(query) {
  const text = String(query || '').trim();

  if (text.length > 0) {
    const firstMatch = Object.entries(QUERY_PATTERNS).find(
      ([, regexes]) => regexes.some((regex) => regex.test(text))
    );
    if (firstMatch) {
      return firstMatch[0];
    }
  }

  // Fallback type for empty or unrecognised queries.
  return 'specific';
}
/**
 * Adjust a proposed granularity to what a semantic group can actually provide.
 *
 * Rules, applied in order:
 * 1. A short group (fewer than 2000 characters) with usable full text is
 *    always served as 'full'.
 * 2. 'digest' requested but no usable digest: use 'full' if available,
 *    otherwise 'summary'.
 * 3. 'full' requested but no usable full text: use 'digest' if available,
 *    otherwise 'summary'.
 * A digest counts as usable when it is longer than 100 characters, the full
 * text when it is longer than 500 characters.
 *
 * The group length is `group.charCount`, falling back to the length of
 * `group.fullText` when `charCount` is missing or 0. Without that fallback a
 * group lacking `charCount` would always look "short" and be forced to 'full'
 * regardless of its real size.
 *
 * @param {Object} group - Semantic group (reads charCount, digest, fullText).
 * @param {string} baseGranularity - Granularity proposed by the caller.
 * @returns {string} Adjusted granularity; `baseGranularity` when no rule applies
 *   or when `group` is falsy.
 */
function adjustByGroupFeatures(group, baseGranularity) {
  if (!group) return baseGranularity;

  const hasDigest = !!(group.digest && group.digest.length > 100);
  const hasFull = !!(group.fullText && group.fullText.length > 500);
  // Same fallback that estimateTokenUsage uses for the 'full' granularity.
  const charCount = group.charCount || (group.fullText || '').length;

  // Rule 1: short groups are cheap enough to send in full.
  if (hasFull && charCount < 2000) {
    return 'full';
  }

  // Rule 2: no digest to serve, so move up to full text or down to the summary.
  if (baseGranularity === 'digest' && !hasDigest) {
    return hasFull ? 'full' : 'summary';
  }

  // Rule 3: no full text, so the digest is the richest level available.
  if (baseGranularity === 'full' && !hasFull) {
    return hasDigest ? 'digest' : 'summary';
  }

  return baseGranularity;
}
/**
 * Pick one granularity and a group cap for a query.
 *
 * Steps:
 * 1. Classify the query and look up its rule (falling back to 'specific').
 * 2. Start from `options.forceGranularity` if given, else the rule default.
 * 3. When the granularity was NOT forced and there are few candidates, raise
 *    it one level: 'summary' -> 'digest' for at most two candidates, otherwise
 *    anything below 'full' -> 'full' for a single candidate. A forced
 *    granularity is never raised by this heuristic.
 * 4. If `options.maxTokens` is set, keep shrinking the plan until the estimate
 *    fits or nothing is left to shrink: 'full' -> 'digest'; 'digest' ->
 *    'summary' when the estimate is more than 1.5x the budget; otherwise cut
 *    the group cap to 60% (minimum 1); with the cap already at 1, 'digest' ->
 *    'summary'. The budget is a hard limit, so it may lower even a forced
 *    granularity.
 *
 * @param {string} query - The user's query.
 * @param {Array} groups - Candidate semantic groups; a non-array is treated as empty.
 * @param {Object} options - Optional settings.
 * @param {string} [options.forceGranularity] - Granularity to use instead of the rule default.
 * @param {number} [options.maxGroups] - Group cap to use instead of the rule's cap.
 * @param {number} [options.maxTokens] - Token budget for the selected groups.
 * @returns {Object} { granularity, maxGroups, queryType, reasoning, estimatedTokens }
 */
function selectGranularity(query, groups = [], options = {}) {
  // Default parameters only cover `undefined`; guard explicit null / wrong types too.
  const candidates = Array.isArray(groups) ? groups : [];
  const opts = options || {};

  const queryType = analyzeQueryType(query);
  const rule = GRANULARITY_RULES[queryType] || GRANULARITY_RULES.specific;

  const forced = opts.forceGranularity;
  let granularity = forced || rule.default;
  let maxGroups = opts.maxGroups || rule.maxGroups;

  // Few candidates leave room for more detail, but never override an explicit choice.
  if (!forced) {
    if (candidates.length <= 2 && granularity === 'summary') {
      granularity = 'digest';
    } else if (candidates.length === 1 && granularity !== 'full') {
      granularity = 'full';
    }
  }

  const estimate = () => estimateTokenUsage(candidates.slice(0, maxGroups), granularity);
  let estimatedTokens = estimate();

  const budget = opts.maxTokens;
  if (budget) {
    // Every pass either lowers the granularity, strictly lowers the group cap,
    // or breaks, so the loop always terminates.
    while (estimatedTokens > budget) {
      const fewerGroups = Math.max(1, Math.floor(maxGroups * 0.6));
      if (granularity === 'full') {
        granularity = 'digest';
      } else if (granularity === 'digest' && estimatedTokens > budget * 1.5) {
        granularity = 'summary';
      } else if (fewerGroups < maxGroups) {
        maxGroups = fewerGroups;
      } else if (granularity === 'digest') {
        granularity = 'summary';
      } else {
        break; // one group at the lowest level: nothing left to shrink
      }
      estimatedTokens = estimate();
    }
  }

  return {
    granularity,
    maxGroups,
    queryType,
    reasoning: rule.description,
    estimatedTokens
  };
}
/**
 * Roughly estimate how many tokens a list of groups costs at a granularity.
 *
 * Characters counted per group:
 * - 'summary': length of `summary`
 * - 'digest':  length of `digest`
 * - 'full':    `charCount`, or the length of `fullText` when charCount is falsy
 * Any other granularity contributes nothing.
 *
 * The author's rule of thumb is about 1.5 characters per token for Chinese and
 * about 4 for English; this function simplifies that to 2 characters per token.
 *
 * @param {Array} groups - Semantic groups to measure.
 * @param {string} granularity - 'summary', 'digest' or 'full'.
 * @returns {number} Estimated token count (0 for a non-array or empty list).
 */
function estimateTokenUsage(groups, granularity) {
  if (!Array.isArray(groups) || groups.length === 0) return 0;

  // Character contribution of one group at the requested granularity.
  const charsOf = (entry) => {
    switch (granularity) {
      case 'summary':
        return (entry.summary || '').length;
      case 'digest':
        return (entry.digest || '').length;
      case 'full':
        return entry.charCount || (entry.fullText || '').length;
      default:
        return 0;
    }
  };

  const totalChars = groups.reduce((sum, entry) => sum + charsOf(entry), 0);

  // Simplified ratio: two characters per token, rounded up.
  return Math.ceil(totalChars / 2);
}
/**
 * Choose a granularity per group (mixed granularity) within a token budget.
 *
 * Groups are processed in ranked order. The starting granularity depends on
 * the rank: the top group gets the highest level ('digest' for overview
 * queries, otherwise 'full'), ranks 2-3 get the query type's default, and the
 * rest get 'summary'. The choice is then adapted to the group through
 * adjustByGroupFeatures.
 *
 * If a group does not fit the remaining budget, it is stepped down
 * full -> digest -> summary until it fits. If it does not fit even at the
 * lowest level, selection stops. Selection also stops once the group cap is
 * reached.
 *
 * @param {string} query - The user's query.
 * @param {Array} rankedGroups - Candidates sorted by relevance, most relevant first.
 *   Each entry is either a group or `{ group, score }`. A non-array is treated
 *   as empty and falsy entries are skipped.
 * @param {Object} options - Optional settings.
 * @param {number} [options.maxTokens=8000] - Token budget for all selected groups.
 * @param {number} [options.maxGroups] - Cap on selected groups; defaults to the
 *   query type's `maxGroups`, or 5 if the rule has none.
 * @returns {Array} [ { group, granularity, score, tokens } ] in ranked order.
 */
function selectMixedGranularity(query, rankedGroups = [], options = {}) {
  // Default parameters only cover `undefined`; guard explicit null / wrong types too.
  const ranked = Array.isArray(rankedGroups) ? rankedGroups : [];
  const opts = options || {};

  const queryType = analyzeQueryType(query);
  const baseRule = GRANULARITY_RULES[queryType] || GRANULARITY_RULES.specific;
  const maxTokens = opts.maxTokens || 8000;
  const maxGroups = opts.maxGroups || baseRule.maxGroups || 5;

  // Downgrade ladder, richest level first.
  const levels = ['full', 'digest', 'summary'];

  const result = [];
  let accumulatedTokens = 0;

  for (let i = 0; i < ranked.length; i++) {
    const item = ranked[i];
    if (!item) continue;

    const group = item.group || item;
    // Keep an explicit score of 0; only a missing/falsy score defaults to 1.0.
    const score = item.score === 0 ? 0 : (item.score || 1.0);

    // Higher-ranked groups start from a richer granularity.
    let granularity;
    if (i === 0) {
      granularity = queryType === 'overview' ? 'digest' : 'full';
    } else if (i < 3) {
      granularity = baseRule.default;
    } else {
      granularity = 'summary';
    }

    granularity = adjustByGroupFeatures(group, granularity);

    // Step down one level at a time while the group overflows the budget.
    let level = levels.indexOf(granularity);
    let tokens = estimateTokenUsage([group], granularity);
    while (
      accumulatedTokens + tokens > maxTokens &&
      level >= 0 &&
      level < levels.length - 1
    ) {
      level += 1;
      granularity = levels[level];
      tokens = estimateTokenUsage([group], granularity);
    }

    if (accumulatedTokens + tokens > maxTokens) {
      break; // does not fit even at the lowest level: the budget is exhausted
    }

    accumulatedTokens += tokens;
    result.push({
      group,
      granularity,
      score,
      tokens
    });

    if (result.length >= maxGroups) {
      break;
    }
  }

  return result;
}
/**
 * Render mixed-granularity selections into one context string.
 *
 * Text per selection, with fallbacks when the requested level is empty:
 * - 'summary': summary
 * - 'digest':  digest, then summary
 * - 'full':    fullText, then digest, then summary
 * Selections with an unknown granularity or no text are omitted. Each kept
 * selection becomes a section with a header line (group id and granularity),
 * a keyword line and the text; sections are separated by a blank line.
 *
 * @param {Array} selections - Result of selectMixedGranularity
 *   (entries with `group` and `granularity`).
 * @returns {string} Combined context text; '' for a non-array or empty list.
 */
function buildMixedContext(selections) {
  if (!Array.isArray(selections) || selections.length === 0) return '';

  // Resolve the text to show for a group at the requested level.
  const textFor = (group, level) => {
    switch (level) {
      case 'summary':
        return group.summary || '';
      case 'digest':
        return group.digest || group.summary || '';
      case 'full':
        return group.fullText || group.digest || group.summary || '';
      default:
        return '';
    }
  };

  const sections = selections.reduce((acc, selection) => {
    const group = selection.group;
    const level = selection.granularity;
    const body = textFor(group, level);

    if (body) {
      const keywordLine = (group.keywords || []).join('、');
      acc.push(`${group.groupId} - ${level}\n关键词: ${keywordLine}\n内容:\n${body}`);
    }
    return acc;
  }, []);

  return sections.join('\n\n');
}
// Public API, published on the `window` object passed into this wrapper.
const smartGranularitySelectorApi = {
  analyzeQueryType,
  selectGranularity,
  selectMixedGranularity,
  buildMixedContext,
  estimateTokenUsage,
  adjustByGroupFeatures,
  // The rule tables are exposed as well so they can be tested and tuned.
  GRANULARITY_RULES,
  QUERY_PATTERNS
};
window.SmartGranularitySelector = smartGranularitySelectorApi;

// Load-time trace in the console.
console.log('[SmartGranularitySelector] 智能粒度选择器已加载');
})(window);