// js/processing/reference-ai-processor.js
// Reference AI batch processor - uses an AI chat-completion API to extract
// structured metadata (authors, title, year, ...) from raw reference strings.

(function(global) {
    'use strict';

    /** Number of references sent to the AI in a single request. */
    const BATCH_SIZE = 10;

    /** Number of retries after the first attempt (so MAX_RETRIES + 1 attempts in total). */
    const MAX_RETRIES = 3;

    /** First back-off delay in milliseconds; doubled on every further attempt. */
    const BASE_DELAY_MS = 1000;

    /** Upper bound for a single back-off delay in milliseconds. */
    const MAX_DELAY_MS = 8000;

    /** A truncated response is split into two sub-batches only above this batch size. */
    const MIN_SPLIT_SIZE = 5;

    /** HTTP status codes that trigger a retry with back-off. */
    const RETRIABLE_STATUS = [401, 403, 408, 429, 500, 502, 503, 504];

    /**
     * Built-in provider presets. Presets without a bodyBuilder/responseExtractor
     * use the OpenAI-compatible chat-completions request/response shape.
     */
    const PRESET_CONFIGS = {
        'mistral': {
            endpoint: 'https://api.mistral.ai/v1/chat/completions',
            modelName: 'mistral-large-latest'
        },
        'deepseek': {
            endpoint: 'https://api.deepseek.com/v1/chat/completions',
            modelName: 'deepseek-chat'
        },
        'gemini': {
            endpoint: 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent',
            modelName: 'gemini-2.0-flash',
            bodyBuilder: (sys, user) => ({
                contents: [{
                    role: 'user',
                    parts: [{ text: `${sys}\n\n${user}` }]
                }],
                generationConfig: { temperature: 0.1, maxOutputTokens: 4000 }
            }),
            responseExtractor: (data) => data?.candidates?.[0]?.content?.parts?.[0]?.text
        }
    };

    /** Exponential back-off with up to 1s of random jitter, capped at MAX_DELAY_MS. */
    function computeBackoffDelay(attempt) {
        return Math.min(BASE_DELAY_MS * Math.pow(2, attempt) + Math.random() * 1000, MAX_DELAY_MS);
    }

    /** Resolves after `ms` milliseconds. */
    function sleep(ms) {
        return new Promise(resolve => setTimeout(resolve, ms));
    }

    /** Removes a surrounding markdown code fence (``` or ```json) from the model output. */
    function stripCodeFence(text) {
        const trimmed = text.trim();
        if (trimmed.startsWith('```json')) {
            return trimmed.replace(/^```json\s*/, '').replace(/```\s*$/, '');
        }
        if (trimmed.startsWith('```')) {
            return trimmed.replace(/^```\s*/, '').replace(/```\s*$/, '');
        }
        return trimmed;
    }

    /** Returns a shallow copy of `record` without null/undefined values. */
    function omitNullish(record) {
        return Object.fromEntries(
            Object.entries(record ?? {}).filter(([, value]) => value != null)
        );
    }

    /**
     * Checks the parsed model output against the expected batch size.
     * Accepts either `{ references: [...] }` or a bare array, and converts
     * numeric-string ids ("3") to numbers before comparing.
     * @param {*} parsed - Result of JSON.parse on the model output.
     * @param {number} expectedCount - Number of references that were sent.
     * @returns {{refs: Array, countMismatch: boolean, ids: Array, expectedIds: number[], missingIds: number[], extraIds: Array}}
     */
    function checkExtractedRefs(parsed, expectedCount) {
        const candidate = Array.isArray(parsed) ? parsed : parsed?.references;
        const refs = (Array.isArray(candidate) ? candidate : []).map(ref => {
            const isNumericString = typeof ref?.id === 'string' && /^\d+$/.test(ref.id.trim());
            return isNumericString ? { ...ref, id: Number(ref.id) } : ref;
        });

        const expectedIds = Array.from({ length: expectedCount }, (_, i) => i);
        const ids = refs.map(ref => ref?.id);
        const seen = new Set(ids);
        const expected = new Set(expectedIds);

        return {
            refs,
            countMismatch: refs.length !== expectedCount,
            ids,
            expectedIds,
            missingIds: expectedIds.filter(id => !seen.has(id)),
            extraIds: ids.filter(id => !expected.has(id))
        };
    }

    /**
     * Builds the system/user prompt pair used to extract reference metadata.
     * The user message is a JSON array of `{ id, raw }` objects so the model
     * can map every answer back to its input.
     * @param {string[]} references - Raw reference strings of one batch.
     * @param {string} [sourceLang='auto'] - Language hint; 'auto' adds no hint.
     * @returns {{system: string, user: string}} Prompt texts.
     */
    function generateExtractionPrompt(references, sourceLang = 'auto') {
        const langHint = sourceLang !== 'auto' ? `注意:文献可能是${sourceLang}语言。` : '';
        const inputJson = references.map((ref, idx) => ({ id: idx, raw: ref }));

        const rules = [
            '- 提取所有能识别的字段(authors, title, year, journal, volume, issue, pages, doi, url等)',
            '- 无法提取的字段设为null',
            '- 保持原文格式'
        ];
        // Only add the language bullet when there is a hint; otherwise the
        // prompt would contain an empty "- " line.
        if (langHint) {
            rules.push(`- ${langHint}`);
        }
        rules.push('- 只返回JSON,不要任何其他文字');

        const system = [
            '你是专业的文献信息提取助手。从参考文献中提取结构化信息,返回JSON格式。',
            '',
            '输入格式:',
            '[',
            '  {"id": 0, "raw": "文献原始文本"},',
            '  {"id": 1, "raw": "文献原始文本"}',
            ']',
            '',
            '返回格式:',
            '{',
            '  "references": [',
            '    {',
            '      "id": 0,',
            '      "authors": ["作者列表"],',
            '      "title": "标题",',
            '      "year": 2023,',
            '      "journal": "期刊",',
            '      "doi": "DOI",',
            '      "url": "链接"',
            '    }',
            '  ]',
            '}',
            '',
            '提取规则:',
            ...rules,
            '',
            '⚠️ 严格要求:',
            `- 必须返回 ${references.length} 条文献,每条文献的 id 必须与输入一一对应(0 到 ${references.length - 1})`,
            '- 不要遗漏任何一条(检查 id 是否连续)',
            '- 不要添加额外的文献',
            '- 不要编造不存在的信息',
            '- 保持 id 顺序与输入完全一致'
        ].join('\n');

        return {
            system,
            user: JSON.stringify(inputJson, null, 2)
        };
    }

    /**
     * Sends one batch to the AI API and returns the validated extraction result.
     * Retries with exponential back-off on retriable HTTP statuses, network
     * errors, empty content, malformed JSON and count/id mismatches. A response
     * truncated by the token limit is split into two sub-batches when the batch
     * has more than MIN_SPLIT_SIZE entries.
     * @param {string[]} references - Raw reference strings of one batch.
     * @param {Object} apiConfig - Config as produced by buildAPIConfig.
     * @param {string} [sourceLang='auto'] - Language hint for the prompt.
     * @returns {Promise<Object[]>} Extracted references sorted by id (0..N-1).
     * @throws {Error} When all attempts fail or the error is not retriable.
     */
    async function callAIExtraction(references, apiConfig, sourceLang = 'auto') {
        for (let attempt = 0; attempt <= MAX_RETRIES; attempt++) {
            const canRetry = attempt < MAX_RETRIES;
            const attemptLabel = `${attempt + 1}/${MAX_RETRIES + 1}`;

            try {
                const prompt = generateExtractionPrompt(references, sourceLang);
                const requestBody = apiConfig.bodyBuilder
                    ? apiConfig.bodyBuilder(prompt.system, prompt.user)
                    : {
                        model: apiConfig.modelName,
                        messages: [
                            { role: 'system', content: prompt.system },
                            { role: 'user', content: prompt.user }
                        ],
                        temperature: 0.1
                    };
                const headers = apiConfig.headers || {};

                if (attempt === 0) {
                    // Log header names only: the values may contain the API key.
                    console.log('[ReferenceAIProcessor] 请求详情:', {
                        endpoint: apiConfig.endpoint,
                        model: apiConfig.modelName,
                        hasApiKey: !!apiConfig.apiKey,
                        headerNames: Object.keys(headers),
                        bodyPreview: {
                            model: requestBody.model,
                            messagesCount: requestBody.messages?.length,
                            temperature: requestBody.temperature
                        }
                    });
                }

                const response = await fetch(apiConfig.endpoint, {
                    method: 'POST',
                    headers: headers,
                    body: JSON.stringify(requestBody)
                });

                console.log(`[ReferenceAIProcessor] 响应状态 (尝试 ${attemptLabel}):`, response.status, response.statusText);

                if (!response.ok) {
                    const errorText = await response.text();
                    const isRetriable = RETRIABLE_STATUS.includes(response.status);
                    console.error('[ReferenceAIProcessor] API错误响应:', {
                        status: response.status,
                        statusText: response.statusText,
                        preview: errorText.substring(0, 500),
                        retriable: isRetriable
                    });

                    if (isRetriable && canRetry) {
                        const delay = computeBackoffDelay(attempt);
                        console.log(`[ReferenceAIProcessor] 等待 ${Math.round(delay)}ms 后重试...`);
                        await sleep(delay);
                        continue;
                    }
                    throw new Error(`API请求失败 (${response.status}): ${response.statusText}`);
                }

                const responseText = await response.text();
                if (attempt === 0) {
                    console.log('[ReferenceAIProcessor] 原始响应前500字符:', responseText.substring(0, 500));
                }

                // NOTE(review): the original text between the HTML check and the
                // truncation branch was lost when the file was mangled. The HTML
                // detection, envelope parsing and truncation detection below are a
                // reconstruction - confirm against version control.
                const responseHead = responseText.trim().toLowerCase();
                if (responseHead.startsWith('<!doctype') || responseHead.startsWith('<html')) {
                    throw new Error('API返回了HTML页面而不是JSON,请检查API端点配置');
                }

                let data;
                try {
                    data = JSON.parse(responseText);
                } catch (envelopeError) {
                    throw new Error(`API响应不是有效的JSON: ${envelopeError.message}`);
                }

                // OpenAI-compatible APIs report truncation as finish_reason "length";
                // Gemini is assumed to report finishReason "MAX_TOKENS" - TODO confirm.
                const isTruncated = data?.choices?.[0]?.finish_reason === 'length'
                    || data?.candidates?.[0]?.finishReason === 'MAX_TOKENS';

                if (isTruncated) {
                    if (references.length > MIN_SPLIT_SIZE) {
                        console.warn(`[ReferenceAIProcessor] 批次太大 (${references.length} 条),拆分为 2 个子批次重试...`);
                        const mid = Math.ceil(references.length / 2);
                        const [firstHalf, secondHalf] = await Promise.all([
                            callAIExtraction(references.slice(0, mid), apiConfig, sourceLang),
                            callAIExtraction(references.slice(mid), apiConfig, sourceLang)
                        ]);
                        // Each sub-batch numbers its ids from 0, so shift the second
                        // half to keep ids unique and aligned with this batch's input.
                        return [
                            ...firstHalf,
                            ...secondHalf.map(ref => ({ ...ref, id: ref.id + mid }))
                        ];
                    }
                    if (canRetry) {
                        const delay = computeBackoffDelay(attempt);
                        console.warn(`[ReferenceAIProcessor] 批次较小 (${references.length} 条),等待 ${Math.round(delay)}ms 后重试...`);
                        await sleep(delay);
                        continue;
                    }
                    throw new Error(`响应被截断(finish_reason: length),批次大小已最小 (${references.length} 条),无法继续拆分`);
                }

                const extractedText = apiConfig.responseExtractor
                    ? apiConfig.responseExtractor(data)
                    : data?.choices?.[0]?.message?.content;

                // Non-string content is treated like empty content so it goes through
                // the content retry path instead of raising a TypeError.
                if (typeof extractedText !== 'string' || !extractedText) {
                    console.error('[ReferenceAIProcessor] 响应数据:', data);
                    if (canRetry) {
                        console.warn(`[ReferenceAIProcessor] 内容为空,尝试 ${attemptLabel},将重试...`);
                        await sleep(computeBackoffDelay(attempt));
                        continue;
                    }
                    throw new Error('API返回的内容为空');
                }

                const cleanText = stripCodeFence(extractedText);

                let parsed;
                try {
                    parsed = JSON.parse(cleanText);
                } catch (parseError) {
                    console.error('[ReferenceAIProcessor] JSON解析失败:', parseError);
                    console.error('[ReferenceAIProcessor] 原始内容:', cleanText.substring(0, 1000));
                    if (canRetry) {
                        console.warn(`[ReferenceAIProcessor] JSON格式错误,尝试 ${attemptLabel},将重试...`);
                        await sleep(computeBackoffDelay(attempt));
                        continue;
                    }
                    throw parseError;
                }

                const check = checkExtractedRefs(parsed, references.length);

                // Validation 1: the number of returned references must match the input.
                if (check.countMismatch) {
                    console.error(`[ReferenceAIProcessor] 数量不匹配: 输入 ${references.length} 条,AI 返回 ${check.refs.length} 条`);
                    console.error('[ReferenceAIProcessor] 输入文献:', references.map((r, i) => `[${i}] ${String(r).substring(0, 100)}...`));
                    console.error('[ReferenceAIProcessor] AI 返回:', check.refs.map((r, i) => `[${i}] id=${r?.id} ${r?.title?.substring?.(0, 100)}`));
                    if (canRetry) {
                        console.warn(`[ReferenceAIProcessor] 数量不匹配(可能幻觉),尝试 ${attemptLabel},将重试...`);
                        await sleep(computeBackoffDelay(attempt));
                        continue;
                    }
                    throw new Error(`AI 返回数量错误:期望 ${references.length} 条,实际 ${check.refs.length} 条(可能产生幻觉)`);
                }

                // Validation 2: ids must be exactly 0..N-1.
                if (check.missingIds.length > 0 || check.extraIds.length > 0) {
                    console.error(`[ReferenceAIProcessor] ID 不匹配:`);
                    if (check.missingIds.length > 0) {
                        console.error(`  缺失 ID: ${check.missingIds.join(', ')}`);
                    }
                    if (check.extraIds.length > 0) {
                        console.error(`  额外 ID: ${check.extraIds.join(', ')}`);
                    }
                    console.error('[ReferenceAIProcessor] AI 返回的 ID:', check.ids.join(', '));
                    console.error('[ReferenceAIProcessor] 期望的 ID:', check.expectedIds.join(', '));
                    if (canRetry) {
                        console.warn(`[ReferenceAIProcessor] ID 不连续(可能幻觉或遗漏),尝试 ${attemptLabel},将重试...`);
                        await sleep(computeBackoffDelay(attempt));
                        continue;
                    }
                    throw new Error(`AI 返回的 ID 不连续:缺失 [${check.missingIds.join(', ')}],额外 [${check.extraIds.join(', ')}]`);
                }

                // Validation 3: return the references in input order.
                const ordered = [...check.refs].sort((a, b) => a.id - b.id);
                console.log(`[ReferenceAIProcessor] 成功提取 ${ordered.length} 条文献(验证通过:数量✓ ID连续✓)`);
                return ordered;
            } catch (error) {
                // fetch() rejects with a TypeError on network failure; retry those too.
                const message = String(error?.message ?? '');
                if (canRetry && (error?.name === 'TypeError' || message.includes('fetch'))) {
                    const delay = computeBackoffDelay(attempt);
                    console.error(`[ReferenceAIProcessor] 网络错误,尝试 ${attemptLabel},等待 ${Math.round(delay)}ms 后重试:`, message);
                    await sleep(delay);
                    continue;
                }
                console.error(`[ReferenceAIProcessor] AI extraction failed after ${attempt + 1} attempts:`, error);
                throw error;
            }
        }

        throw new Error(`AI提取失败:已尝试 ${MAX_RETRIES + 1} 次`);
    }

    /**
     * Processes references in batches of BATCH_SIZE; all batches run concurrently.
     * A batch that fails is replaced by fallback records
     * `{ index, rawText, extractedBy: 'fallback', error }`, so this function
     * resolves even when individual batches fail.
     * @param {string[]} references - Raw reference strings.
     * @param {Object} apiConfig - API configuration.
     * @param {string} [sourceLang='auto'] - Source language hint.
     * @param {Function|null} [progressCallback=null] - Called after every finished
     *   batch with `{ processed, total, batchIndex, totalBatches }`.
     * @returns {Promise<Object[]>} Results in input order.
     */
    async function batchProcessReferences(references, apiConfig, sourceLang = 'auto', progressCallback = null) {
        if (!Array.isArray(references) || references.length === 0) {
            return [];
        }

        const batches = [];
        for (let i = 0; i < references.length; i += BATCH_SIZE) {
            batches.push(references.slice(i, i + BATCH_SIZE));
        }

        console.log(`[ReferenceAIProcessor] Processing ${references.length} references in ${batches.length} batches (${BATCH_SIZE} per batch)`);

        let processedCount = 0;
        const reportProgress = (batch, batchIndex) => {
            processedCount += batch.length;
            if (typeof progressCallback !== 'function') {
                return;
            }
            // A failing UI callback must not discard a successfully extracted batch.
            try {
                progressCallback({
                    processed: processedCount,
                    total: references.length,
                    batchIndex: batchIndex,
                    totalBatches: batches.length
                });
            } catch (callbackError) {
                console.error('[ReferenceAIProcessor] progressCallback threw:', callbackError);
            }
        };

        const batchResults = await Promise.all(batches.map(async (batch, batchIndex) => {
            let outcome;
            try {
                outcome = await callAIExtraction(batch, apiConfig, sourceLang);
            } catch (error) {
                console.error(`[ReferenceAIProcessor] Batch ${batchIndex} failed:`, error);
                // Fall back to the raw text so the caller still gets one record per input.
                outcome = batch.map((ref, idx) => ({
                    index: batchIndex * BATCH_SIZE + idx,
                    rawText: ref,
                    extractedBy: 'fallback',
                    error: error?.message ?? String(error)
                }));
            }
            // Report finished batches whether they succeeded or fell back, so
            // `processed` can reach `total`.
            reportProgress(batch, batchIndex);
            return outcome;
        }));

        return batchResults.flat();
    }

    /**
     * Completes regex-extracted entries with AI extraction where needed.
     * Entries flagged `needsAIProcessing` are sent to the AI; the others are
     * kept and tagged `extractedBy: 'regex'`.
     * @param {Object[]} entries - Entries with `rawText`, optional `index` and
     *   `needsAIProcessing`.
     * @param {Object} apiConfig - API configuration.
     * @param {string} [sourceLang='auto'] - Source language hint.
     * @param {Function|null} [progressCallback=null] - Progress callback.
     * @param {Object} [options={}] - Extra options `{ enrichWithDOI: boolean }`.
     * @returns {Promise<Object[]>} Processed entries.
     */
    async function smartProcessReferences(entries, apiConfig, sourceLang = 'auto', progressCallback = null, options = {}) {
        if (!entries || !Array.isArray(entries)) {
            return [];
        }

        const needsAI = entries.filter(e => e.needsAIProcessing);
        const alreadyExtracted = entries.filter(e => !e.needsAIProcessing);

        console.log(`[ReferenceAIProcessor] ${alreadyExtracted.length} references extracted by regex, ${needsAI.length} need AI processing`);

        let finalResults;
        if (needsAI.length === 0) {
            finalResults = entries.map(e => ({ ...e, extractedBy: 'regex' }));
        } else {
            const aiResults = await batchProcessReferences(
                needsAI.map(e => e.rawText),
                apiConfig,
                sourceLang,
                progressCallback
            );

            finalResults = alreadyExtracted.map(e => ({ ...e, extractedBy: 'regex' }));
            aiResults.forEach((aiResult, idx) => {
                const original = needsAI[idx];
                const merged = {
                    ...original,
                    // Null AI fields mean "not found"; they must not erase values
                    // the entry already has.
                    ...omitNullish(aiResult),
                    extractedBy: 'ai',
                    confidence: aiResult?.error ? 0 : 0.8
                };
                // `index`/`id` in the AI result are relative to the AI subset or
                // batch; keep the entry's own values so the final sort is right.
                if (original.index !== undefined) {
                    merged.index = original.index;
                }
                if (original.id !== undefined) {
                    merged.id = original.id;
                }
                finalResults.push(merged);
            });

            finalResults.sort((a, b) => (a.index || 0) - (b.index || 0));
        }

        // Optional: fill in missing DOIs. Enrichment is best effort - a failure
        // here must not discard the extraction results.
        if (options?.enrichWithDOI && typeof global.DOIResolver !== 'undefined') {
            console.log('[ReferenceAIProcessor] Enriching with DOI information...');
            try {
                finalResults = await enrichWithDOI(finalResults, progressCallback);
            } catch (doiError) {
                console.error('[ReferenceAIProcessor] DOI enrichment failed, returning un-enriched results:', doiError);
            }
        }

        return finalResults;
    }

    /**
     * Looks up DOIs for references that have a title but no DOI, using
     * `global.DOIResolver`, and merges the resolved data into copies of them.
     * @param {Object[]} references - Reference records.
     * @param {Function|null} [progressCallback=null] - Called with
     *   `{ phase: 'doi-enrichment', completed, total, current }`.
     * @returns {Promise<Object[]>} References, enriched where a DOI was resolved.
     */
    async function enrichWithDOI(references, progressCallback = null) {
        if (!global.DOIResolver) {
            console.warn('[ReferenceAIProcessor] DOIResolver not available, skipping DOI enrichment');
            return references;
        }
        if (!Array.isArray(references)) {
            return references;
        }

        const needsDOI = references.filter(ref => !ref.doi && ref.title);

        if (needsDOI.length === 0) {
            console.log('[ReferenceAIProcessor] All references already have DOI');
            return references;
        }

        console.log(`[ReferenceAIProcessor] Querying DOI for ${needsDOI.length} references`);

        const resolver = global.DOIResolver.create({
            queryOrder: ['crossref', 'openalex', 'pubmed'],
            timeout: 5000
        });

        const doiResults = await resolver.batchResolve(needsDOI, (progress) => {
            if (progressCallback) {
                progressCallback({
                    phase: 'doi-enrichment',
                    completed: progress.completed,
                    total: progress.total,
                    current: progress.current
                });
            }
        });

        // Index results by the reference object they belong to (first result wins).
        const resultByOriginal = new Map();
        for (const result of Array.isArray(doiResults) ? doiResults : []) {
            if (result && !resultByOriginal.has(result.original)) {
                resultByOriginal.set(result.original, result);
            }
        }

        const enrichedReferences = references.map(ref => {
            if (ref.doi) {
                return ref;
            }
            const resolved = resultByOriginal.get(ref)?.resolved;
            if (!resolved) {
                return ref;
            }
            return {
                ...ref,
                doi: resolved.doi,
                url: resolved.url || ref.url,
                // Use the resolver's data only for fields the reference is missing.
                authors: ref.authors || resolved.authors,
                year: ref.year || resolved.year,
                journal: ref.journal || resolved.journal,
                doiSource: resolved.source,
                doiConfidence: resolved.confidence
            };
        });

        const successCount = enrichedReferences.filter(r => r.doi).length;
        console.log(`[ReferenceAIProcessor] DOI enrichment complete: ${successCount}/${references.length} now have DOI`);

        return enrichedReferences;
    }

    /**
     * Builds the request configuration for a preset or custom model.
     * When `apiKey` is given it is placed in the request headers:
     * `Authorization: Bearer <key>` for OpenAI-compatible endpoints and
     * `x-goog-api-key` for Gemini.
     * @param {string} model - 'mistral', 'deepseek', 'gemini' or 'custom'.
     * @param {string} apiKey - API key.
     * @param {Object|null} [modelConfig=null] - For 'custom':
     *   `{ apiEndpoint | apiBaseUrl, modelId, max_tokens? }`.
     * @returns {{endpoint: string, modelName: string, apiKey: string, headers: Object, bodyBuilder: Function, responseExtractor: Function}}
     * @throws {Error} When `model` is not a known preset.
     */
    function buildAPIConfig(model, apiKey, modelConfig = null) {
        const bearerHeaders = {
            'Content-Type': 'application/json',
            ...(apiKey ? { 'Authorization': `Bearer ${apiKey}` } : {})
        };

        if (model === 'custom' && modelConfig) {
            return {
                endpoint: modelConfig.apiEndpoint || modelConfig.apiBaseUrl,
                modelName: modelConfig.modelId,
                apiKey: apiKey,
                headers: bearerHeaders,
                bodyBuilder: (sys, user) => ({
                    model: modelConfig.modelId,
                    messages: [
                        { role: 'system', content: sys },
                        { role: 'user', content: user }
                    ],
                    temperature: 0.1,
                    max_tokens: modelConfig.max_tokens || 4000
                }),
                responseExtractor: (data) => data?.choices?.[0]?.message?.content
            };
        }

        // Own-property check so inherited keys such as "constructor" are rejected.
        const isKnownPreset = typeof model === 'string'
            && Object.prototype.hasOwnProperty.call(PRESET_CONFIGS, model);
        if (!isKnownPreset) {
            throw new Error(`Unsupported model: ${model}`);
        }
        const config = PRESET_CONFIGS[model];

        const headers = model === 'gemini'
            ? {
                'Content-Type': 'application/json',
                ...(apiKey ? { 'x-goog-api-key': apiKey } : {})
            }
            : bearerHeaders;

        return {
            ...config,
            apiKey: apiKey,
            headers: headers,
            bodyBuilder: config.bodyBuilder || ((sys, user) => ({
                model: config.modelName,
                messages: [
                    { role: 'system', content: sys },
                    { role: 'user', content: user }
                ],
                temperature: 0.1
            })),
            responseExtractor: config.responseExtractor || ((data) => data?.choices?.[0]?.message?.content)
        };
    }

    // Public API
    global.ReferenceAIProcessor = {
        batchProcessReferences,
        smartProcessReferences,
        enrichWithDOI,
        generateExtractionPrompt,
        buildAPIConfig,
        BATCH_SIZE,
        version: '1.1.0'
    };

    console.log('[ReferenceAIProcessor] Reference AI processor loaded.');

// Fall back to globalThis so the script also loads where `window` does not exist.
})(typeof window !== 'undefined' ? window : globalThis);