// paper-burner/js/processing/reference-ai-processor.js

// js/processing/reference-ai-processor.js
// Reference AI batch processor - uses AI to extract reference metadata

(function(global) {
    'use strict';

    /**
     * Batch size: the number of references grouped into each batch that is
     * handed to the AI extraction call.
     */
    const BATCH_SIZE = 10;

    /**
     * Build the system/user prompt pair used to extract reference metadata.
     *
     * The system prompt describes the input/output JSON shapes and the
     * extraction rules; the user prompt is the JSON-serialized list of
     * `{id, raw}` objects, where `id` is the reference's position in the
     * input array.
     *
     * @param {Array} references - Raw reference texts.
     * @param {string} [sourceLang='auto'] - Source language hint. 'auto' (or an
     *     empty/nullish value) adds no language rule to the prompt.
     * @returns {{system: string, user: string}} The prompt pair.
     */
    function generateExtractionPrompt(references, sourceLang = 'auto') {
        // Structured JSON input makes it easier for the model to map each
        // answer back to the reference it came from.
        const inputJson = references.map((ref, idx) => ({
            id: idx,
            raw: ref
        }));

        const extractionRules = [
            '- 提取所有能识别的字段（authors, title, year, journal, volume, issue, pages, doi, url等）',
            '- 无法提取的字段设为null',
            '- 保持原文格式'
        ];
        // Only emit the language rule when there is an actual hint; otherwise
        // the prompt would contain a dangling empty bullet.
        if (sourceLang && sourceLang !== 'auto') {
            extractionRules.push(`- 注意：文献可能是${sourceLang}语言。`);
        }
        extractionRules.push('- 只返回JSON，不要任何其他文字');

        return {
            system: `你是专业的文献信息提取助手。从参考文献中提取结构化信息，返回JSON格式。

输入格式：
[
  {"id": 0, "raw": "文献原始文本"},
  {"id": 1, "raw": "文献原始文本"}
]

返回格式：
{
  "references": [
    {
      "id": 0,
      "authors": ["作者列表"],
      "title": "标题",
      "year": 2023,
      "journal": "期刊",
      "doi": "DOI",
      "url": "链接"
    }
  ]
}

提取规则：
${extractionRules.join('\n')}

⚠️ 严格要求：
- 必须返回 ${references.length} 条文献，每条文献的 id 必须与输入一一对应（0 到 ${references.length - 1}）
- 不要遗漏任何一条（检查 id 是否连续）
- 不要添加额外的文献
- 不要编造不存在的信息
- 保持 id 顺序与输入完全一致`,

            user: JSON.stringify(inputJson, null, 2)
        };
    }

    /**
     * Call the AI API to extract metadata for one batch of references, retrying
     * with exponential backoff.
     *
     * Behavior, per attempt (up to `maxRetries + 1` attempts):
     * - Builds the prompt via `generateExtractionPrompt` and POSTs it to
     *   `apiConfig.endpoint` (body from `apiConfig.bodyBuilder` when provided,
     *   otherwise a chat-completions style body).
     * - Non-OK HTTP statuses in the retriable list are retried after a delay;
     *   other statuses throw.
     * - If the response reports truncation and the batch has more than 5
     *   references, the batch is split in two and both halves are processed
     *   recursively; smaller batches are simply retried.
     * - The extracted JSON must contain exactly one entry per input reference
     *   with ids 0..N-1; otherwise the attempt is retried.
     *
     * @param {Array} references - Raw reference texts for this batch.
     * @param {Object} apiConfig - `{endpoint, modelName, headers, bodyBuilder?, responseExtractor?}`.
     * @param {string} [sourceLang='auto'] - Source language hint passed to the prompt.
     * @returns {Promise<Array>} Extracted references sorted by `id`, where `id`
     *     is the position of the reference within `references`.
     * @throws {Error} When the request, parsing, or validation still fails after
     *     the last attempt.
     */
    async function callAIExtraction(references, apiConfig, sourceLang = 'auto') {
        const maxRetries = 3;
        const baseDelay = 1000;
        const maxDelay = 8000;

        // Exponential backoff plus up to 1s of random jitter, capped at maxDelay.
        const nextDelay = (attempt) =>
            Math.min(baseDelay * Math.pow(2, attempt) + Math.random() * 1000, maxDelay);
        const pause = (ms) => new Promise(resolve => setTimeout(resolve, ms));

        for (let attempt = 0; attempt <= maxRetries; attempt++) {
            const canRetry = attempt < maxRetries;

            try {
                const prompt = generateExtractionPrompt(references, sourceLang);
                const requestBody = apiConfig.bodyBuilder
                    ? apiConfig.bodyBuilder(prompt.system, prompt.user)
                    : {
                        model: apiConfig.modelName,
                        messages: [
                            { role: "system", content: prompt.system },
                            { role: "user", content: prompt.user }
                        ],
                        temperature: 0.1
                    };

                if (attempt === 0) {
                    console.log('[ReferenceAIProcessor] 请求详情:', {
                        endpoint: apiConfig.endpoint,
                        model: apiConfig.modelName,
                        hasApiKey: !!apiConfig.apiKey,
                        // Header names only: values may carry credentials
                        // (e.g. Authorization) and must not end up in logs.
                        headers: Object.keys(apiConfig.headers || {}),
                        bodyPreview: {
                            model: requestBody.model,
                            messagesCount: requestBody.messages?.length,
                            temperature: requestBody.temperature
                        }
                    });
                }

                const response = await fetch(apiConfig.endpoint, {
                    method: 'POST',
                    headers: apiConfig.headers || {},
                    body: JSON.stringify(requestBody)
                });

                console.log(`[ReferenceAIProcessor] 响应状态 (尝试 ${attempt + 1}/${maxRetries + 1}):`, response.status, response.statusText);

                if (!response.ok) {
                    const errorText = await response.text();
                    // NOTE(review): 401/403 are treated as retriable here; kept
                    // as-is because callers may rely on it - confirm intent.
                    const isRetriable = [401, 403, 408, 429, 500, 502, 503, 504].includes(response.status);

                    console.error('[ReferenceAIProcessor] API错误响应:', {
                        status: response.status,
                        statusText: response.statusText,
                        preview: errorText.substring(0, 500),
                        retriable: isRetriable
                    });

                    if (isRetriable && canRetry) {
                        const delay = nextDelay(attempt);
                        console.log(`[ReferenceAIProcessor] 等待 ${Math.round(delay)}ms 后重试...`);
                        await pause(delay);
                        continue;
                    }

                    throw new Error(`API请求失败 (${response.status}): ${response.statusText}`);
                }

                const responseText = await response.text();
                if (attempt === 0) {
                    console.log('[ReferenceAIProcessor] 原始响应前500字符:', responseText.substring(0, 500));
                }

                // An HTML page instead of JSON usually means a wrong endpoint or key.
                const responseHead = responseText.trim().toLowerCase();
                if (responseHead.startsWith('<!doctype') || responseHead.startsWith('<html')) {
                    console.error('[ReferenceAIProcessor] API返回了HTML页面而不是JSON');
                    throw new Error('API返回HTML而非JSON，请检查端点配置和API Key');
                }

                const data = JSON.parse(responseText);

                // Detect output cut off by the token limit, for both the
                // chat-completions shape and the Gemini generateContent shape.
                const wasTruncated =
                    data?.choices?.[0]?.finish_reason === 'length' ||
                    data?.candidates?.[0]?.finishReason === 'MAX_TOKENS';

                if (wasTruncated) {
                    console.warn(`[ReferenceAIProcessor] 响应被截断 (finish_reason: length)，尝试 ${attempt + 1}/${maxRetries + 1}`);

                    if (references.length > 5) {
                        // Large batch: split in two and process both halves recursively.
                        console.warn(`[ReferenceAIProcessor] 批次太大 (${references.length} 条)，拆分为 2 个子批次重试...`);
                        const mid = Math.ceil(references.length / 2);

                        const [firstHalf, secondHalf] = await Promise.all([
                            callAIExtraction(references.slice(0, mid), apiConfig, sourceLang),
                            callAIExtraction(references.slice(mid), apiConfig, sourceLang)
                        ]);

                        // Each half returns ids relative to its own sub-batch;
                        // shift the second half so the merged ids are again
                        // 0..N-1 relative to this call's `references`.
                        return [
                            ...firstHalf,
                            ...secondHalf.map(ref => ({ ...ref, id: ref.id + mid }))
                        ];
                    }

                    if (canRetry) {
                        // Batch is already small: wait and retry as-is.
                        const delay = nextDelay(attempt);
                        console.warn(`[ReferenceAIProcessor] 批次较小 (${references.length} 条)，等待 ${Math.round(delay)}ms 后重试...`);
                        await pause(delay);
                        continue;
                    }

                    throw new Error(`响应被截断（finish_reason: length），批次大小已最小 (${references.length} 条)，无法继续拆分`);
                }

                const extractedText = apiConfig.responseExtractor
                    ? apiConfig.responseExtractor(data)
                    : data?.choices?.[0]?.message?.content;

                if (!extractedText) {
                    console.error('[ReferenceAIProcessor] 响应数据:', data);

                    if (canRetry) {
                        console.warn(`[ReferenceAIProcessor] 内容为空，尝试 ${attempt + 1}/${maxRetries + 1}，将重试...`);
                        await pause(nextDelay(attempt));
                        continue;
                    }

                    throw new Error('API返回的内容为空');
                }

                // Strip an optional markdown code fence around the JSON payload.
                let cleanText = String(extractedText).trim();
                if (cleanText.startsWith('```')) {
                    cleanText = cleanText.replace(/^```(?:json)?\s*/, '').replace(/```\s*$/, '');
                }

                // Parse separately from validation so that validation failures
                // are not reported as JSON parse failures.
                let parsed;
                try {
                    parsed = JSON.parse(cleanText);
                } catch (parseError) {
                    console.error('[ReferenceAIProcessor] JSON解析失败:', parseError);
                    console.error('[ReferenceAIProcessor] 原始内容:', cleanText.substring(0, 1000));

                    if (canRetry) {
                        console.warn(`[ReferenceAIProcessor] JSON格式错误，尝试 ${attempt + 1}/${maxRetries + 1}，将重试...`);
                        await pause(nextDelay(attempt));
                        continue;
                    }

                    throw parseError;
                }

                const extractedRefs = Array.isArray(parsed?.references) ? parsed.references : [];

                // Validation 1: the number of entries must match the input.
                if (extractedRefs.length !== references.length) {
                    console.error(`[ReferenceAIProcessor] 数量不匹配: 输入 ${references.length} 条，AI 返回 ${extractedRefs.length} 条`);
                    console.error('[ReferenceAIProcessor] 输入文献:', references.map((r, i) => `[${i}] ${String(r).substring(0, 100)}...`));
                    console.error('[ReferenceAIProcessor] AI 返回:', extractedRefs.map((r, i) => {
                        const title = typeof r?.title === 'string' ? r.title.substring(0, 100) : r?.title;
                        return `[${i}] id=${r?.id} ${title}`;
                    }));

                    if (canRetry) {
                        console.warn(`[ReferenceAIProcessor] 数量不匹配（可能幻觉），尝试 ${attempt + 1}/${maxRetries + 1}，将重试...`);
                        await pause(nextDelay(attempt));
                        continue;
                    }

                    throw new Error(`AI 返回数量错误：期望 ${references.length} 条，实际 ${extractedRefs.length} 条（可能产生幻觉）`);
                }

                // Validation 2: ids must be exactly 0..N-1.
                const returnedIds = extractedRefs.map(r => r?.id);
                const returnedIdSet = new Set(returnedIds);
                const expectedIds = Array.from({ length: references.length }, (_, i) => i);
                const missingIds = expectedIds.filter(id => !returnedIdSet.has(id));
                const extraIds = returnedIds.filter(
                    id => !Number.isInteger(id) || id < 0 || id >= references.length
                );

                if (missingIds.length > 0 || extraIds.length > 0) {
                    const sortedIds = [...returnedIds].sort((a, b) => a - b);

                    console.error(`[ReferenceAIProcessor] ID 不匹配:`);
                    if (missingIds.length > 0) {
                        console.error(`  缺失 ID: ${missingIds.join(', ')}`);
                    }
                    if (extraIds.length > 0) {
                        console.error(`  额外 ID: ${extraIds.join(', ')}`);
                    }
                    console.error('[ReferenceAIProcessor] AI 返回的 ID:', sortedIds.join(', '));
                    console.error('[ReferenceAIProcessor] 期望的 ID:', expectedIds.join(', '));

                    if (canRetry) {
                        console.warn(`[ReferenceAIProcessor] ID 不连续（可能幻觉或遗漏），尝试 ${attempt + 1}/${maxRetries + 1}，将重试...`);
                        await pause(nextDelay(attempt));
                        continue;
                    }

                    throw new Error(`AI 返回的 ID 不连续：缺失 [${missingIds.join(', ')}]，额外 [${extraIds.join(', ')}]`);
                }

                // Validation 3: return entries in input order.
                extractedRefs.sort((a, b) => a.id - b.id);

                console.log(`[ReferenceAIProcessor] 成功提取 ${extractedRefs.length} 条文献（验证通过：数量✓ ID连续✓）`);
                return extractedRefs;

            } catch (error) {
                // fetch() reports network failures as TypeError; retry those too.
                const looksLikeNetworkError =
                    error?.name === 'TypeError' || String(error?.message ?? '').includes('fetch');

                if (canRetry && looksLikeNetworkError) {
                    const delay = nextDelay(attempt);
                    console.error(`[ReferenceAIProcessor] 网络错误，尝试 ${attempt + 1}/${maxRetries + 1}，等待 ${Math.round(delay)}ms 后重试:`, error?.message);
                    await pause(delay);
                    continue;
                }

                console.error(`[ReferenceAIProcessor] AI extraction failed after ${attempt + 1} attempts:`, error);
                throw error;
            }
        }

        // Safety net: every path in the final attempt returns or throws above.
        throw new Error(`AI提取失败：已尝试 ${maxRetries + 1} 次`);
    }

    /**
     * Process references in batches of `BATCH_SIZE`, running all batches
     * concurrently.
     *
     * A batch whose extraction fails does not reject the whole call: its
     * references are returned as fallback records
     * `{index, rawText, extractedBy: 'fallback', error}`. Progress is reported
     * once per finished batch, whether it succeeded or fell back, and errors
     * thrown by `progressCallback` are logged instead of discarding results.
     *
     * @param {Array} references - Reference entries (raw text).
     * @param {Object} apiConfig - API configuration passed to the extractor.
     * @param {string} [sourceLang='auto'] - Source language hint.
     * @param {Function} [progressCallback=null] - Called with
     *     `{processed, total, batchIndex, totalBatches}` after each batch.
     * @returns {Promise<Array>} Results for all batches, concatenated in batch order.
     */
    async function batchProcessReferences(references, apiConfig, sourceLang = 'auto', progressCallback = null) {
        if (!Array.isArray(references) || references.length === 0) {
            return [];
        }

        // Partition the input into consecutive batches.
        const batches = [];
        for (let i = 0; i < references.length; i += BATCH_SIZE) {
            batches.push(references.slice(i, i + BATCH_SIZE));
        }

        console.log(`[ReferenceAIProcessor] Processing ${references.length} references in ${batches.length} batches (${BATCH_SIZE} per batch)`);

        let processedCount = 0;

        // Report progress without letting a faulty callback affect results.
        const reportProgress = (batchSize, batchIndex) => {
            processedCount += batchSize;
            if (typeof progressCallback !== 'function') {
                return;
            }
            try {
                progressCallback({
                    processed: processedCount,
                    total: references.length,
                    batchIndex: batchIndex,
                    totalBatches: batches.length
                });
            } catch (callbackError) {
                console.error('[ReferenceAIProcessor] Progress callback failed:', callbackError);
            }
        };

        // All batches are started at once; each resolves to its own result list.
        const batchPromises = batches.map(async (batch, batchIndex) => {
            let batchResults;
            try {
                batchResults = await callAIExtraction(batch, apiConfig, sourceLang);
            } catch (error) {
                console.error(`[ReferenceAIProcessor] Batch ${batchIndex} failed:`, error);

                // Keep `error` non-empty so consumers can detect the failure
                // with a simple truthiness check.
                const message = error?.message || String(error) || 'Unknown error';

                // On failure, hand back the raw input for this batch.
                batchResults = batch.map((ref, idx) => ({
                    index: batchIndex * BATCH_SIZE + idx,
                    rawText: ref,
                    extractedBy: 'fallback',
                    error: message
                }));
            }

            reportProgress(batch.length, batchIndex);
            return batchResults;
        });

        // Promise.all preserves batch order regardless of completion order.
        const perBatchResults = await Promise.all(batchPromises);
        return perBatchResults.flat();
    }

    /**
     * Process reference entries, sending only those flagged with
     * `needsAIProcessing` to the AI and keeping the rest as-is.
     *
     * AI results are merged over their originating entry by position. The
     * entry's own `index` is preserved so that fallback records (whose `index`
     * is relative to the AI-only subset) cannot scramble the final ordering.
     *
     * @param {Array} entries - Reference entries (already passed through regex extraction).
     * @param {Object} apiConfig - API configuration.
     * @param {string} [sourceLang='auto'] - Source language hint.
     * @param {Function} [progressCallback=null] - Progress callback.
     * @param {Object} [options={}] - Extra options: `{enrichWithDOI: boolean}`.
     * @returns {Promise<Array>} Processed entries sorted by `index`.
     */
    async function smartProcessReferences(entries, apiConfig, sourceLang = 'auto', progressCallback = null, options = {}) {
        if (!Array.isArray(entries)) {
            return [];
        }

        // Classify: entries that need AI vs. entries already extracted.
        const needsAI = entries.filter(e => e.needsAIProcessing);
        const alreadyExtracted = entries.filter(e => !e.needsAIProcessing);

        console.log(`[ReferenceAIProcessor] ${alreadyExtracted.length} references extracted by regex, ${needsAI.length} need AI processing`);

        if (needsAI.length === 0) {
            return entries.map(e => ({
                ...e,
                extractedBy: 'regex'
            }));
        }

        // Run AI extraction on the raw text of the flagged entries.
        const aiResults = await batchProcessReferences(
            needsAI.map(e => e.rawText),
            apiConfig,
            sourceLang,
            progressCallback
        );

        // Label regex-extracted entries here as well, matching the no-AI path
        // above; an `extractedBy` already present on the entry is kept.
        const regexResults = alreadyExtracted.map(e => ({
            extractedBy: 'regex',
            ...e
        }));

        const mergedAIResults = aiResults.map((aiResult, idx) => {
            const original = needsAI[idx];
            const merged = {
                ...original,
                ...aiResult,
                extractedBy: 'ai',
                confidence: aiResult.error ? 0 : 0.8 // confidence assigned to AI extraction
            };

            // Fallback records carry an `index` relative to the AI subset;
            // the entry's original position must win.
            if (original && original.index !== undefined) {
                merged.index = original.index;
            }

            return merged;
        });

        // Restore the original ordering.
        let finalResults = [...regexResults, ...mergedAIResults];
        finalResults.sort((a, b) => (a.index || 0) - (b.index || 0));

        // Optional: fill in missing DOI information via the DOI resolver.
        if (options?.enrichWithDOI && typeof window.DOIResolver !== 'undefined') {
            console.log('[ReferenceAIProcessor] Enriching with DOI information...');
            finalResults = await enrichWithDOI(finalResults, progressCallback);
        }

        return finalResults;
    }

    /**
     * Fill in missing DOI information using `window.DOIResolver`.
     *
     * Only references that have a `title` but no `doi` are queried. For each
     * resolved reference, `doi`, `doiSource` and `doiConfidence` are set, and
     * `url`, `authors`, `year`, `journal` are filled in from the resolver
     * result where applicable. Enrichment is best-effort: if the resolver is
     * unavailable, throws, or returns something other than an array, the input
     * list is returned unchanged.
     *
     * @param {Array} references - Reference list.
     * @param {Function} [progressCallback=null] - Called with
     *     `{phase: 'doi-enrichment', completed, total, current}`.
     * @returns {Promise<Array>} The enriched reference list (same order as input).
     */
    async function enrichWithDOI(references, progressCallback = null) {
        if (!window.DOIResolver) {
            console.warn('[ReferenceAIProcessor] DOIResolver not available, skipping DOI enrichment');
            return references;
        }

        // Select the references that still lack a DOI and have a title to search by.
        const needsDOI = references.filter(ref => !ref.doi && ref.title);

        if (needsDOI.length === 0) {
            console.log('[ReferenceAIProcessor] All references already have DOI');
            return references;
        }

        console.log(`[ReferenceAIProcessor] Querying DOI for ${needsDOI.length} references`);

        let doiResults;
        try {
            const resolver = window.DOIResolver.create({
                queryOrder: ['crossref', 'openalex', 'pubmed'],
                timeout: 5000
            });

            doiResults = await resolver.batchResolve(needsDOI, (progress) => {
                if (progressCallback) {
                    progressCallback({
                        phase: 'doi-enrichment',
                        completed: progress.completed,
                        total: progress.total,
                        current: progress.current
                    });
                }
            });
        } catch (error) {
            // DOI lookup is optional; never lose the extracted references over it.
            console.error('[ReferenceAIProcessor] DOI enrichment failed, returning references unchanged:', error);
            return references;
        }

        if (!Array.isArray(doiResults)) {
            console.warn('[ReferenceAIProcessor] DOI resolver returned no usable results, skipping DOI enrichment');
            return references;
        }

        // Index results by the reference object they were resolved for
        // (first result wins), so each lookup below is O(1).
        const resultByOriginal = new Map();
        for (const result of doiResults) {
            if (result && !resultByOriginal.has(result.original)) {
                resultByOriginal.set(result.original, result);
            }
        }

        // Merge the DOI information back into the original list.
        const enrichedReferences = references.map(ref => {
            if (ref.doi) {
                return ref; // already has a DOI
            }

            const resolved = resultByOriginal.get(ref)?.resolved;
            if (!resolved) {
                return ref;
            }

            return {
                ...ref,
                doi: resolved.doi,
                url: resolved.url || ref.url,
                // Only fill fields the reference does not already have.
                authors: ref.authors || resolved.authors,
                year: ref.year || resolved.year,
                journal: ref.journal || resolved.journal,
                doiSource: resolved.source,
                doiConfidence: resolved.confidence
            };
        });

        const successCount = enrichedReferences.filter(r => r.doi).length;
        console.log(`[ReferenceAIProcessor] DOI enrichment complete: ${successCount}/${references.length} now have DOI`);

        return enrichedReferences;
    }

    /**
     * Build the API configuration consumed by the extraction call (compatible
     * with the existing translation API presets).
     *
     * The returned `headers` include the credentials derived from `apiKey`
     * (when one is given): `Authorization: Bearer <key>` for the
     * chat-completions style endpoints (custom, mistral, deepseek) and
     * `x-goog-api-key` for Gemini.
     *
     * @param {string} model - 'custom', or one of the presets: 'mistral', 'deepseek', 'gemini'.
     * @param {string} apiKey - API key for the selected model.
     * @param {Object} [modelConfig=null] - Required for 'custom':
     *     `{apiEndpoint|apiBaseUrl, modelId, max_tokens?}`.
     * @returns {Object} `{endpoint, modelName, apiKey, headers, bodyBuilder, responseExtractor}`.
     * @throws {Error} If `model` is not supported.
     */
    function buildAPIConfig(model, apiKey, modelConfig = null) {
        const jsonHeaders = (extra) => ({ 'Content-Type': 'application/json', ...extra });
        // Without these the request would be sent unauthenticated.
        const bearerAuth = () => (apiKey ? { Authorization: `Bearer ${apiKey}` } : {});
        const googleKeyAuth = () => (apiKey ? { 'x-goog-api-key': apiKey } : {});

        const chatMessages = (sys, user) => [
            { role: 'system', content: sys },
            { role: 'user', content: user }
        ];
        const chatResponseExtractor = (data) => data?.choices?.[0]?.message?.content;

        if (model === 'custom' && modelConfig) {
            return {
                endpoint: modelConfig.apiEndpoint || modelConfig.apiBaseUrl,
                modelName: modelConfig.modelId,
                apiKey: apiKey,
                headers: jsonHeaders(bearerAuth()),
                bodyBuilder: (sys, user) => ({
                    model: modelConfig.modelId,
                    messages: chatMessages(sys, user),
                    temperature: 0.1,
                    max_tokens: modelConfig.max_tokens || 4000
                }),
                responseExtractor: chatResponseExtractor
            };
        }

        // Preset model configurations.
        const configs = {
            'mistral': {
                endpoint: 'https://api.mistral.ai/v1/chat/completions',
                modelName: 'mistral-large-latest'
            },
            'deepseek': {
                endpoint: 'https://api.deepseek.com/v1/chat/completions',
                modelName: 'deepseek-chat'
            },
            'gemini': {
                endpoint: 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent',
                modelName: 'gemini-2.0-flash',
                // Gemini has no system role in this request shape, so both
                // prompts are sent as one user part.
                bodyBuilder: (sys, user) => ({
                    contents: [{
                        role: 'user',
                        parts: [{ text: `${sys}\n\n${user}` }]
                    }],
                    generationConfig: {
                        temperature: 0.1,
                        maxOutputTokens: 4000
                    }
                }),
                responseExtractor: (data) => data?.candidates?.[0]?.content?.parts?.[0]?.text
            }
        };

        // Own-property check so inherited keys like 'constructor' are rejected.
        if (!Object.prototype.hasOwnProperty.call(configs, model)) {
            throw new Error(`Unsupported model: ${model}`);
        }
        const config = configs[model];

        return {
            ...config,
            apiKey: apiKey,
            headers: jsonHeaders(model === 'gemini' ? googleKeyAuth() : bearerAuth()),
            bodyBuilder: config.bodyBuilder || ((sys, user) => ({
                model: config.modelName,
                messages: chatMessages(sys, user),
                temperature: 0.1
            })),
            responseExtractor: config.responseExtractor || chatResponseExtractor
        };
    }

    // Public API: exposed as `ReferenceAIProcessor` on the global object that
    // was passed into this IIFE.
    global.ReferenceAIProcessor = {
        batchProcessReferences,
        smartProcessReferences,
        enrichWithDOI,
        generateExtractionPrompt,
        buildAPIConfig,
        BATCH_SIZE,
        version: '1.1.0'
    };

    console.log('[ReferenceAIProcessor] Reference AI processor loaded.');

})(window);