paper-burner/js/processing/content-list-to-chunks.js

// js/processing/content-list-to-chunks.js
// 将 MinerU contentList 转换为 chunks 格式

(function(global) {
    'use strict';

    /**
     * 从 contentList 生成 chunks
     * @param {Array} contentListJson - MinerU content_list.json 数据
     * @param {Array} translatedContentList - 翻译后的内容列表
     * @param {number} maxTokensPerChunk - 每个chunk的最大token数（默认2000）
     * @returns {Object} { ocrChunks: Array<string>, translatedChunks: Array<string> }
     */
    function generateChunksFromContentList(contentListJson, translatedContentList, maxTokensPerChunk = 2000) {
        if (!contentListJson || !Array.isArray(contentListJson)) {
            console.warn('[ContentListToChunks] Invalid contentListJson');
            return { ocrChunks: [], translatedChunks: [] };
        }

        // 按页面分组
        const pageGroups = groupByPage(contentListJson);
        const translatedPageGroups = groupByPage(translatedContentList || []);

        const ocrChunks = [];
        const translatedChunks = [];

        // 遍历每一页
        Object.keys(pageGroups).sort((a, b) => parseInt(a) - parseInt(b)).forEach(pageIdx => {
            const pageItems = pageGroups[pageIdx];
            const translatedPageItems = translatedPageGroups[pageIdx] || [];

            // 将页面内容按token限制分块
            const pageChunks = chunkPageContent(pageItems, translatedPageItems, maxTokensPerChunk);

            ocrChunks.push(...pageChunks.ocrChunks);
            translatedChunks.push(...pageChunks.translatedChunks);
        });

        console.log(`[ContentListToChunks] 生成了 ${ocrChunks.length} 个chunks`);

        return {
            ocrChunks,
            translatedChunks
        };
    }

    /**
     * 按页面分组
     */
    function groupByPage(contentList) {
        const groups = {};

        contentList.forEach(item => {
            const pageIdx = item.page_idx !== undefined ? item.page_idx : 0;
            if (!groups[pageIdx]) {
                groups[pageIdx] = [];
            }
            groups[pageIdx].push(item);
        });

        return groups;
    }

    /**
     * 将单页内容分块
     */
    function chunkPageContent(ocrItems, translatedItems, maxTokens) {
        const ocrChunks = [];
        const translatedChunks = [];

        // 按类型分组（text, image, table等）
        const sections = groupBySection(ocrItems);
        const translatedSections = groupBySection(translatedItems);

        let currentOcrChunk = [];
        let currentTranslatedChunk = [];
        let currentTokens = 0;

        sections.forEach((section, idx) => {
            const translatedSection = translatedSections[idx] || section;

            // 提取文本内容
            const ocrText = extractTextFromSection(section);
            const translatedText = extractTextFromSection(translatedSection);

            // 估算token数
            const tokens = estimateTokens(ocrText);

            // 如果当前块加上这个section会超过限制，先保存当前块
            if (currentTokens + tokens > maxTokens && currentOcrChunk.length > 0) {
                ocrChunks.push(currentOcrChunk.join('\n\n'));
                translatedChunks.push(currentTranslatedChunk.join('\n\n'));
                currentOcrChunk = [];
                currentTranslatedChunk = [];
                currentTokens = 0;
            }

            // 添加到当前块
            if (ocrText) {
                currentOcrChunk.push(ocrText);
                currentTranslatedChunk.push(translatedText || ocrText);
                currentTokens += tokens;
            }
        });

        // 保存最后一个块
        if (currentOcrChunk.length > 0) {
            ocrChunks.push(currentOcrChunk.join('\n\n'));
            translatedChunks.push(currentTranslatedChunk.join('\n\n'));
        }

        return { ocrChunks, translatedChunks };
    }

    /**
     * 按section分组（相邻的相同类型元素合并）
     */
    function groupBySection(items) {
        const sections = [];
        let currentSection = [];
        let lastType = null;

        items.forEach(item => {
            const type = item.type || 'text';

            // 如果类型改变，开始新section
            if (type !== lastType && currentSection.length > 0) {
                sections.push(currentSection);
                currentSection = [];
            }

            currentSection.push(item);
            lastType = type;
        });

        if (currentSection.length > 0) {
            sections.push(currentSection);
        }

        return sections;
    }

    /**
     * 从section提取文本
     */
    function extractTextFromSection(section) {
        if (!section || !Array.isArray(section)) {
            return '';
        }

        const texts = section.map(item => {
            if (item.type === 'text' || !item.type) {
                return item.text || item.content || '';
            } else if (item.type === 'title') {
                const level = item.level || 1;
                const prefix = '#'.repeat(Math.min(level, 6));
                return `${prefix} ${item.text || item.content || ''}`;
            } else if (item.type === 'table') {
                // 表格内容
                return item.text || item.markdown || item.content || '';
            } else if (item.type === 'image') {
                // 图片描述
                return item.caption || item.text || '';
            }
            return item.text || item.content || '';
        });

        return texts.filter(Boolean).join('\n\n');
    }

    /**
     * 估算token数
     */
    function estimateTokens(text) {
        if (typeof global.estimateTokenCount === 'function') {
            return global.estimateTokenCount(text);
        }
        // 简单估算：中文按2字符/token，英文按4字符/token
        const chineseChars = (text.match(/[\u4e00-\u9fa5]/g) || []).length;
        const totalChars = text.length;
        const englishChars = totalChars - chineseChars;
        return Math.ceil(chineseChars / 2 + englishChars / 4);
    }

    /**
     * 从完整的OCR和翻译文本生成chunks（备用方法）
     */
    function generateChunksFromFullText(ocrText, translatedText, maxTokensPerChunk = 2000) {
        const estimator = typeof global.estimateTokenCount === 'function'
            ? global.estimateTokenCount
            : (t) => Math.ceil(t.length / 4);

        const splitter = typeof global.splitMarkdownIntoChunks === 'function'
            ? global.splitMarkdownIntoChunks
            : simpleSplit;

        const ocrChunks = splitter(ocrText, maxTokensPerChunk, '[GenerateChunks OCR]');
        const translatedChunks = translatedText
            ? splitter(translatedText, maxTokensPerChunk, '[GenerateChunks Translation]')
            : ocrChunks.map(() => '');

        return { ocrChunks, translatedChunks };
    }

    /**
     * 简单分割（备用）
     */
    function simpleSplit(text, tokenLimit, logContext) {
        const lines = text.split('\n');
        const chunks = [];
        let currentChunk = [];
        let currentTokens = 0;

        lines.forEach(line => {
            const lineTokens = Math.ceil(line.length / 4);

            if (currentTokens + lineTokens > tokenLimit && currentChunk.length > 0) {
                chunks.push(currentChunk.join('\n'));
                currentChunk = [];
                currentTokens = 0;
            }

            currentChunk.push(line);
            currentTokens += lineTokens;
        });

        if (currentChunk.length > 0) {
            chunks.push(currentChunk.join('\n'));
        }

        return chunks;
    }

    // 导出到全局
    global.generateChunksFromContentList = generateChunksFromContentList;
    global.generateChunksFromFullText = generateChunksFromFullText;

    console.log('[ContentListToChunks] Content list to chunks converter loaded.');

})(window);