paper-burner/js/processing/content-list-to-chunks.js

244 lines
8.1 KiB
JavaScript
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// js/processing/content-list-to-chunks.js
// Converts a MinerU contentList into the chunks format.
(function(global) {
'use strict';
/**
 * Build token-limited chunks from a MinerU content list, one page at a time.
 *
 * Items are grouped by page, pages are visited in ascending numeric order, and
 * every page is chunked on its own, so a chunk never mixes items from two pages.
 *
 * @param {Array} contentListJson - Parsed MinerU content_list.json data.
 * @param {Array} [translatedContentList] - Translated content list. Any value
 *   that is not an array is treated as "no translation available".
 * @param {number} [maxTokensPerChunk=2000] - Token budget for a single chunk.
 * @returns {{ocrChunks: string[], translatedChunks: string[]}} Chunk arrays for
 *   the OCR text and the translated text; both are empty when
 *   `contentListJson` is not an array.
 */
function generateChunksFromContentList(contentListJson, translatedContentList, maxTokensPerChunk = 2000) {
  if (!Array.isArray(contentListJson)) {
    console.warn('[ContentListToChunks] Invalid contentListJson');
    return { ocrChunks: [], translatedChunks: [] };
  }

  // Group both lists by page index.
  const pageGroups = groupByPage(contentListJson);
  // Guard with Array.isArray rather than `|| []`: a truthy non-array (for
  // example a plain object) would otherwise reach groupByPage and crash there.
  const translatedPageGroups = groupByPage(
    Array.isArray(translatedContentList) ? translatedContentList : []
  );

  const ocrChunks = [];
  const translatedChunks = [];

  // Object keys are strings, so compare them numerically (explicit radix 10).
  const pageKeys = Object.keys(pageGroups).sort(
    (a, b) => Number.parseInt(a, 10) - Number.parseInt(b, 10)
  );

  pageKeys.forEach((pageIdx) => {
    // Split this page's content according to the token budget.
    const pageChunks = chunkPageContent(
      pageGroups[pageIdx],
      translatedPageGroups[pageIdx] || [],
      maxTokensPerChunk
    );
    ocrChunks.push(...pageChunks.ocrChunks);
    translatedChunks.push(...pageChunks.translatedChunks);
  });

  console.log(`[ContentListToChunks] 生成了 ${ocrChunks.length} 个chunks`);
  return {
    ocrChunks,
    translatedChunks
  };
}
/**
 * Group content-list items by page index.
 *
 * @param {Array<Object>} contentList - Items that may carry a `page_idx` property.
 * @returns {Object<string, Array<Object>>} Object keyed by page index; each value
 *   holds that page's items in their original order. Items whose `page_idx` is
 *   missing or null are placed on page 0.
 */
function groupByPage(contentList) {
  const groups = {};
  contentList.forEach((item) => {
    // `== null` covers both undefined and null. A null index would otherwise be
    // stored under the key "null", which has no numeric value for page ordering.
    const pageIdx = item.page_idx == null ? 0 : item.page_idx;
    if (!groups[pageIdx]) {
      groups[pageIdx] = [];
    }
    groups[pageIdx].push(item);
  });
  return groups;
}
/**
 * Split the content of a single page into chunks that respect a token budget.
 *
 * OCR items and translated items are each grouped into sections, and the
 * sections are paired by position; when there is no translated section at a
 * given position the OCR section itself is used. The budget is measured on the
 * OCR text only. Sections are never split: a section that alone exceeds the
 * budget still becomes (part of) one chunk. Sections with empty OCR text are
 * skipped.
 *
 * @param {Array<Object>} ocrItems - Items of one page from the OCR content list.
 * @param {Array<Object>} translatedItems - Items of the same page from the translated list.
 * @param {number} maxTokens - Token budget per chunk.
 * @returns {{ocrChunks: string[], translatedChunks: string[]}} Parallel chunk arrays.
 */
function chunkPageContent(ocrItems, translatedItems, maxTokens) {
  const result = { ocrChunks: [], translatedChunks: [] };

  // Group adjacent items of the same type (text, image, table, ...) into sections.
  const sourceSections = groupBySection(ocrItems);
  const targetSections = groupBySection(translatedItems);

  let pendingSource = [];
  let pendingTarget = [];
  let pendingTokens = 0;

  // Emit the accumulated texts as one chunk pair; does nothing when empty.
  const flushPending = () => {
    if (pendingSource.length === 0) {
      return;
    }
    result.ocrChunks.push(pendingSource.join('\n\n'));
    result.translatedChunks.push(pendingTarget.join('\n\n'));
    pendingSource = [];
    pendingTarget = [];
    pendingTokens = 0;
  };

  for (let i = 0; i < sourceSections.length; i += 1) {
    const sourceSection = sourceSections[i];
    const targetSection = targetSections[i] || sourceSection;

    // Extract the text of both sides, then estimate the cost from the OCR side.
    const sourceText = extractTextFromSection(sourceSection);
    const targetText = extractTextFromSection(targetSection);
    const cost = estimateTokens(sourceText);

    // Close the current chunk first if this section would push it over budget.
    if (pendingTokens + cost > maxTokens) {
      flushPending();
    }

    if (!sourceText) {
      continue;
    }
    pendingSource.push(sourceText);
    // Fall back to the OCR text when the translated side is empty.
    pendingTarget.push(targetText || sourceText);
    pendingTokens += cost;
  }

  // Emit whatever is left as the final chunk.
  flushPending();
  return result;
}
/**
 * Group items into sections: runs of adjacent items that share the same type.
 *
 * An item without a `type` is treated as type 'text'.
 *
 * @param {Array<Object>} items - Items in document order.
 * @returns {Array<Array<Object>>} Sections, each a non-empty array of items.
 */
function groupBySection(items) {
  let previousType = null;
  return items.reduce((sections, entry) => {
    const entryType = entry.type || 'text';
    // Open a new section for the very first item and whenever the type changes.
    const startsNewRun = sections.length === 0 || entryType !== previousType;
    if (startsNewRun) {
      sections.push([]);
    }
    sections[sections.length - 1].push(entry);
    previousType = entryType;
    return sections;
  }, []);
}
/**
 * Render a section (array of items) as text.
 *
 * Per item type:
 *  - 'title': the text prefixed with 1 to 6 '#' characters taken from `level`;
 *  - 'table': `text`, else `markdown`, else `content`;
 *  - 'image': `caption`, else `text`;
 *  - anything else (including a missing type): `text`, else `content`.
 * Items that render to an empty string are dropped; the rest are joined with a
 * blank line.
 *
 * @param {Array<Object>} section - Items of one section.
 * @returns {string} The joined text, or '' when `section` is not an array.
 */
function extractTextFromSection(section) {
  if (!Array.isArray(section)) {
    return '';
  }

  const texts = section.map((item) => {
    const plain = item.text || item.content || '';
    switch (item.type) {
      case 'title': {
        // Clamp the heading level to an integer in 1..6. String.prototype.repeat
        // throws a RangeError for negative counts, and a non-numeric or
        // fractional (<1) level would otherwise yield an empty prefix.
        const level = Math.floor(Number(item.level)) || 1;
        const depth = Math.min(Math.max(level, 1), 6);
        return `${'#'.repeat(depth)} ${plain}`;
      }
      case 'table':
        // Table content.
        return item.text || item.markdown || item.content || '';
      case 'image':
        // Image description.
        return item.caption || item.text || '';
      default:
        return plain;
    }
  });

  return texts.filter(Boolean).join('\n\n');
}
/**
 * Estimate the number of tokens in a piece of text.
 *
 * Delegates to `global.estimateTokenCount` when that function exists. Otherwise
 * uses a rough heuristic: characters in the range U+4E00..U+9FA5 count as two
 * characters per token, all other characters as four characters per token.
 *
 * @param {string} text - Text to measure. In the fallback path, null/undefined
 *   count as empty text and other non-strings are converted with String().
 * @returns {number} Estimated token count.
 */
function estimateTokens(text) {
  if (typeof global.estimateTokenCount === 'function') {
    return global.estimateTokenCount(text);
  }

  // Normalise the input so that `.match` / `.length` cannot throw on
  // null, undefined or non-string values.
  const str = text == null ? '' : String(text);

  const chineseChars = (str.match(/[\u4e00-\u9fa5]/g) || []).length;
  const otherChars = str.length - chineseChars;
  return Math.ceil(chineseChars / 2 + otherChars / 4);
}
/**
 * Generate chunks from the complete OCR text and translated text (fallback path).
 *
 * Uses `global.splitMarkdownIntoChunks` when that function exists, otherwise the
 * local `simpleSplit`. The OCR text and the translation are split independently
 * of each other, so the two arrays are not guaranteed to have the same length.
 * When `translatedText` is empty/falsy, one empty string is returned per OCR chunk.
 *
 * @param {string} ocrText - Full OCR text.
 * @param {string} translatedText - Full translated text; may be empty.
 * @param {number} [maxTokensPerChunk=2000] - Token budget passed to the splitter.
 * @returns {{ocrChunks: string[], translatedChunks: string[]}} The split texts.
 */
function generateChunksFromFullText(ocrText, translatedText, maxTokensPerChunk = 2000) {
  const splitter = typeof global.splitMarkdownIntoChunks === 'function'
    ? global.splitMarkdownIntoChunks
    : simpleSplit;

  const ocrChunks = splitter(ocrText, maxTokensPerChunk, '[GenerateChunks OCR]');
  const translatedChunks = translatedText
    ? splitter(translatedText, maxTokensPerChunk, '[GenerateChunks Translation]')
    : ocrChunks.map(() => '');

  return { ocrChunks, translatedChunks };
}
/**
 * Simple line-based splitter (fallback).
 *
 * Accumulates whole lines into a chunk until adding the next line would exceed
 * `tokenLimit`, estimating each line at one token per four characters (rounded
 * up). Lines are never broken, so a single long line forms its own chunk.
 *
 * @param {string} text - Text to split.
 * @param {number} tokenLimit - Token budget per chunk.
 * @param {string} logContext - Accepted but not used by this implementation.
 * @returns {string[]} Chunks, each made of lines re-joined with '\n'.
 */
function simpleSplit(text, tokenLimit, logContext) {
  const chunks = [];
  let pendingLines = [];
  let pendingTokens = 0;

  for (const line of text.split('\n')) {
    const cost = Math.ceil(line.length / 4);
    const wouldOverflow = pendingTokens + cost > tokenLimit;

    // Close the current chunk before adding a line that does not fit.
    if (wouldOverflow && pendingLines.length > 0) {
      chunks.push(pendingLines.join('\n'));
      pendingLines = [];
      pendingTokens = 0;
    }

    pendingLines.push(line);
    pendingTokens += cost;
  }

  if (pendingLines.length > 0) {
    chunks.push(pendingLines.join('\n'));
  }
  return chunks;
}
// Expose the two public entry points on the global object passed into this IIFE.
global.generateChunksFromContentList = generateChunksFromContentList;
global.generateChunksFromFullText = generateChunksFromFullText;
console.log('[ContentListToChunks] Content list to chunks converter loaded.');
})(window);