paper-burner/js/process/ocr-adapters/mineru-adapter.js

459 lines
15 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// process/ocr-adapters/mineru-adapter.js
// MinerU OCR 适配器
/**
 * MinerU OCR adapter.
 *
 * Uploads a PDF to a MinerU worker / local proxy, polls until extraction has
 * finished, then downloads the result ZIP and unpacks it.
 *
 * Features:
 * - Returns the contents of full.md plus every file under images/.
 * - Keeps the original PDF and the layout / content-list JSON (intended for
 *   V2 structured translation).
 */
class MinerUOcrAdapter extends OcrAdapter {
  /**
   * @param {Object} config - Adapter configuration; also forwarded to the OcrAdapter constructor.
   * @param {string} [config.token] - Stored on the instance; no method of this class reads it.
   * @param {string} config.workerUrl - Base URL of the worker / local proxy.
   * @param {string} [config.authKey] - Optional worker auth key, sent as the X-Auth-Key header.
   * @param {string} [config.tokenMode='backend'] - 'backend' = backend proxy, 'frontend' = frontend
   *   pass-through. Stored on the instance; no method of this class reads it.
   * @param {boolean} [config.enableOcr] - Anything other than `false` turns is_ocr on.
   * @param {boolean} [config.enableFormula] - Anything other than `false` turns enable_formula on.
   * @param {boolean} [config.enableTable] - Anything other than `false` turns enable_table on.
   */
  constructor(config) {
    super(config);
    this.token = config.token;
    this.workerUrl = config.workerUrl;
    this.authKey = config.authKey;
    this.tokenMode = config.tokenMode || 'backend';
    this.options = {
      is_ocr: config.enableOcr !== false,
      enable_formula: config.enableFormula !== false,
      enable_table: config.enableTable !== false
    };
  }

  /**
   * Escape a string so it can be embedded literally in a regular expression.
   * @param {*} str - Value to escape (coerced to string).
   * @returns {string}
   */
  escapeRegex(str) {
    return String(str).replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  }

  /**
   * Join the configured worker URL and an endpoint path.
   * Trailing slashes on the base are removed so that a workerUrl such as
   * "http://host/" does not produce "http://host//mineru/...".
   * @param {string} path - Endpoint path starting with "/".
   * @returns {string}
   */
  _workerEndpoint(path) {
    const base = String(this.workerUrl ?? '').replace(/\/+$/, '');
    return `${base}${path}`;
  }

  /**
   * Run the full pipeline for one file: upload, poll, download, unpack.
   * @param {File} file - PDF file.
   * @param {Function} [onProgress] - Called as (current, total, message).
   * @returns {Promise<Object>} { markdown, images, metadata }
   */
  async processFile(file, onProgress) {
    console.log('[MinerU OCR] Processing file:', file.name);

    // 1. Upload the file.
    onProgress?.(0, 100, '上传文件到 MinerU...');
    const batchId = await this.uploadFile(file);
    console.log('[MinerU OCR] Batch ID:', batchId);

    // 2. Poll for the result.
    onProgress?.(10, 100, '等待 MinerU 处理...');
    const zipUrl = await this.pollResult(batchId, onProgress);
    console.log('[MinerU OCR] ZIP URL:', zipUrl);

    // 3. Download and unpack the ZIP.
    onProgress?.(90, 100, '下载并解析结果...');
    const result = await this.downloadAndExtract(zipUrl);
    onProgress?.(100, 100, '完成');
    return result;
  }

  /**
   * Build the request headers for calls to the worker.
   * Only the optional X-Auth-Key header is set; no MinerU token header is
   * sent from here (the log line below states that the token comes from the
   * server environment).
   * @returns {Object} headers
   */
  buildHeaders() {
    const headers = {};
    if (this.authKey) {
      headers['X-Auth-Key'] = this.authKey;
    }
    console.log('[MinerU OCR] Authorization: Using backend proxy mode (token from server environment)');
    return headers;
  }

  /**
   * Upload the file together with the OCR option flags.
   * @param {File} file
   * @returns {Promise<string>} batch_id
   * @throws {Error} When the HTTP request fails or the response carries no batch_id.
   */
  async uploadFile(file) {
    const formData = new FormData();
    formData.append('file', file);
    formData.append('is_ocr', this.options.is_ocr.toString());
    formData.append('enable_formula', this.options.enable_formula.toString());
    formData.append('enable_table', this.options.enable_table.toString());

    const response = await fetch(this._workerEndpoint('/mineru/upload'), {
      method: 'POST',
      headers: this.buildHeaders(),
      body: formData
    });
    if (!response.ok) {
      // The error body may not be JSON; fall back to the status text.
      const error = await response.json().catch(() => ({ error: response.statusText }));
      throw new Error(`MinerU 上传失败: ${error?.error || response.statusText}`);
    }

    const data = await response.json();
    if (!data?.batch_id) {
      // Without a batch id every later poll would hit ".../result/undefined".
      throw new Error('MinerU 上传失败: 响应中缺少 batch_id');
    }
    return data.batch_id;
  }

  /**
   * Poll the worker until the batch is done, has failed, or the attempt
   * budget (100 polls, 3 s apart) is exhausted.
   * @param {string} batchId
   * @param {Function} [onProgress] - Called as (current, total, message) while running.
   * @returns {Promise<string>} ZIP URL
   * @throws {Error} On HTTP failure, a failed task, a finished task without a ZIP URL, or timeout.
   */
  async pollResult(batchId, onProgress) {
    const maxAttempts = 100;
    const pollInterval = 3000; // 3 seconds
    const resultUrl = this._workerEndpoint(`/mineru/result/${batchId}`);

    for (let attempt = 0; attempt < maxAttempts; attempt++) {
      const response = await fetch(resultUrl, { headers: this.buildHeaders() });
      if (!response.ok) {
        throw new Error('MinerU 查询失败');
      }

      const data = await response.json();
      // A missing or empty extract_result is treated as "not ready yet":
      // keep polling instead of crashing on `undefined.state`.
      const result = data?.extract_result?.[0];

      if (result) {
        if (result.state === 'running' && result.extract_progress) {
          const { extracted_pages, total_pages } = result.extract_progress;
          // Skip the update while total_pages is not a positive number;
          // dividing by it would report NaN / Infinity percent.
          if (total_pages > 0) {
            const percent = Math.floor((extracted_pages / total_pages) * 70) + 10;
            onProgress?.(percent, 100, `处理中: ${extracted_pages}/${total_pages}`);
          }
        }
        if (result.state === 'done') {
          const zipUrl = result.full_zip_url || result.fullZipUrl;
          if (!zipUrl) {
            throw new Error('MinerU 处理完成但未返回 ZIP 地址');
          }
          return zipUrl;
        }
        if (result.state === 'failed') {
          throw new Error(result.err_msg || 'MinerU 处理失败');
        }
      }

      await this.sleep(pollInterval);
    }
    throw new Error('MinerU 处理超时');
  }

  /**
   * Download a file as a Blob through XMLHttpRequest, logging progress.
   * @param {string} url - File URL.
   * @param {Object} [headers] - Request headers.
   * @param {number} [timeout=300000] - Timeout in milliseconds (default 5 minutes).
   * @returns {Promise<Blob>} File Blob
   */
  downloadWithXHR(url, headers = {}, timeout = 300000) {
    return new Promise((resolve, reject) => {
      const xhr = new XMLHttpRequest();
      xhr.open('GET', url, true);
      xhr.responseType = 'blob';
      xhr.timeout = timeout;

      for (const [key, value] of Object.entries(headers)) {
        xhr.setRequestHeader(key, value);
      }

      xhr.onload = () => {
        if (xhr.status >= 200 && xhr.status < 300) {
          console.log(`[MinerU OCR] XHR download completed: ${(xhr.response.size / 1024 / 1024).toFixed(2)} MB`);
          resolve(xhr.response);
        } else {
          reject(new Error(`XHR download failed: ${xhr.status} ${xhr.statusText}`));
        }
      };
      xhr.onerror = () => {
        reject(new Error('XHR network error - 网络错误,请检查代理服务器是否正常运行'));
      };
      xhr.ontimeout = () => {
        reject(new Error('XHR timeout - 下载超时'));
      };
      // Without this handler an aborted request would leave the promise pending forever.
      xhr.onabort = () => {
        reject(new Error('XHR aborted'));
      };
      xhr.onprogress = (event) => {
        if (event.lengthComputable) {
          const percent = ((event.loaded / event.total) * 100).toFixed(1);
          console.log(`[MinerU OCR] Download progress: ${percent}% (${(event.loaded / 1024 / 1024).toFixed(2)} MB)`);
        }
      };
      xhr.send();
    });
  }

  /**
   * Download a large file in byte ranges and merge the pieces.
   * Files below 20 MB are fetched in one go through downloadWithXHR.
   * @param {string} url - File URL.
   * @param {Object} [headers] - Request headers.
   * @param {number} [chunkSize=10485760] - Bytes per chunk (default 10 MB).
   * @returns {Promise<Blob>} Complete file Blob
   * @throws {Error} When the size cannot be determined, a chunk fails, or the merged size is wrong.
   */
  async downloadWithChunks(url, headers = {}, chunkSize = 10 * 1024 * 1024) {
    console.log('[MinerU OCR] Starting chunked download from:', url);

    // 1. Determine the file size.
    const headResponse = await fetch(url, { method: 'HEAD', headers });
    if (!headResponse.ok) {
      throw new Error(`HEAD request failed: ${headResponse.status}`);
    }
    const contentLength = Number.parseInt(headResponse.headers.get('Content-Length') || '0', 10);
    if (!(contentLength > 0)) { // also rejects NaN and negative values
      throw new Error('Cannot get file size (Content-Length missing)');
    }
    console.log(`[MinerU OCR] File size: ${(contentLength / 1024 / 1024).toFixed(2)} MB`);

    // Below 20 MB: single request (XHR gives progress and timeout handling).
    if (contentLength < 20 * 1024 * 1024) {
      console.log('[MinerU OCR] File is small enough, using direct download');
      return await this.downloadWithXHR(url, headers);
    }

    // 2. Work out the number of chunks.
    const numChunks = Math.ceil(contentLength / chunkSize);
    console.log(`[MinerU OCR] Splitting into ${numChunks} chunks of ~${(chunkSize / 1024 / 1024).toFixed(1)} MB each`);

    // 3. Download each chunk in order.
    const chunks = [];
    for (let i = 0; i < numChunks; i++) {
      const start = i * chunkSize;
      const end = Math.min(start + chunkSize - 1, contentLength - 1);
      console.log(`[MinerU OCR] Downloading chunk ${i + 1}/${numChunks} (bytes ${start}-${end})`);

      const response = await fetch(url, {
        headers: { ...headers, 'Range': `bytes=${start}-${end}` }
      });
      if (!response.ok) {
        throw new Error(`Chunk ${i + 1} download failed: ${response.status}`);
      }
      if (response.status !== 206) {
        // A 2xx other than 206 means the Range header was ignored and the body
        // is the whole file. Appending it once per chunk would corrupt the result.
        if (i === 0) {
          const wholeFile = await response.blob();
          console.log(`[MinerU OCR] Server ignored Range, using full response: ${(wholeFile.size / 1024 / 1024).toFixed(2)} MB`);
          return wholeFile;
        }
        throw new Error(`Chunk ${i + 1} download failed: expected 206, got ${response.status}`);
      }

      const chunkBlob = await response.blob();
      chunks.push(chunkBlob);
      console.log(`[MinerU OCR] Chunk ${i + 1}/${numChunks} completed: ${(chunkBlob.size / 1024 / 1024).toFixed(2)} MB`);
    }

    // 4. Merge all chunks and make sure nothing is missing or duplicated.
    console.log('[MinerU OCR] Merging chunks...');
    const fullBlob = new Blob(chunks);
    if (fullBlob.size !== contentLength) {
      throw new Error(`Chunked download size mismatch: expected ${contentLength} bytes, got ${fullBlob.size}`);
    }
    console.log(`[MinerU OCR] Chunked download completed: ${(fullBlob.size / 1024 / 1024).toFixed(2)} MB`);
    return fullBlob;
  }

  /**
   * Download the result ZIP and unpack markdown, images and metadata.
   * @param {string} zipUrl
   * @returns {Promise<Object>} { markdown, images, metadata }
   * @throws {Error} When the download fails, JSZip is not loaded, or full.md is missing.
   */
  async downloadAndExtract(zipUrl) {
    console.log('[MinerU OCR] Downloading ZIP from:', zipUrl);

    // Route the download through the worker when one is configured.
    let finalUrl = zipUrl;
    const headers = {};
    if (this.workerUrl) {
      finalUrl = this._workerEndpoint(`/mineru/zip?url=${encodeURIComponent(zipUrl)}`);
      if (this.authKey) headers['X-Auth-Key'] = this.authKey;
    }

    // Try the chunked download first; fall back to one plain request.
    let zipBlob;
    try {
      zipBlob = await this.downloadWithChunks(finalUrl, headers);
    } catch (chunkError) {
      console.warn('[MinerU OCR] Chunked download failed, trying direct download:', chunkError?.message);
      const zipResponse = await fetch(finalUrl, { headers });
      if (!zipResponse.ok) {
        throw new Error(`下载 ZIP 失败: ${zipResponse.status} ${zipResponse.statusText}`);
      }
      zipBlob = await zipResponse.blob();
    }
    console.log(`[MinerU OCR] Downloaded ZIP size: ${(zipBlob.size / 1024 / 1024).toFixed(2)} MB`);

    // Unpack with JSZip (expected as a global).
    if (typeof JSZip === 'undefined') {
      throw new Error('JSZip 未加载');
    }
    const zip = await JSZip.loadAsync(zipBlob);

    // full.md is mandatory.
    const fullMdFile = zip.file('full.md');
    if (!fullMdFile) {
      throw new Error('ZIP 中未找到 full.md');
    }
    let markdown = await fullMdFile.async('string');

    // Everything under images/ becomes a data URI.
    // NOTE(review): blobToBase64 is not defined in this class — presumably
    // inherited from OcrAdapter; confirm.
    const images = [];
    for (const file of zip.file(/^images\/.+/)) {
      const blob = await file.async('blob');
      const base64 = await this.blobToBase64(blob);
      const name = file.name.replace('images/', '');
      images.push({
        id: name,
        name,
        data: this._ensureImageDataUri(base64, name)
      });
    }

    // Best effort: a failure here keeps the original names and markdown.
    try {
      markdown = this._renameImagesSequentially(markdown, images);
    } catch (e) {
      console.warn('[MinerU OCR] 图片路径重写失败(忽略):', e);
    }

    const { layoutJson, contentListJson, originPdf } = await this._extractStructuredMetadata(zip);

    return {
      markdown,
      images,
      metadata: {
        engine: 'mineru',
        layoutJson,
        contentListJson,
        originalPdf: originPdf,
        // Reserved for V2.
        supportsStructuredTranslation: true
      }
    };
  }

  /**
   * Rename images to img-001.<ext>, img-002.<ext>, ... (in list order) and
   * rewrite every Markdown `](...)` and HTML `src="..."` reference to them
   * as images/<new name>. `images` is updated in place only after the text
   * has been rewritten successfully.
   * @param {string} markdown
   * @param {Array<Object>} images - Entries with id / name / data.
   * @returns {string} The rewritten markdown.
   */
  _renameImagesSequentially(markdown, images) {
    const extRegex = /\.([a-z0-9]+)$/i;
    const renameMap = new Map(); // old name -> new name
    const foldedMap = new Map(); // lower-cased old name -> new name (for the case-insensitive HTML pass)

    const newNames = images.map((img, idx) => {
      const oldName = String(img.name || img.id || '');
      const m = oldName.match(extRegex);
      const ext = m ? m[1].toLowerCase() : 'jpg';
      const newName = `img-${String(idx + 1).padStart(3, '0')}.${ext}`;
      if (oldName && oldName !== newName) {
        renameMap.set(oldName, newName);
        const folded = oldName.toLowerCase();
        if (!foldedMap.has(folded)) foldedMap.set(folded, newName);
      }
      return newName;
    });

    let rewritten = markdown;
    if (renameMap.size > 0) {
      // One pass over the text for all names: a reference that has just been
      // rewritten can never be picked up again by another image's rule, even
      // when an old name equals some other image's new name.
      const anyOldName = [...renameMap.keys()].map((n) => this.escapeRegex(n)).join('|');

      // Markdown: ](old), ](./old), ](images/old), ](any/dir/old)
      const mdRef = new RegExp(`\\]\\((?:[^()\\s]*/)?(${anyOldName})\\)`, 'g');
      rewritten = rewritten.replace(mdRef, (match, oldName) => `](images/${renameMap.get(oldName)})`);

      // HTML: src="old" or src="any/dir/old" (either quote style)
      const htmlRef = new RegExp(`src=["'](?:[^"']*/)?(${anyOldName})["']`, 'gi');
      rewritten = rewritten.replace(htmlRef, (match, oldName) => {
        const newName = renameMap.get(oldName) ?? foldedMap.get(oldName.toLowerCase());
        return newName ? `src="images/${newName}"` : match;
      });
    }

    images.forEach((img, idx) => {
      const newName = newNames[idx];
      img.id = newName;
      img.name = newName;
      img.data = this._ensureImageDataUri(img.data, newName);
    });
    return rewritten;
  }

  /**
   * Read the optional layout.json, *content_list.json and *_origin.pdf
   * entries. Each one is read independently, so a broken entry only nulls
   * itself.
   * @param {Object} zip - Loaded JSZip instance.
   * @returns {Promise<Object>} { layoutJson, contentListJson, originPdf }; absent or unreadable parts are null.
   */
  async _extractStructuredMetadata(zip) {
    const readOptional = async (reader) => {
      try {
        return await reader();
      } catch (e) {
        console.warn('[MinerU OCR] 提取元数据时出错:', e);
        return null;
      }
    };

    const layoutJson = await readOptional(async () => {
      const layoutFile = zip.file('layout.json');
      return layoutFile ? JSON.parse(await layoutFile.async('string')) : null;
    });
    const contentListJson = await readOptional(async () => {
      const contentListFile = zip.file(/content_list\.json$/)[0];
      return contentListFile ? JSON.parse(await contentListFile.async('string')) : null;
    });
    const originPdf = await readOptional(async () => {
      const originPdfFile = zip.file(/_origin\.pdf$/)[0];
      return originPdfFile ? await originPdfFile.async('blob') : null;
    });

    return { layoutJson, contentListJson, originPdf };
  }

  /**
   * Make sure image data is a data URI with an image MIME type.
   * Raw base64 is prefixed; an application/octet-stream data URI gets its
   * MIME type replaced by the one guessed from the file name.
   * @param {string} data - Raw base64 or a data URI.
   * @param {string} name - File name used to guess the MIME type.
   * @returns {string} Data URI ('' for empty input).
   */
  _ensureImageDataUri(data, name) {
    try {
      if (!data) return '';
      if (typeof data === 'string' && data.startsWith('data:')) {
        if (/^data:application\/octet-stream;base64,/i.test(data)) {
          const mime = this._guessMimeByName(name);
          return data.replace(/^data:application\/octet-stream/i, `data:${mime}`);
        }
        return data;
      }
      const mime = this._guessMimeByName(name);
      return `data:${mime};base64,${data}`;
    } catch {
      return data;
    }
  }

  /**
   * Guess an image MIME type from the file extension.
   * @param {string} name
   * @returns {string} MIME type; image/jpeg for anything unrecognised.
   */
  _guessMimeByName(name) {
    const ext = String(name || '').split('.').pop().toLowerCase();
    if (ext === 'png') return 'image/png';
    if (ext === 'gif') return 'image/gif';
    if (ext === 'webp') return 'image/webp';
    if (ext === 'bmp') return 'image/bmp';
    if (ext === 'svg') return 'image/svg+xml';
    return 'image/jpeg';
  }

  /**
   * Delay helper.
   * @param {number} ms - Milliseconds.
   * @returns {Promise<void>}
   */
  sleep(ms) {
    return new Promise((resolve) => setTimeout(resolve, ms));
  }
}
// CommonJS export, for environments that provide `module.exports`.
if (typeof module !== 'undefined' && module.exports) {
  module.exports = MinerUOcrAdapter;
}

// Browser export: expose the adapter as a property of `window`.
if (typeof window !== 'undefined') {
  window.MinerUOcrAdapter = MinerUOcrAdapter;
}