paper-burner/js/process/ocr-adapters/doc2x-adapter.js

546 lines
18 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// process/ocr-adapters/doc2x-adapter.js
// Doc2X OCR 适配器
/**
* Doc2X OCR 适配器
* 特性:
* - 使用 status 端点获取永久 Markdown
* - 尝试导出带图片版本3次重试
* - 失败后降级到 CDN 图床版本
* - 使用 Dollar 公式模式
*/
class Doc2XOcrAdapter extends OcrAdapter {
constructor(config) {
super(config);
this.token = config.token;
this.workerUrl = config.workerUrl;
this.authKey = config.authKey; // Worker Auth Key (可选)
this.tokenMode = config.tokenMode || 'frontend'; // 'frontend' 或 'worker'
this.maxExportRetries = 3; // 导出重试次数
}
/**
* 处理文件
* @param {File} file - PDF 文件
* @param {Function} onProgress - 进度回调
* @returns {Promise<Object>} { markdown, images, metadata }
*/
async processFile(file, onProgress) {
console.log('[Doc2X OCR] Processing file:', file.name);
// 1. 上传文件
onProgress?.(0, 100, '上传文件到 Doc2X...');
const uid = await this.uploadFile(file);
console.log('[Doc2X OCR] UID:', uid);
// 2. 轮询状态,获取永久 Markdown
onProgress?.(10, 100, '等待 Doc2X 处理...');
const statusData = await this.pollStatus(uid, onProgress);
console.log('[Doc2X OCR] Status data received');
// 3. 优先直接使用状态端点返回的“永久 Markdown”包含官方图床链接
const preferStatusMd = (localStorage.getItem('ocrDoc2XPreferStatusMd') || 'true') !== 'false';
const statusMarkdown = statusData && typeof statusData.markdown === 'string' ? statusData.markdown : '';
if (preferStatusMd && statusMarkdown) {
onProgress?.(95, 100, '使用状态返回的 Markdown...');
onProgress?.(100, 100, '完成');
return {
markdown: statusMarkdown,
images: [],
metadata: {
engine: 'doc2x',
source: 'status',
pageCount: statusData.page_count || 0
}
};
}
// 可选:若明确配置使用导出 ZIP则尝试带图片版本
onProgress?.(85, 100, '尝试导出带图片版本...');
const exportResult = await this.tryExportWithImages(uid, statusData, onProgress);
onProgress?.(100, 100, '完成');
return exportResult;
}
/**
* 构建请求头
* @param {boolean} isJsonContent - 是否为 JSON 内容(需要 Content-Type
* @returns {Object} headers
*/
buildHeaders(isJsonContent = false) {
const headers = {};
// Worker Auth Key (可选)
if (this.authKey) {
headers['X-Auth-Key'] = this.authKey;
}
// Doc2X Token (仅在前端透传模式)
if (this.tokenMode === 'frontend' && this.token) {
headers['X-Doc2X-Key'] = this.token;
const tokenPreview = this.token.length > 12
? `${this.token.substring(0, 6)}...${this.token.substring(this.token.length - 6)}`
: this.token;
console.log(`[Doc2X OCR] Authorization: Using frontend mode with token (${tokenPreview})`);
} else if (this.tokenMode === 'frontend' && !this.token) {
console.warn('[Doc2X OCR] Authorization: Frontend mode selected but no token provided');
} else if (this.tokenMode === 'worker') {
console.log('[Doc2X OCR] Authorization: Using worker mode (token from environment)');
} else {
console.warn('[Doc2X OCR] Authorization: Unknown token mode:', this.tokenMode);
}
// JSON Content-Type
if (isJsonContent) {
headers['Content-Type'] = 'application/json';
}
return headers;
}
/**
* 上传文件
* @param {File} file
* @returns {Promise<string>} uid
*/
async uploadFile(file) {
const formData = new FormData();
formData.append('file', file);
formData.append('ocr', 'true'); // 启用 OCR
formData.append('formula_mode', 'dollar'); // Dollar 公式模式
const response = await fetch(`${this.workerUrl}/doc2x/upload`, {
method: 'POST',
headers: this.buildHeaders(),
body: formData
});
if (!response.ok) {
const error = await response.json().catch(() => ({ error: response.statusText }));
throw new Error(`Doc2X 上传失败: ${error.error || response.statusText}`);
}
const data = await response.json();
return data.uid;
}
/**
* 轮询状态
* @param {string} uid
* @param {Function} onProgress
* @returns {Promise<Object>} 状态数据(包含永久 Markdown
*/
async pollStatus(uid, onProgress) {
const maxAttempts = 100;
const pollInterval = 3000; // 3 秒
for (let i = 0; i < maxAttempts; i++) {
const response = await fetch(`${this.workerUrl}/doc2x/status/${uid}`, {
headers: this.buildHeaders()
});
if (!response.ok) {
throw new Error('Doc2X 查询失败');
}
const data = await response.json();
// 更新进度
if (data.progress !== undefined && data.progress !== null) {
// 兼容 0-1 或 0-100 两种进度表达
const raw = Number(data.progress);
const progressPct = isFinite(raw) ? (raw <= 1 ? Math.round(raw * 100) : Math.round(raw)) : 0;
const percent = Math.min(10 + Math.floor(progressPct * 0.7), 95);
onProgress?.(percent, 100, `处理中: ${progressPct}%`);
}
// 完成
if (data.status === 'success' && data.result) {
return data.result;
}
// 失败
if (data.status === 'failed') {
throw new Error(data.error || 'Doc2X 处理失败');
}
await this.sleep(pollInterval);
}
throw new Error('Doc2X 处理超时');
}
/**
* 尝试导出带图片版本最多3次重试
* @param {string} uid
* @param {Object} statusData - 状态端点返回的数据(包含永久 Markdown 和 CDN 图片)
* @returns {Promise<Object>} { markdown, images, metadata }
*/
async tryExportWithImages(uid, statusData, onProgress) {
// 尝试导出
for (let attempt = 1; attempt <= this.maxExportRetries; attempt++) {
try {
console.log(`[Doc2X OCR] Export attempt ${attempt}/${this.maxExportRetries}`);
onProgress?.(90, 100, `导出中 (${attempt}/${this.maxExportRetries})...`);
const exportUrl = await this.requestExport(uid);
const result = await this.downloadAndExtractExport(exportUrl, statusData);
onProgress?.(98, 100, '导出完成,正在解析 ZIP...');
console.log('[Doc2X OCR] Export succeeded');
return result;
} catch (error) {
console.warn(`[Doc2X OCR] Export attempt ${attempt} failed:`, error.message);
if (attempt === this.maxExportRetries) {
// 所有重试都失败,降级到 CDN 图床版本
console.log('[Doc2X OCR] All export attempts failed, falling back to CDN images');
return this.useCdnImages(statusData);
}
// 等待一下再重试
await this.sleep(2000);
}
}
}
/**
* 请求导出
* @param {string} uid
* @returns {Promise<string>} 导出 ZIP URL
*/
async requestExport(uid) {
// 1) 触发导出任务(使用 to:'md' + formula_mode:'dollar'
const triggerResp = await fetch(`${this.workerUrl}/doc2x/convert`, {
method: 'POST',
headers: this.buildHeaders(true),
body: JSON.stringify({
uid,
to: 'md',
formula_mode: 'dollar'
})
});
if (!triggerResp.ok) {
throw new Error('Doc2X 导出请求失败');
}
// 2) 轮询导出结果,直到获得下载 URL
const headers = this.buildHeaders();
const base = this.workerUrl.replace(/\/+$/, '');
for (let i = 0; i < 50; i++) {
await this.sleep(2000);
const res = await fetch(`${base}/doc2x/convert/result/${uid}`, { headers });
if (!res.ok) continue;
const data = await res.json();
const url = data.url || data.zip_url || data.full_zip_url || data.fullZipUrl;
if (data.status === 'success' && url) {
return url;
}
}
throw new Error('Doc2X 导出超时');
}
/**
* 下载并解压导出的 ZIP成功导出带图片版本
* @param {string} zipUrl
* @param {Object} statusData - 备用的状态数据
* @returns {Promise<Object>} { markdown, images, metadata }
*/
/**
* 分片下载大文件
* @param {string} url - 文件 URL
* @param {Object} headers - 请求头
* @param {number} chunkSize - 每片大小(默认 10MB
* @returns {Promise<Blob>} 完整文件 Blob
*/
async downloadWithChunks(url, headers = {}, chunkSize = 10 * 1024 * 1024) {
console.log('[Doc2X OCR] Starting chunked download from:', url);
// 1. 获取文件大小
const headResponse = await fetch(url, { method: 'HEAD', headers });
if (!headResponse.ok) {
throw new Error(`HEAD request failed: ${headResponse.status}`);
}
const contentLength = parseInt(headResponse.headers.get('Content-Length') || '0');
if (!contentLength || contentLength === 0) {
throw new Error('Cannot get file size (Content-Length missing)');
}
console.log(`[Doc2X OCR] File size: ${(contentLength / 1024 / 1024).toFixed(2)} MB`);
// 如果文件小于 20MB直接下载
if (contentLength < 20 * 1024 * 1024) {
console.log('[Doc2X OCR] File is small enough, using direct download');
const response = await fetch(url, { headers });
if (!response.ok) throw new Error(`Download failed: ${response.status}`);
return await response.blob();
}
// 2. 计算分片数量
const numChunks = Math.ceil(contentLength / chunkSize);
console.log(`[Doc2X OCR] Splitting into ${numChunks} chunks of ~${(chunkSize / 1024 / 1024).toFixed(1)} MB each`);
// 3. 下载每个分片
const chunks = [];
for (let i = 0; i < numChunks; i++) {
const start = i * chunkSize;
const end = Math.min(start + chunkSize - 1, contentLength - 1);
console.log(`[Doc2X OCR] Downloading chunk ${i + 1}/${numChunks} (bytes ${start}-${end})`);
const rangeHeaders = {
...headers,
'Range': `bytes=${start}-${end}`
};
const response = await fetch(url, { headers: rangeHeaders });
if (!response.ok && response.status !== 206) {
throw new Error(`Chunk ${i + 1} download failed: ${response.status}`);
}
const chunkBlob = await response.blob();
chunks.push(chunkBlob);
console.log(`[Doc2X OCR] Chunk ${i + 1}/${numChunks} completed: ${(chunkBlob.size / 1024 / 1024).toFixed(2)} MB`);
}
// 4. 合并所有分片
console.log('[Doc2X OCR] Merging chunks...');
const fullBlob = new Blob(chunks);
console.log(`[Doc2X OCR] Chunked download completed: ${(fullBlob.size / 1024 / 1024).toFixed(2)} MB`);
return fullBlob;
}
async downloadAndExtractExport(zipUrl, statusData) {
console.log('[Doc2X OCR] Downloading ZIP from:', zipUrl);
// 通过 Worker 代理下载 ZIP避免浏览器直连跨域
let finalUrl = zipUrl;
try {
if (this.workerUrl) {
const base = this.workerUrl.replace(/\/+$/, '');
finalUrl = `${base}/doc2x/zip?url=${encodeURIComponent(zipUrl)}`;
}
} catch {}
const headers = {};
if (this.authKey) headers['X-Auth-Key'] = this.authKey;
// 尝试分片下载(解决大文件传输问题)
let zipBlob;
try {
zipBlob = await this.downloadWithChunks(finalUrl, headers);
} catch (chunkError) {
console.warn('[Doc2X OCR] Chunked download failed, trying direct download:', chunkError.message);
// 回退到直接下载
const zipResponse = await fetch(finalUrl, { headers });
if (!zipResponse.ok) {
throw new Error('下载导出 ZIP 失败');
}
zipBlob = await zipResponse.blob();
}
console.log(`[Doc2X OCR] Downloaded ZIP size: ${(zipBlob.size / 1024 / 1024).toFixed(2)} MB`);
// 解压(使用 JSZip
if (typeof JSZip === 'undefined') {
throw new Error('JSZip 未加载');
}
const zip = await JSZip.loadAsync(zipBlob);
// 提取 Markdown 文件(可能是 output.md 或其他名称)
const mdFiles = zip.file(/\.md$/);
if (mdFiles.length === 0) {
throw new Error('ZIP 中未找到 Markdown 文件');
}
const markdown = await mdFiles[0].async('string');
// 提取图片
const images = [];
const imageFiles = zip.file(/\.(png|jpg|jpeg|gif|webp)$/i);
for (const file of imageFiles) {
const blob = await file.async('blob');
const base64 = await this.blobToBase64(blob);
const imageName = file.name.split('/').pop(); // 获取文件名
images.push({
id: imageName,
name: imageName,
data: this._ensureImageDataUri(base64, imageName)
});
}
// 规范图片命名并替换 Markdown/HTML 中的图片路径为 images/<新名>
let processedMarkdown = markdown;
try {
const extRegex = /\.([a-z0-9]+)$/i;
const renameMap = new Map();
images.forEach((img, idx) => {
const m = String(img.name || img.id || '').match(extRegex);
const ext = (m && m[1]) ? m[1].toLowerCase() : 'jpg';
const newName = `img-${String(idx+1).padStart(3,'0')}.${ext}`;
if ((img.name && img.name !== newName) || (img.id && img.id !== newName)) {
renameMap.set(img.name || img.id, newName);
}
});
// 替换 Markdown 与 HTML尽量避免复杂正则
const simpleEscape = (s) => String(s).replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
const replaceRefs = (text, oldName, newName) => {
let t = text;
const mdVariants = [
`](images/${oldName})`,
`](./${oldName})`,
`](${oldName})`
];
mdVariants.forEach(v => { t = t.split(v).join(`](images/${newName})`); });
try {
const reMdAny = new RegExp(`\"?\]\([^\)\\]*${simpleEscape(oldName)}\)`, 'g');
t = t.replace(reMdAny, m => m.replace(new RegExp(simpleEscape(oldName), 'g'), newName).replace(/\]\(/, '](images/'));
} catch(_) {}
try {
const reSrc = new RegExp(`src=["']([^"']*/)?${simpleEscape(oldName)}["']`, 'gi');
t = t.replace(reSrc, (m) => `src="images/${newName}"`);
} catch(_) {}
return t;
};
renameMap.forEach((newName, oldName) => {
processedMarkdown = replaceRefs(processedMarkdown, oldName, newName);
});
// 更新 images 列表
images.forEach((img, idx) => {
const m = String(img.name || img.id || '').match(extRegex);
const ext = (m && m[1]) ? m[1].toLowerCase() : 'jpg';
const newName = `img-${String(idx+1).padStart(3,'0')}.${ext}`;
img.id = newName;
img.name = newName;
img.data = this._ensureImageDataUri(img.data, newName);
});
} catch (e) { console.warn('[Doc2X OCR] 顺序命名替换失败(忽略):', e); }
return {
markdown: processedMarkdown,
images,
metadata: {
engine: 'doc2x',
source: 'export', // 来自导出(带图片)
pageCount: statusData.page_count || 0
}
};
}
/**
* 使用 CDN 图床版本(导出失败时的降级方案)
* @param {Object} statusData - 状态端点返回的数据
* @returns {Promise<Object>} { markdown, images, metadata }
*/
async useCdnImages(statusData) {
const markdown = statusData.markdown || '';
// 提取 Markdown 中的 CDN 图片 URL
const cdnImages = [];
const imageRegex = /!\[([^\]]*)\]\((https?:\/\/[^\)]+)\)/g;
let match;
let imageIndex = 0;
while ((match = imageRegex.exec(markdown)) !== null) {
const altText = match[1];
const cdnUrl = match[2];
imageIndex++;
const imageId = `cdn_image_${imageIndex}.png`;
try {
// 下载 CDN 图片并转为 base64
const response = await fetch(cdnUrl);
if (response.ok) {
const blob = await response.blob();
const base64 = await this.blobToBase64(blob);
cdnImages.push({
id: imageId,
data: base64,
originalUrl: cdnUrl
});
}
} catch (error) {
console.warn(`[Doc2X OCR] Failed to download CDN image: ${cdnUrl}`, error);
}
}
// 替换 Markdown 中的 CDN URL 为 images/ 路径
let processedMarkdown = markdown;
imageIndex = 0;
processedMarkdown = processedMarkdown.replace(imageRegex, (match, altText, url) => {
imageIndex++;
const imageId = `cdn_image_${imageIndex}.png`;
return `![${altText}](images/${imageId})`;
});
return {
markdown: processedMarkdown,
images: cdnImages,
metadata: {
engine: 'doc2x',
source: 'cdn', // 来自 CDN 图床
pageCount: statusData.page_count || 0,
note: 'Using CDN images as export failed after retries'
}
};
}
/**
* 转义正则表达式特殊字符
* @param {string} str
* @returns {string}
*/
escapeRegex(str) {
return str.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
}
_ensureImageDataUri(data, name) {
try {
if (!data) return '';
if (typeof data === 'string' && data.startsWith('data:')) {
if (/^data:application\/octet-stream;base64,/i.test(data)) {
const mime = this._guessMimeByName(name);
return data.replace(/^data:application\/octet-stream/i, `data:${mime}`);
}
return data;
}
const mime = this._guessMimeByName(name);
return `data:${mime};base64,${data}`;
} catch { return data; }
}
_guessMimeByName(name) {
const ext = String(name || '').split('.').pop().toLowerCase();
if (ext === 'png') return 'image/png';
if (ext === 'gif') return 'image/gif';
if (ext === 'webp') return 'image/webp';
if (ext === 'bmp') return 'image/bmp';
if (ext === 'svg') return 'image/svg+xml';
return 'image/jpeg';
}
}
// 导出到全局
if (typeof window !== 'undefined') {
window.Doc2XOcrAdapter = Doc2XOcrAdapter;
}
// 模块化导出
if (typeof module !== 'undefined' && module.exports) {
module.exports = Doc2XOcrAdapter;
}