paper-burner/workers/pb-ocr-proxy/src/index.js

761 lines
24 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/**
* Cloudflare Worker - OCR API Proxy
* Supports two OCR services: MinerU and Doc2X
*/
// Base URL of the MinerU API; the MinerU handlers below append their endpoint paths to it.
const MINERU_BASE_URL = 'https://mineru.net/api/v4';
// Base URL of the Doc2X API; the Doc2X handlers below append their endpoint paths to it.
const DOC2X_BASE_URL = 'https://v2.doc2x.noedgeai.com';
export default {
  /**
   * Worker entry point.
   *
   * Answers CORS preflight requests, authenticates every other request, and
   * then dispatches on path + method to the matching MinerU / Doc2X handler.
   *
   * @param {Request} request - Incoming HTTP request.
   * @param {Object} env - Worker environment (configuration and secrets).
   * @param {Object} ctx - Execution context (not used by this handler).
   * @returns {Promise<Response>} The handler's response, or a JSON error
   *   response with status 401 (authentication), 404 (no route) or 500
   *   (a handler threw).
   */
  async fetch(request, env, ctx) {
    const { pathname, searchParams } = new URL(request.url);
    const { method } = request;

    // Preflight requests are answered before authentication runs.
    if (method === 'OPTIONS') {
      return handleCORS(request, env);
    }

    const auth = await authenticate(request, env);
    if (!auth.success) {
      return jsonResponse({ error: auth.error }, 401, request, env);
    }

    const isGet = method === 'GET';
    const isPost = method === 'POST';
    const isGetOrHead = isGet || method === 'HEAD';
    // Path remainder that follows the given route prefix (the id segment).
    const afterPrefix = (prefix) => pathname.split(prefix)[1];

    try {
      // ----- MinerU routes -----
      if (isPost && pathname === '/mineru/upload') {
        return await handleMinerUUpload(request, env);
      }
      if (isGet && pathname.startsWith('/mineru/result/')) {
        return await handleMinerUGetResult(afterPrefix('/mineru/result/'), env, request);
      }

      // ----- Doc2X routes -----
      if (isPost && pathname === '/doc2x/upload') {
        return await handleDoc2XUpload(request, env);
      }
      if (isGet && pathname.startsWith('/doc2x/status/')) {
        return await handleDoc2XStatus(afterPrefix('/doc2x/status/'), env, request);
      }
      if (isPost && pathname === '/doc2x/convert') {
        return await handleDoc2XConvert(request, env);
      }
      if (isGet && pathname.startsWith('/doc2x/convert/result/')) {
        return await handleDoc2XConvertResult(afterPrefix('/doc2x/convert/result/'), env, request);
      }

      // ----- ZIP pass-through proxy (works around browser cross-origin limits;
      //       HEAD and Range requests are supported) -----
      if (isGetOrHead && (pathname === '/mineru/zip' || pathname === '/doc2x/zip')) {
        return await handleProxyZip(searchParams.get('url'), request, env);
      }

      // ----- Generic routes -----
      if (isGet && pathname === '/health') {
        return jsonResponse({
          status: 'ok',
          timestamp: Date.now(),
          services: ['mineru', 'doc2x']
        }, 200, request, env);
      }

      return jsonResponse({ error: 'Not Found' }, 404, request, env);
    } catch (error) {
      console.error('Worker error:', error);
      return jsonResponse({ error: error.message || 'Internal Server Error' }, 500, request, env);
    }
  },
};
/**
 * Authenticates an incoming request.
 *
 * Checks are only enforced when `env.ENABLE_AUTH` is the string 'true'.
 * When enforced, each of the following runs only if it is configured:
 *  - `env.ALLOWED_ORIGINS` (comma-separated list): the `Origin` request header
 *    must be present and exactly equal to one of the entries.
 *  - `env.AUTH_SECRET`: the `X-Auth-Key` request header must equal the secret.
 *
 * @param {Request} request - Incoming request; only its headers are read.
 * @param {Object} env - Worker environment (ENABLE_AUTH, ALLOWED_ORIGINS, AUTH_SECRET).
 * @returns {Promise<{success: boolean, error?: string}>} `success: true` when the
 *   request may proceed, otherwise `success: false` with a reason in `error`.
 */
async function authenticate(request, env) {
  // Compares two strings without exiting early on the first mismatch, so the
  // time taken does not reveal how many leading characters of the secret matched.
  const constantTimeEquals = (provided, expected) => {
    if (typeof provided !== 'string' || typeof expected !== 'string') {
      return false;
    }
    let diff = provided.length ^ expected.length;
    const length = Math.max(provided.length, expected.length);
    for (let i = 0; i < length; i += 1) {
      // charCodeAt() past the end yields NaN, which the bitwise operators treat
      // as 0; the length mismatch has already been recorded in `diff`.
      diff |= provided.charCodeAt(i) ^ expected.charCodeAt(i);
    }
    return diff === 0;
  };

  if (env.ENABLE_AUTH !== 'true') {
    return { success: true };
  }

  if (env.ALLOWED_ORIGINS) {
    const origin = request.headers.get('Origin');
    const allowedOrigins = env.ALLOWED_ORIGINS.split(',').map((o) => o.trim());
    if (!origin || !allowedOrigins.includes(origin)) {
      return { success: false, error: 'Origin not allowed' };
    }
  }

  if (env.AUTH_SECRET) {
    const authKey = request.headers.get('X-Auth-Key');
    if (!constantTimeEquals(authKey, env.AUTH_SECRET)) {
      return { success: false, error: 'Invalid authentication key' };
    }
  }

  return { success: true };
}
// ==================== MinerU handlers ====================
/**
 * Handles `POST /mineru/upload`.
 *
 * Reads a multipart form (`file` plus optional `is_ocr`, `enable_formula`,
 * `enable_table`, `language`, `data_id`, `page_ranges`), asks MinerU for an
 * upload URL, PUTs the file bytes to that URL and returns the batch id.
 *
 * @param {Request} request - Incoming multipart/form-data request.
 * @param {Object} env - Worker environment (used for token lookup and CORS).
 * @returns {Promise<Response>} JSON response: 200 with
 *   `{ success, batch_id, file_name, service }`, 400 for a bad request,
 *   401 when no MinerU token is available, 500 when any upstream step fails.
 */
async function handleMinerUUpload(request, env) {
  try {
    const contentType = request.headers.get('Content-Type') || '';
    if (!contentType.includes('multipart/form-data')) {
      return jsonResponse({ error: 'Content-Type must be multipart/form-data' }, 400, request, env);
    }

    const formData = await request.formData();
    const file = formData.get('file');
    if (!file) {
      return jsonResponse({ error: 'No file provided' }, 400, request, env);
    }
    // FormData.get() returns a string for plain text fields. Such a value has
    // no name/arrayBuffer(), so reject it here as a client error instead of
    // failing later with a TypeError (which surfaced as a 500).
    if (typeof file === 'string') {
      return jsonResponse({ error: 'The "file" field must be a file upload' }, 400, request, env);
    }

    // Uniform boolean parsing for form fields; anything unrecognised falls
    // back to the default (true unless stated otherwise).
    const parseBoolean = (value, defaultValue = true) => {
      if (value === null || value === undefined || value === '') return defaultValue;
      if (value === 'true' || value === '1' || value === true) return true;
      if (value === 'false' || value === '0' || value === false) return false;
      return defaultValue;
    };

    const isOcr = parseBoolean(formData.get('is_ocr'), true);
    const enableFormula = parseBoolean(formData.get('enable_formula'), true);
    const enableTable = parseBoolean(formData.get('enable_table'), true);
    const language = formData.get('language') || 'ch';
    const dataId = formData.get('data_id');
    const pageRanges = formData.get('page_ranges');

    console.log('[MinerU Upload] Parameters:', {
      fileName: file.name,
      is_ocr: isOcr,
      enable_formula: enableFormula,
      enable_table: enableTable,
      language,
    });

    // Token lookup: request header first, then environment variable.
    const token = getToken(request, env, 'MINERU');
    if (!token) {
      return jsonResponse({
        error: 'MinerU API Token required. Provide via X-MinerU-Key header or MINERU_API_TOKEN env variable'
      }, 401, request, env);
    }

    // Step 1: request an upload URL for this file.
    const uploadUrlResponse = await fetch(`${MINERU_BASE_URL}/file-urls/batch`, {
      method: 'POST',
      headers: {
        'Authorization': `Bearer ${token}`,
        'Content-Type': 'application/json',
      },
      body: JSON.stringify({
        enable_formula: enableFormula,
        enable_table: enableTable,
        language: language,
        files: [{
          name: file.name,
          is_ocr: isOcr,
          // Optional fields are only sent when the client supplied them.
          ...(dataId && { data_id: dataId }),
          ...(pageRanges && { page_ranges: pageRanges }),
        }],
      }),
    });
    if (!uploadUrlResponse.ok) {
      const errorText = await uploadUrlResponse.text();
      throw new Error(`MinerU申请上传链接失败: ${errorText}`);
    }

    const uploadData = await uploadUrlResponse.json();
    if (uploadData.code !== 0) {
      throw new Error(`MinerU返回错误: ${uploadData.msg}`);
    }

    // Guard against a success code without the expected payload, which would
    // otherwise turn into an opaque error from fetch(undefined).
    const uploadInfo = uploadData.data;
    const ossUploadUrl = uploadInfo && Array.isArray(uploadInfo.file_urls)
      ? uploadInfo.file_urls[0]
      : undefined;
    if (!ossUploadUrl) {
      throw new Error('MinerU response did not include an upload URL');
    }

    // Step 2: upload the file bytes to the returned URL.
    // NOTE(review): the body is sent as a raw ArrayBuffer; presumably this keeps
    // extra headers such as Content-Type off the upload request — confirm
    // before switching to a Blob/stream body.
    const fileArrayBuffer = await file.arrayBuffer();
    const ossUploadResponse = await fetch(ossUploadUrl, {
      method: 'PUT',
      body: fileArrayBuffer,
    });
    if (!ossUploadResponse.ok) {
      throw new Error(`OSS上传失败: ${ossUploadResponse.status} ${ossUploadResponse.statusText}`);
    }

    return jsonResponse({
      success: true,
      batch_id: uploadInfo.batch_id,
      file_name: file.name,
      service: 'mineru'
    }, 200, request, env);
  } catch (error) {
    console.error('MinerU upload error:', error);
    return jsonResponse({ error: error.message || 'Upload failed' }, 500, request, env);
  }
}
/**
 * Handles `GET /mineru/result/:batchId`.
 *
 * The special id `__health__` is an internal health check used by the
 * front-end "test connection" button: it validates the token by sending a
 * small request to MinerU's `file-urls/batch` endpoint. Any other id is looked
 * up via MinerU's `extract-results/batch/<id>` endpoint and the `data` object
 * of the reply is merged into the response.
 *
 * @param {string} batchId - Path segment after `/mineru/result/`.
 * @param {Object} env - Worker environment (used for token lookup and CORS).
 * @param {Request} request - Incoming request.
 * @returns {Promise<Response>} JSON response: 200 on success, 400 without an
 *   id, 401 without a token or when the health probe fails, 500 when the
 *   result query fails.
 */
async function handleMinerUGetResult(batchId, env, request) {
  // Builds the 401 payload shared by every failed health probe outcome.
  const probeFailure = (error, message) => jsonResponse({
    success: false,
    service: 'mineru',
    error,
    message,
    timestamp: Date.now()
  }, 401, request, env);

  try {
    if (!batchId) {
      return jsonResponse({ error: 'batch_id is required' }, 400, request, env);
    }

    // Both the health probe and the real lookup need a token.
    const token = getToken(request, env, 'MINERU');
    if (!token) {
      return jsonResponse({
        error: 'MinerU API Token required. Provide via X-MinerU-Key header or MINERU_API_TOKEN env variable'
      }, 401, request, env);
    }
    const authorization = `Bearer ${token}`;

    if (batchId === '__health__') {
      // Validate the token by sending a simple request to the MinerU API.
      try {
        const probe = await fetch(`${MINERU_BASE_URL}/file-urls/batch`, {
          method: 'POST',
          headers: {
            'Authorization': authorization,
            'Content-Type': 'application/json',
          },
          body: JSON.stringify({
            files: [{
              name: 'health_check.pdf',
              is_ocr: true,
              enable_formula: false,
              enable_table: false,
              language: 'ch'
            }]
          })
        });
        if (!probe.ok) {
          return probeFailure('Invalid Token', `Token validation failed: ${probe.statusText}`);
        }

        const probeData = await probe.json();
        if (probeData.code !== 0) {
          return probeFailure('Invalid Token', `Token validation failed: ${probeData.msg || 'Unknown error'}`);
        }

        return jsonResponse({ success: true, service: 'mineru', health: true, timestamp: Date.now() }, 200, request, env);
      } catch (probeError) {
        return probeFailure('Token validation error', probeError.message);
      }
    }

    const upstream = await fetch(`${MINERU_BASE_URL}/extract-results/batch/${batchId}`, {
      method: 'GET',
      headers: {
        'Authorization': authorization,
        'Accept': 'application/json',
      },
    });
    if (!upstream.ok) {
      throw new Error(`MinerU查询失败: ${upstream.statusText}`);
    }

    const payload = await upstream.json();
    if (payload.code !== 0) {
      throw new Error(`MinerU返回错误: ${payload.msg}`);
    }

    return jsonResponse({
      success: true,
      service: 'mineru',
      ...payload.data,
    }, 200, request, env);
  } catch (error) {
    console.error('MinerU get result error:', error);
    return jsonResponse({ error: error.message || 'Failed to get result' }, 500, request, env);
  }
}
// ==================== Doc2X handlers ====================
/**
 * Handles `POST /doc2x/upload`.
 *
 * Reads the `file` field of a multipart form, asks Doc2X for a pre-upload URL,
 * PUTs the file bytes to that URL and returns the task uid.
 *
 * @param {Request} request - Incoming multipart/form-data request.
 * @param {Object} env - Worker environment (used for token lookup and CORS).
 * @returns {Promise<Response>} JSON response: 200 with
 *   `{ success, uid, file_name, service }`, 400 for a bad request, 401 when no
 *   Doc2X token is available, 500 when any upstream step fails.
 */
async function handleDoc2XUpload(request, env) {
  try {
    const contentType = request.headers.get('Content-Type') || '';
    if (!contentType.includes('multipart/form-data')) {
      return jsonResponse({ error: 'Content-Type must be multipart/form-data' }, 400, request, env);
    }

    const formData = await request.formData();
    const file = formData.get('file');
    if (!file) {
      return jsonResponse({ error: 'No file provided' }, 400, request, env);
    }
    // FormData.get() returns a string for plain text fields. Such a value has
    // no name/arrayBuffer(), so reject it here as a client error instead of
    // failing later with a TypeError (which surfaced as a 500).
    if (typeof file === 'string') {
      return jsonResponse({ error: 'The "file" field must be a file upload' }, 400, request, env);
    }

    // Token lookup: request header first, then environment variable.
    const token = getToken(request, env, 'DOC2X');
    if (!token) {
      return jsonResponse({
        error: 'Doc2X API Token required. Provide via X-Doc2X-Key header or DOC2X_API_TOKEN env variable'
      }, 401, request, env);
    }

    // Step 1: request a pre-upload URL.
    const preuploadResponse = await fetch(`${DOC2X_BASE_URL}/api/v2/parse/preupload`, {
      method: 'POST',
      headers: {
        'Authorization': `Bearer ${token}`,
      },
    });
    if (!preuploadResponse.ok) {
      const errorText = await preuploadResponse.text();
      throw new Error(`Doc2X预上传失败: ${errorText}`);
    }

    const preuploadData = await preuploadResponse.json();
    if (preuploadData.code !== 'success') {
      throw new Error(`Doc2X返回错误: ${preuploadData.msg || 'Unknown error'}`);
    }

    // Guard against a success code without the expected payload; destructuring
    // a missing `data` object would otherwise raise an opaque TypeError.
    const preupload = preuploadData.data;
    if (!preupload || !preupload.uid || !preupload.url) {
      throw new Error('Doc2X preupload response did not include uid and url');
    }
    const { uid, url: uploadUrl } = preupload;

    // Step 2: upload the file bytes to the returned URL.
    // NOTE(review): the body is sent as a raw ArrayBuffer; presumably this keeps
    // extra headers such as Content-Type off the upload request — confirm
    // before switching to a Blob/stream body.
    const fileArrayBuffer = await file.arrayBuffer();
    const ossUploadResponse = await fetch(uploadUrl, {
      method: 'PUT',
      body: fileArrayBuffer,
    });
    if (!ossUploadResponse.ok) {
      throw new Error(`Doc2X OSS上传失败: ${ossUploadResponse.status} ${ossUploadResponse.statusText}`);
    }

    return jsonResponse({
      success: true,
      uid: uid,
      file_name: file.name,
      service: 'doc2x'
    }, 200, request, env);
  } catch (error) {
    console.error('Doc2X upload error:', error);
    return jsonResponse({ error: error.message || 'Upload failed' }, 500, request, env);
  }
}
/**
 * Handles `GET /doc2x/status/:uid`.
 *
 * The special uid `__health__` is an internal health check used by the
 * front-end "test connection" button: it validates the token by calling
 * Doc2X's pre-upload endpoint. Any other uid is forwarded to Doc2X's
 * `parse/status` endpoint. An upstream error code is reported with HTTP 200
 * and `success: false` so the client can read `error` / `message`.
 *
 * @param {string} uid - Path segment after `/doc2x/status/` (still percent-encoded).
 * @param {Object} env - Worker environment (used for token lookup and CORS).
 * @param {Request} request - Incoming request.
 * @returns {Promise<Response>} JSON response: 200 with the status payload (or
 *   an upstream error code), 400 for a missing/malformed uid, 401 without a
 *   token or when the health probe fails, 500 when the upstream call fails.
 */
async function handleDoc2XStatus(uid, env, request) {
  // Builds the 401 payload shared by every failed health probe outcome.
  const probeFailure = (error, message) => jsonResponse({
    success: false,
    service: 'doc2x',
    error,
    message,
    timestamp: Date.now()
  }, 401, request, env);

  try {
    if (!uid) {
      return jsonResponse({ error: 'uid is required' }, 400, request, env);
    }

    // Both the health probe and the real lookup need a token.
    const token = getToken(request, env, 'DOC2X');
    if (!token) {
      return jsonResponse({
        error: 'Doc2X API Token required. Provide via X-Doc2X-Key header or DOC2X_API_TOKEN env variable'
      }, 401, request, env);
    }

    if (uid === '__health__') {
      // Validate the token by sending a simple request to the Doc2X API.
      try {
        const probe = await fetch(`${DOC2X_BASE_URL}/api/v2/parse/preupload`, {
          method: 'POST',
          headers: {
            'Authorization': `Bearer ${token}`
          }
        });
        if (!probe.ok) {
          return probeFailure('Invalid Token', `Token validation failed: ${probe.statusText}`);
        }

        const probeData = await probe.json();
        if (probeData.code !== 'success') {
          return probeFailure('Invalid Token', `Token validation failed: ${probeData.msg || 'Unknown error'}`);
        }

        return jsonResponse({ success: true, service: 'doc2x', health: true, timestamp: Date.now() }, 200, request, env);
      } catch (probeError) {
        return probeFailure('Token validation error', probeError.message);
      }
    }

    // The uid comes straight from the request path. Decode it and let
    // URLSearchParams re-encode it, so characters such as `&` or `=` cannot
    // smuggle additional query parameters into the upstream request.
    let decodedUid;
    try {
      decodedUid = decodeURIComponent(uid);
    } catch (decodeError) {
      return jsonResponse({ error: 'uid is not valid percent-encoding' }, 400, request, env);
    }
    const statusUrl = new URL(`${DOC2X_BASE_URL}/api/v2/parse/status`);
    statusUrl.searchParams.set('uid', decodedUid);

    const statusResponse = await fetch(statusUrl.toString(), {
      method: 'GET',
      headers: {
        'Authorization': `Bearer ${token}`,
      },
    });
    if (!statusResponse.ok) {
      throw new Error(`Doc2X查询失败: ${statusResponse.statusText}`);
    }

    const statusData = await statusResponse.json();
    if (statusData.code !== 'success') {
      // Pass the upstream error code through to the client.
      return jsonResponse({
        success: false,
        service: 'doc2x',
        error: statusData.code,
        message: statusData.msg || 'Unknown error'
      }, 200, request, env);
    }

    return jsonResponse({
      success: true,
      service: 'doc2x',
      ...statusData.data,
    }, 200, request, env);
  } catch (error) {
    console.error('Doc2X status error:', error);
    return jsonResponse({ error: error.message || 'Failed to get status' }, 500, request, env);
  }
}
/**
 * Handles `POST /doc2x/convert`.
 *
 * Expects a JSON object body with `uid` (required) and optional `to`
 * (default 'md'), `formula_mode` (default 'normal'), `filename` and
 * `merge_cross_page_forms` (default false), and forwards them to Doc2X's
 * `convert/parse` endpoint. An upstream error code is reported with HTTP 200
 * and `success: false` so the client can read `error` / `message`.
 *
 * @param {Request} request - Incoming request with a JSON body.
 * @param {Object} env - Worker environment (used for token lookup and CORS).
 * @returns {Promise<Response>} JSON response: 200 with the conversion payload
 *   (or an upstream error code), 400 for an invalid body or missing uid,
 *   401 without a token, 500 when the upstream call fails.
 */
async function handleDoc2XConvert(request, env) {
  try {
    // A malformed or non-object body is a client error, not a server failure.
    let body;
    try {
      body = await request.json();
    } catch (parseError) {
      return jsonResponse({ error: 'Request body must be valid JSON' }, 400, request, env);
    }
    if (!body || typeof body !== 'object') {
      return jsonResponse({ error: 'Request body must be a JSON object' }, 400, request, env);
    }

    const { uid, to = 'md', formula_mode = 'normal', filename, merge_cross_page_forms = false } = body;
    if (!uid) {
      return jsonResponse({ error: 'uid is required' }, 400, request, env);
    }

    const token = getToken(request, env, 'DOC2X');
    if (!token) {
      return jsonResponse({
        error: 'Doc2X API Token required. Provide via X-Doc2X-Key header or DOC2X_API_TOKEN env variable'
      }, 401, request, env);
    }

    const convertResponse = await fetch(`${DOC2X_BASE_URL}/api/v2/convert/parse`, {
      method: 'POST',
      headers: {
        'Authorization': `Bearer ${token}`,
        'Content-Type': 'application/json',
      },
      body: JSON.stringify({
        uid,
        to,
        formula_mode,
        // `filename` is only sent when the client supplied one.
        ...(filename && { filename }),
        merge_cross_page_forms,
      }),
    });
    if (!convertResponse.ok) {
      const errorText = await convertResponse.text();
      throw new Error(`Doc2X转换失败: ${errorText}`);
    }

    const convertData = await convertResponse.json();
    if (convertData.code !== 'success') {
      // Pass the upstream error code through to the client.
      return jsonResponse({
        success: false,
        service: 'doc2x',
        error: convertData.code,
        message: convertData.msg || 'Convert failed'
      }, 200, request, env);
    }

    return jsonResponse({
      success: true,
      service: 'doc2x',
      ...convertData.data,
    }, 200, request, env);
  } catch (error) {
    console.error('Doc2X convert error:', error);
    return jsonResponse({ error: error.message || 'Convert failed' }, 500, request, env);
  }
}
/**
 * Handles `GET /doc2x/convert/result/:uid`.
 *
 * Forwards the uid to Doc2X's `convert/parse/result` endpoint and merges the
 * `data` object of the reply into the response. An upstream error code is
 * reported with HTTP 200 and `success: false` so the client can read
 * `error` / `message`.
 *
 * @param {string} uid - Path segment after `/doc2x/convert/result/` (still percent-encoded).
 * @param {Object} env - Worker environment (used for token lookup and CORS).
 * @param {Request} request - Incoming request.
 * @returns {Promise<Response>} JSON response: 200 with the result payload (or
 *   an upstream error code), 400 for a missing/malformed uid, 401 without a
 *   token, 500 when the upstream call fails.
 */
async function handleDoc2XConvertResult(uid, env, request) {
  try {
    if (!uid) {
      return jsonResponse({ error: 'uid is required' }, 400, request, env);
    }

    const token = getToken(request, env, 'DOC2X');
    if (!token) {
      return jsonResponse({
        error: 'Doc2X API Token required. Provide via X-Doc2X-Key header or DOC2X_API_TOKEN env variable'
      }, 401, request, env);
    }

    // The uid comes straight from the request path. Decode it and let
    // URLSearchParams re-encode it, so characters such as `&` or `=` cannot
    // smuggle additional query parameters into the upstream request.
    let decodedUid;
    try {
      decodedUid = decodeURIComponent(uid);
    } catch (decodeError) {
      return jsonResponse({ error: 'uid is not valid percent-encoding' }, 400, request, env);
    }
    const resultUrl = new URL(`${DOC2X_BASE_URL}/api/v2/convert/parse/result`);
    resultUrl.searchParams.set('uid', decodedUid);

    const resultResponse = await fetch(resultUrl.toString(), {
      method: 'GET',
      headers: {
        'Authorization': `Bearer ${token}`,
      },
    });
    if (!resultResponse.ok) {
      throw new Error(`Doc2X查询转换结果失败: ${resultResponse.statusText}`);
    }

    const resultData = await resultResponse.json();
    if (resultData.code !== 'success') {
      // Pass the upstream error code through to the client.
      return jsonResponse({
        success: false,
        service: 'doc2x',
        error: resultData.code,
        message: resultData.msg || 'Failed to get convert result'
      }, 200, request, env);
    }

    return jsonResponse({
      success: true,
      service: 'doc2x',
      ...resultData.data,
    }, 200, request, env);
  } catch (error) {
    console.error('Doc2X convert result error:', error);
    return jsonResponse({ error: error.message || 'Failed to get convert result' }, 500, request, env);
  }
}
// ==================== Utility functions ====================
/**
 * Resolves the upstream API token for a service.
 * Priority: request header > environment variable.
 *
 * The header is `X-MinerU-Key` for 'MINERU' and `X-Doc2X-Key` otherwise; the
 * environment variable is `MINERU_API_TOKEN` / `DOC2X_API_TOKEN` respectively.
 * A leading "Bearer " prefix (any case) and surrounding whitespace are
 * stripped, because callers add their own "Bearer " prefix.
 *
 * Only a masked preview of the token is ever logged.
 *
 * @param {Request} request - Request object; only its headers are read.
 * @param {Object} env - Environment variables.
 * @param {string} service - Service name ('MINERU' or 'DOC2X').
 * @returns {string|null} The cleaned token, or null when none is available.
 */
function getToken(request, env, service) {
  const isMinerU = service === 'MINERU';
  const headerKey = isMinerU ? 'X-MinerU-Key' : 'X-Doc2X-Key';
  const envKey = isMinerU ? 'MINERU_API_TOKEN' : 'DOC2X_API_TOKEN';

  // Log-safe preview: the first/last 6 characters are shown only when that
  // reveals at most half of the value; shorter tokens are fully redacted
  // (previously tokens of 12 characters or fewer were logged verbatim).
  const previewOf = (value) => (value.length >= 24
    ? `${value.substring(0, 6)}...${value.substring(value.length - 6)}`
    : `[redacted, ${value.length} chars]`);

  // 1. Prefer the request header; 2. fall back to the environment variable.
  const headerToken = request.headers.get(headerKey);
  const rawToken = headerToken || env[envKey];
  if (!rawToken) {
    console.warn(`[${service}] Token: No token found in header or environment`);
    return null;
  }

  if (headerToken) {
    console.log(`[${service}] Token: Using from request header (${headerKey}) (${previewOf(rawToken)})`);
  } else {
    console.log(`[${service}] Token: Using from environment variable (${previewOf(rawToken)})`);
  }

  // 3. Strip a "Bearer " prefix that may have been included by mistake.
  const token = rawToken.replace(/^Bearer\s+/i, '').trim();
  if (token !== rawToken) {
    console.log(`[${service}] Token: Cleaned Bearer prefix`);
  }

  if (!token) {
    // Nothing usable was left after cleaning.
    console.warn(`[${service}] Token: No token found in header or environment`);
    return null;
  }
  return token;
}
/**
 * Answers a CORS preflight (OPTIONS) request.
 *
 * When `env.ALLOWED_ORIGINS` (comma-separated) is configured, the request's
 * `Origin` must be one of its entries, otherwise an empty 403 is returned.
 * Without an allow-list the request origin is echoed back ('*' when the
 * request carries no `Origin` header).
 *
 * @param {Request} request - Preflight request; only its headers are read.
 * @param {Object} env - Worker environment (ALLOWED_ORIGINS).
 * @returns {Response} Empty response carrying the CORS headers, or an empty 403.
 */
function handleCORS(request, env) {
  const origin = request.headers.get('Origin') || '*';

  if (env.ALLOWED_ORIGINS) {
    const allowedOrigins = env.ALLOWED_ORIGINS.split(',').map((o) => o.trim());
    if (!allowedOrigins.includes(origin)) {
      return new Response(null, { status: 403 });
    }
  }

  return new Response(null, {
    headers: {
      'Access-Control-Allow-Origin': origin,
      'Access-Control-Allow-Methods': 'GET, HEAD, POST, OPTIONS',
      'Access-Control-Allow-Headers': 'Content-Type, Range, X-Auth-Key, X-MinerU-Key, X-Doc2X-Key',
      'Access-Control-Expose-Headers': 'Content-Length, Content-Range, Accept-Ranges',
      'Access-Control-Max-Age': '86400',
      // The allowed origin is derived from the request's Origin header, so
      // caches must key on it to avoid serving one origin's answer to another.
      'Vary': 'Origin',
    },
  });
}
/**
 * Builds a JSON response with CORS headers.
 *
 * `Access-Control-Allow-Origin` is chosen as follows:
 *  - no `env.ALLOWED_ORIGINS`: the request origin is echoed ('*' when the
 *    request has no `Origin` header);
 *  - `env.ALLOWED_ORIGINS` configured (comma-separated): the request origin is
 *    echoed only when it is listed; a literal '*' entry allows every origin;
 *    otherwise the header is omitted so browsers on unlisted origins cannot
 *    read the response. (The previous fallback to '*' made the allow-list
 *    ineffective for requests that need no preflight.)
 *
 * @param {*} data - Value serialised with JSON.stringify as the body.
 * @param {number} status - HTTP status code.
 * @param {Request} request - Request being answered; only its headers are read.
 * @param {Object} env - Worker environment (ALLOWED_ORIGINS).
 * @returns {Response} The JSON response.
 */
function jsonResponse(data, status, request, env) {
  const origin = request.headers.get('Origin') || '*';

  let allowedOrigin = origin;
  if (env.ALLOWED_ORIGINS) {
    const allowedOrigins = env.ALLOWED_ORIGINS.split(',').map((o) => o.trim());
    if (allowedOrigins.includes(origin)) {
      allowedOrigin = origin;
    } else if (allowedOrigins.includes('*')) {
      allowedOrigin = '*';
    } else {
      allowedOrigin = null;
    }
  }

  const headers = {
    'Content-Type': 'application/json',
    'Access-Control-Allow-Methods': 'GET, POST, OPTIONS',
    'Access-Control-Allow-Headers': 'Content-Type, X-Auth-Key, X-MinerU-Key, X-Doc2X-Key',
    // The CORS answer depends on the request's Origin header, so caches must
    // key on it.
    'Vary': 'Origin',
  };
  if (allowedOrigin) {
    headers['Access-Control-Allow-Origin'] = allowedOrigin;
  }

  return new Response(JSON.stringify(data), { status, headers });
}
/**
 * Downloads a remote ZIP on behalf of the browser and returns it with CORS
 * headers attached. The `Range` request header is forwarded upstream and the
 * upstream status (200 or 206) plus `Content-Length`, `Content-Range` and
 * `Accept-Ranges` are passed back, so chunked downloads work.
 *
 * `Access-Control-Allow-Origin` follows the allow-list: without
 * `env.ALLOWED_ORIGINS` the request origin is echoed ('*' when absent); with
 * it, the origin is echoed only when listed (a literal '*' entry allows every
 * origin), otherwise the header is omitted.
 *
 * NOTE(review): any absolute http(s) URL is accepted, so this endpoint can be
 * used as a general-purpose fetch proxy. Consider restricting the target to
 * the hosts MinerU / Doc2X actually serve results from once those are known.
 *
 * @param {string|null} zipUrl - Absolute URL of the remote file (`url` query parameter).
 * @param {Request} request - Incoming GET or HEAD request.
 * @param {Object} env - Worker environment (ALLOWED_ORIGINS).
 * @returns {Promise<Response>} The proxied body, or a JSON error: 400 for a
 *   missing/invalid url, 502 when the upstream responds with an error status,
 *   500 when the fetch itself fails.
 */
async function handleProxyZip(zipUrl, request, env) {
  try {
    if (!zipUrl) {
      return jsonResponse({ error: 'url is required' }, 400, request, env);
    }

    // Reject malformed, relative or non-HTTP targets up front as a client
    // error instead of letting fetch() fail with a 500.
    let target;
    try {
      target = new URL(zipUrl);
    } catch (parseError) {
      return jsonResponse({ error: 'url must be an absolute http(s) URL' }, 400, request, env);
    }
    if (target.protocol !== 'http:' && target.protocol !== 'https:') {
      return jsonResponse({ error: 'url must be an absolute http(s) URL' }, 400, request, env);
    }

    // Forward the Range header upstream (supports chunked downloads).
    const upstreamHeaders = {};
    const rangeHeader = request.headers.get('Range');
    if (rangeHeader) {
      upstreamHeaders['Range'] = rangeHeader;
      console.log('[Worker] Proxying Range request:', rangeHeader);
    }

    // The caller's original string is passed through unchanged.
    const resp = await fetch(zipUrl, {
      method: request.method || 'GET',
      headers: upstreamHeaders,
      redirect: 'follow'
    });
    // `ok` covers every 2xx status, including 206 Partial Content.
    if (!resp.ok) {
      return jsonResponse({ error: `Upstream fetch failed: ${resp.status} ${resp.statusText}` }, 502, request, env);
    }

    // Work out which origin (if any) may read the response.
    const origin = request.headers.get('Origin') || '*';
    let allowedOrigin = origin;
    if (env.ALLOWED_ORIGINS) {
      const allowedOrigins = env.ALLOWED_ORIGINS.split(',').map((o) => o.trim());
      if (allowedOrigins.includes(origin)) {
        allowedOrigin = origin;
      } else if (allowedOrigins.includes('*')) {
        allowedOrigin = '*';
      } else {
        // Unlisted origin: send no Access-Control-Allow-Origin at all.
        allowedOrigin = null;
      }
    }

    const responseHeaders = {
      'Content-Type': resp.headers.get('Content-Type') || 'application/zip',
      'Access-Control-Allow-Methods': 'GET, HEAD, POST, OPTIONS',
      'Access-Control-Allow-Headers': 'Content-Type, Range, X-Auth-Key, X-MinerU-Key, X-Doc2X-Key',
      'Access-Control-Expose-Headers': 'Content-Length, Content-Range, Accept-Ranges',
      'Cache-Control': 'no-store',
      // The CORS answer depends on the request's Origin header.
      'Vary': 'Origin',
    };
    if (allowedOrigin) {
      responseHeaders['Access-Control-Allow-Origin'] = allowedOrigin;
    }

    // Pass through the headers a client needs for ranged downloads.
    for (const name of ['Content-Length', 'Content-Range', 'Accept-Ranges']) {
      if (resp.headers.has(name)) {
        responseHeaders[name] = resp.headers.get(name);
      }
    }

    return new Response(resp.body, {
      status: resp.status, // keep the upstream status: 200 or 206
      headers: responseHeaders,
    });
  } catch (e) {
    console.error('Proxy zip error:', e);
    return jsonResponse({ error: e.message || 'Proxy failed' }, 500, request, env);
  }
}