// js/processing/markdown_processor_enhanced.js // Enhanced markdown processor with improved robustness for formulas and complex content (function MarkdownProcessorEnhanced(global) { // Enhanced cache with versioning and size limits const renderCache = new Map(); const MAX_CACHE_SIZE = 1000; const CACHE_VERSION = '2.0'; // Performance metrics tracking const metrics = { cacheHits: 0, cacheMisses: 0, totalRenders: 0, avgRenderTime: 0, formulaErrors: 0, formulaSuccesses: 0 }; const FORMULA_BLOCK_HINTS = [ /\r|\n/, // explicit line breaks /\\\\/, // LaTeX newline command /\\tag\b/, // equation tags /\\label\b/, /\\eqref\b/, /\\display(?:style|limits)\b/, /\\begin\{(?:align\*?|aligned|flalign\*?|gather\*?|multline\*?|split|cases|array|pmatrix|bmatrix|vmatrix|Vmatrix|matrix|smallmatrix)\}/, /\\end\{(?:align\*?|aligned|flalign\*?|gather\*?|multline\*?|split|cases|array|pmatrix|bmatrix|vmatrix|Vmatrix|matrix|smallmatrix)\}/ ]; /** * Enhanced markdown preprocessing with robust formula and image handling * @param {string} md - Input markdown text * @param {Array} images - Image objects with name/id and data * @returns {string} Processed markdown text */ function safeMarkdownEnhanced(md, images) { performance.mark('safeMarkdown-enhanced-start'); if (!md || typeof md !== 'string') { performance.mark('safeMarkdown-enhanced-end'); performance.measure('safeMarkdown-enhanced', 'safeMarkdown-enhanced-start', 'safeMarkdown-enhanced-end'); return ''; } // Build robust image mapping with multiple fallback keys const imgMap = new Map(); if (Array.isArray(images)) { images.forEach((img, idx) => { if (!img || !img.data) return; const keys = new Set(); // Add various possible keys if (img.name) keys.add(img.name); if (img.id) keys.add(img.id); keys.add(`img-${idx}.jpeg.png`); keys.add(`img-${idx + 1}.jpeg.png`); // Add with 'images/' prefix [...keys].forEach(k => keys.add('images/' + k)); const src = img.data.startsWith('data:') ? img.data : `data:image/png;base64,${img.data}`; keys.forEach(k => imgMap.set(k, src)); }); } // Enhanced image replacement with better error handling // 支持多种格式: // - images/page3_img1.png (Local PDF) // - images/img-1.jpeg.png (旧格式) // - page3_img1 (不带扩展名) // - 任意相对路径 md = md.replace(/!\[([^\]]*)\]\(([^)]+)\)/g, (match, alt, path) => { // 跳过外部链接和已经是 base64 的图片 const p = String(path).trim(); if (/^(https?:|data:|\/\/)/i.test(p)) { return match; } // 去除查询参数和锚点 const clean = p.split('?')[0].split('#')[0]; // 尝试多种可能的 key const candidates = [ clean, // 原始路径 clean.replace(/^images\//, ''), // 去掉 images/ 前缀 clean.replace(/\.png$/i, ''), // 去掉 .png 后缀 clean.replace(/^images\//, '').replace(/\.png$/i, ''), // 两者都去掉 'images/' + clean, // 添加 images/ 前缀 clean.split('/').pop(), // 只取文件名 'images/' + clean.split('/').pop() // 文件名 + images/ 前缀 ]; // 尝试所有候选 key for (const key of candidates) { if (imgMap.has(key)) { return `![${alt || ''}](${imgMap.get(key)})`; } } // 未找到图片,输出警告 console.warn(`[MarkdownProcessorEnhanced] Image not found: ${path}, tried:`, candidates.slice(0, 5)); return match; // 保持原样 }); // Enhanced custom syntax processing with better error handling md = processCustomSyntax(md); performance.mark('safeMarkdown-enhanced-end'); performance.measure('safeMarkdown-enhanced', 'safeMarkdown-enhanced-start', 'safeMarkdown-enhanced-end'); return md; } /** * Process custom syntax (subscripts, superscripts) with enhanced robustness * @param {string} md - Markdown text * @returns {string} Processed markdown */ function processCustomSyntax(md) { // Enhanced regex patterns with better boundary detection const patterns = [ // Base with superscript: ${base}^{sup}$ { regex: /\$\{\s*([^}]*?)\s*\}\^\{([^}]*?)\}\$/g, replacement: (_, base, sup) => { const cleanBase = (base || '').trim(); const cleanSup = (sup || '').trim(); return cleanBase ? `${escapeHtml(cleanBase)}${escapeHtml(cleanSup)}` : `${escapeHtml(cleanSup)}`; } }, // Base with subscript: ${base}_{sub}$ { regex: /\$\{\s*([^}]*?)\s*\}_\{([^}]*?)\}\$/g, replacement: (_, base, sub) => { const cleanBase = (base || '').trim(); const cleanSub = (sub || '').trim(); return cleanBase ? `${escapeHtml(cleanBase)}${escapeHtml(cleanSub)}` : `${escapeHtml(cleanSub)}`; } }, // Empty base superscript: ${}^{sup}$ { regex: /\$\{\s*\}\^\{([^}]*?)\}\$/g, replacement: (_, sup) => `${escapeHtml((sup || '').trim())}` }, // Empty base subscript: ${}_{sub}$ { regex: /\$\{\s*\}_\{([^}]*?)\}\$/g, replacement: (_, sub) => `${escapeHtml((sub || '').trim())}` }, // Simple superscript: ${content}$ { regex: /\$\{\s*([^}]*?)\s*\}\$/g, replacement: (_, content) => `${escapeHtml((content || '').trim())}` } ]; patterns.forEach(({ regex, replacement }) => { try { md = md.replace(regex, replacement); } catch (error) { console.warn(`[MarkdownProcessorEnhanced] Custom syntax processing error:`, error); } }); return md; } /** * Enhanced KaTeX rendering with improved error handling and formula analysis * @param {string} md - Preprocessed markdown text * @param {Function} customRenderer - Custom marked renderer * @returns {string} Rendered HTML */ function renderWithKatexEnhanced(md, customRenderer) { performance.mark('renderKatex-enhanced-start'); metrics.totalRenders++; const cacheKey = `${CACHE_VERSION}:${md}`; // Enhanced cache check if (renderCache.has(cacheKey)) { metrics.cacheHits++; performance.mark('renderKatex-enhanced-end'); performance.measure('renderWithKatex-enhanced (cache)', 'renderKatex-enhanced-start', 'renderKatex-enhanced-end'); return renderCache.get(cacheKey); } metrics.cacheMisses++; // Protected content extraction (code blocks, existing HTML) const protectedContent = new Map(); let protectedCounter = 0; // **IMPORTANT: Process formulas BEFORE protecting code blocks** // This prevents code protection from interfering with formula delimiters const formulaProtectedCounter = { value: 0 }; md = processFormulasEnhanced(md, protectedContent, formulaProtectedCounter); // Now protect code blocks and HTML (using updated counter) protectedCounter = formulaProtectedCounter.value; md = protectContent(md, protectedContent, protectedCounter); // Render remaining markdown let result; try { const markedOptions = customRenderer ? { renderer: customRenderer } : {}; result = marked.parse(md, markedOptions); } catch (error) { console.error(`[MarkdownProcessorEnhanced] Marked parsing error:`, error); result = `
Markdown parsing failed: ${escapeHtml(error.message)}
`; } // Restore protected content result = restoreContent(result, protectedContent); // Cache management with size limit if (renderCache.size >= MAX_CACHE_SIZE) { const firstKey = renderCache.keys().next().value; renderCache.delete(firstKey); } renderCache.set(cacheKey, result); // Update performance metrics const renderTime = performance.now(); metrics.avgRenderTime = (metrics.avgRenderTime * (metrics.totalRenders - 1) + renderTime) / metrics.totalRenders; performance.mark('renderKatex-enhanced-end'); performance.measure('renderWithKatex-enhanced', 'renderKatex-enhanced-start', 'renderKatex-enhanced-end'); return result; } /** * Protect content that should not be processed (code blocks, HTML) * @param {string} md - Markdown text * @param {Map} protectedContent - Map to store protected content * @param {number} counter - Starting counter value * @returns {string} Markdown with protected content replaced by placeholders */ function protectContent(md, protectedContent, counter) { // Protect fenced code blocks (``` ... ```) md = md.replace(/```[\s\S]*?```/g, (match) => { const placeholder = `PBTOKEN${counter++}Z`; protectedContent.set(placeholder, match); return placeholder; }); // Protect inline code (`...`) md = md.replace(/`[^`\n]+?`/g, (match) => { const placeholder = `PBTOKEN${counter++}Z`; protectedContent.set(placeholder, match); return placeholder; }); // Protect only real HTML constructs to avoid eating math comparators like "<=" const htmlPatterns = [ //g, // HTML comments /]*?>/gi, // DOCTYPE /<\/?[A-Za-z][A-Za-z0-9-]*(\s+[^<>]*?)?>/g // opening/closing/self-closing tags ]; htmlPatterns.forEach((re) => { md = md.replace(re, (match) => { const placeholder = `PBTOKEN${counter++}Z`; protectedContent.set(placeholder, match); return placeholder; }); }); return md; } /** * Enhanced formula processing with better error handling and context analysis * @param {string} md - Markdown text * @param {Map} protectedContent - Map to store protected content * @param {Object} counterObj - Counter object with 'value' property * @returns {string} Processed markdown with formulas rendered and protected */ function processFormulasEnhanced(md, protectedContent, counterObj) { // Normalize math delimiters to avoid regex mismatches and nested '$' leakage function normalizeMathDelimiters(text) { if (typeof text !== 'string' || !text) return text; let s = text; // Convert encoded dollars to literal '$' s = s.replace(/&(?:#0*36|dollar);/gi, '$'); // Normalize fullwidth dollar to ASCII s = s.replace(/\uFF04/g, '$'); // Remove zero-width and combining marks immediately around '$' so `$̲` → `$` s = s.replace(/\$[\u200B-\u200D\uFEFF\u0300-\u036F]+/g, '$'); s = s.replace(/[\u200B-\u200D\uFEFF\u0300-\u036F]+\$/g, '$'); // **NEW: 修复 OCR 错误转义的 $ 符号** // 1. $\$ ... \$ ,$ → $ ... $ ,(移除尾部的 ,$) s = s.replace(/\$\\\$\s*([^\$]+?)\s*\\\$\s*,\s*\$/g, '$$$1$$ ,'); // 2. $\$ ... \$$ → $ ... $ (处理末尾多余的 $$,先处理这个避免被后面的规则误处理) s = s.replace(/\$\\\$\s*([^\$]+?)\s*\\\$\$/g, '$$$1$$'); // 3. $\$ ... \$ → $ ... $ s = s.replace(/\$\\\$\s*([^\$]+?)\s*\\\$/g, '$$$1$$'); // 4. \$...\$ → $...$ (完全转义的内联公式) s = s.replace(/\\\$([^\$\n]+?)\\\$/g, '$$$1$$'); return s; } md = normalizeMathDelimiters(md); // Helper function to protect rendered formulas function protectRenderedFormula(renderedHtml) { if (!renderedHtml || typeof renderedHtml !== 'string') return renderedHtml; // 如果渲染结果包含 HTML 标签,保护它 if (renderedHtml.includes('<')) { const placeholder = `PBTOKEN${counterObj.value++}Z`; protectedContent.set(placeholder, renderedHtml); return placeholder; } return renderedHtml; } // Process block formulas first ($$...$$) md = md.replace(/\$\$([\s\S]*?)\$\$/g, (match, content) => { const rendered = renderFormula(content.trim(), true, match); return protectRenderedFormula(rendered); }); // Process LaTeX-style block formulas (\[...\]) md = md.replace(/\\\[([\s\S]*?)\\\]/g, (match, content) => { const rendered = renderFormula(content.trim(), true, match); return protectRenderedFormula(rendered); }); // Process inline formulas ($...$) // 支持多行公式,但限制长度防止误匹配 md = md.replace(/\$([^\$]{1,2000}?)\$/g, (match, content) => { // 快速检查:如果内容是纯中文(没有任何数学符号),直接跳过 const trimmed = content.trim(); if (trimmed && /^[\u4e00-\u9fa5,、。;:!?""''()【】《》\s]+$/.test(trimmed)) { console.log(`[MarkdownProcessorEnhanced] Skipping pure Chinese inline: "${trimmed}"`); return match; // 保留原始 $...$ } // 如果包含多个段落(连续两个换行),可能是误匹配,跳过 if (/\n\s*\n/.test(content)) { console.log(`[MarkdownProcessorEnhanced] Skipping multi-paragraph match: "${trimmed.substring(0, 50)}..."`); return match; } const rendered = renderFormula(content.trim(), false, match); return protectRenderedFormula(rendered); }); // Process LaTeX-style inline formulas (\(...\)) md = md.replace(/\\\(([^)]*?)\\\)/g, (match, content) => { const rendered = renderFormula(content.trim(), false, match); return protectRenderedFormula(rendered); }); return md; } /** * Analyze formula structure to determine appropriate display mode. * @param {string} content - Raw formula content. * @param {boolean} displayHint - Preferred display mode from the matcher. * @returns {{ text: string, displayMode: boolean, forcedByHint: boolean, forcedByStructure: boolean }} */ function analyzeFormulaLayout(content, displayHint) { const normalized = typeof content === 'string' ? content.trim() : ''; if (!normalized) { return { text: '', displayMode: !!displayHint, forcedByHint: !!displayHint, forcedByStructure: false }; } let displayMode = !!displayHint; let forcedByStructure = false; if (!displayMode) { forcedByStructure = FORMULA_BLOCK_HINTS.some(pattern => pattern.test(normalized)); if (forcedByStructure) { displayMode = true; } } return { text: normalized, displayMode, forcedByHint: !!displayHint, forcedByStructure }; } /** * Build an accessible fallback block when KaTeX rendering fails. * @param {string} content - Formula content. * @param {boolean} displayMode - Final display mode. * @param {Error|string} error - Rendering error. * @returns {string} HTML fallback snippet. */ function buildKatexFallback(content, displayMode, error) { const sanitized = escapeHtml(content || ''); const message = error && error.message ? error.message : (typeof error === 'string' ? error : ''); const errorInfo = message ? ` data-katex-error="${escapeHtml(message)}" title="Formula rendering failed: ${escapeHtml(message)}"` : ''; if (displayMode) { return `
${sanitized}
`; } return `${sanitized}`; } /** * Render individual formula with enhanced error handling * @param {string} content - Formula content * @param {boolean} displayModeHint - Whether to use display mode * @param {string} originalMatch - Original matched text for fallback * @returns {string} Rendered formula or fallback */ function renderFormula(content, displayModeHint, originalMatch) { // Decode a limited set of HTML entities that may leak into TeX inputs function htmlUnescape(text) { if (typeof text !== 'string' || text.length === 0) return ''; let s = text; // Fix corrupted entities like "&̲#39;" (ampersand followed by combining marks) s = s.replace(/&[\u0300-\u036F]+#/g, '&#'); // Named entities s = s.replace(/&/g, '&') .replace(/</g, '<') .replace(/>/g, '>') .replace(/"/g, '"') .replace(/'/g, "'") .replace(/'/g, "'") .replace(/ /g, ' '); // Numeric entities (decimal and hex) s = s.replace(/&#(\d+);/g, (_, dec) => { const code = parseInt(dec, 10); return Number.isFinite(code) ? String.fromCharCode(code) : _; }); s = s.replace(/&#x([0-9A-Fa-f]+);/g, (_, hex) => { const code = parseInt(hex, 16); return Number.isFinite(code) ? String.fromCharCode(code) : _; }); return s; } // Sanitize TeX: remove stray punctuation at edges, zero-width/combining chars, normalize common unicode symbols function sanitizeTeX(src) { let s = typeof src === 'string' ? src : ''; if (!s) return ''; // Decode HTML entities first, e.g. ' → ', & → & s = htmlUnescape(s); // remove zero-width, BOM and combining marks anywhere s = s.replace(/[\u200B-\u200D\uFEFF\u0300-\u036F]/g, ''); // strip private-use glyphs sometimes appearing as unknown symbols (e.g. '') s = s.replace(/[\uE000-\uF8FF]/g, ''); // also normalize stray combining marks immediately after '&' that break entities s = s.replace(/&[\u0300-\u036F]+/g, '&'); // trim leading/trailing CJK punctuation and quotes that accidentally wrapped TeX s = s.replace(/^[\s\u3000。,、;::""\((\))\[\]【】《》'''"–—-]+/, ''); s = s.replace(/[\s\u3000。,、;::""\((\))\[\]【】《》'''"–—-]+$/, ''); // **NEW: Remove trailing orphaned backslashes** (孤立的尾部反斜杠) // 移除末尾的单个反斜杠,除非它是有效的 LaTeX 命令的一部分 // 注意:\backslash 后面跟的反斜杠也要清理 s = s.replace(/\\backslash\s+\\\s*$/, '\\backslash'); // \backslash \ → \backslash s = s.replace(/\\\s*$/, ''); // 其他孤立的尾部反斜杠 // **NEW: Remove standalone backslashes not part of commands** // 如果整个字符串就是一个反斜杠,清空它 if (s === '\\' || /^\\+$/.test(s)) { return ''; } // **NEW: Clean up invalid patterns that can't be LaTeX** // 移除纯中文后跟反斜杠的无效模式(中文不应该出现在数学公式中,除非在 \text{} 里) // 注意:只清理纯中文的,不要清理包含有效 LaTeX 命令的 if (/^[\u4e00-\u9fa5,、。;:\s]+$/.test(s)) { return ''; // 纯中文,不是有效的数学公式 } // **NEW: 修复常见的 OCR 错误** // \backslash \operatorname{vec} → \vec s = s.replace(/\\backslash\s+\\operatorname\{vec\}/g, '\\vec'); // \backslash \operatorname{sum} → \sum s = s.replace(/\\backslash\s+\\operatorname\{sum\}/g, '\\sum'); // \backslash \operatorname{prod} → \prod s = s.replace(/\\backslash\s+\\operatorname\{prod\}/g, '\\prod'); // 修复下标中的空格: x \_1 → x_1, x \_n → x_n s = s.replace(/\s+\\_/g, '_'); // 修复下标中的 {-} 错误: x_{-} i → x_i s = s.replace(/\{-\}\s*/g, ''); s = s.replace(/_\{-\s+([^\}]+)\}/g, '_{$1}'); // collapse excessive inner spaces s = s.replace(/\s{2,}/g, ' '); // If trailing delimiter for \right was stripped by cleanup, add default ')' if (/\\right\s*$/.test(s)) { let close = ')'; try { const re = /\\left\s*([\(\[\{])/g; let m; while ((m = re.exec(s)) !== null) { const ch = m[1]; close = ch === '(' ? ')' : ch === '[' ? ']' : '}'; } } catch (_) { /* ignore */ } s = s.replace(/\\right\s*$/, `\\right${close}`); } // Normalize degree with unit inside \mathrm{...}: \mathrm{ ^\circ C } → ^{\circ}\mathrm{C} s = s.replace(/\\mathrm\{\s*(?:\\;|\s)*\^\s*\{?\s*\\?circ\s*\}?\s*([A-Za-z])\s*\}/g, '^{\\circ}\\mathrm{$1}'); // Replace unsupported Unicode triangles with math macros s = s.replace(/▲/g, '\\blacktriangle').replace(/△/g, '\\triangle'); // Normalize some common unicode math symbols to TeX s = s.replace(/≠/g, '\\ne'); s = s.replace(/±/g, '\\pm'); s = s.replace(/∞/g, '\\infty'); return s.trim(); } const cleaned = sanitizeTeX(content); const analysis = analyzeFormulaLayout(cleaned, displayModeHint); let tex = analysis.text; // 如果清理后内容为空,说明这不是有效的数学公式 // 返回原始匹配文本(不渲染),避免吞掉内容 if (!tex) { console.log(`[MarkdownProcessorEnhanced] Skipping invalid formula: "${content}" (cleaned to empty)`); return originalMatch || ''; } // Guard against obviously incomplete or non-TeX inputs try { // **NEW: 检测只包含 \begin{...} 或只包含 \end{...} 的空环境** if (/^\s*\\begin\{[a-zA-Z*]+\}\s*$/.test(tex) || /^\s*\\end\{[a-zA-Z*]+\}\s*$/.test(tex)) { console.log(`[MarkdownProcessorEnhanced] Skipping empty environment: "${tex}"`); return ''; // 返回空字符串,不显示 } // Incomplete \begin{...} without matching \end{...} const beginMatch = tex.match(/\\begin\{([a-zA-Z*]+)\}/); if (beginMatch) { const env = beginMatch[1]; const endRe = new RegExp('\\\\end\\{' + env.replace(/[-/\\^$*+?.()|[\]{}]/g, '\\$&') + '\\}'); if (!endRe.test(tex)) { // Do not attempt to render; return accessible fallback return buildKatexFallback(tex, true, 'Incomplete environment: \\begin{' + env + '} ...'); } } // Lone \end{...} without a preceding \begin{...} if (/^\s*\\end\{[a-zA-Z*]+\}\s*$/.test(tex)) { return buildKatexFallback(tex, analysis.displayMode, 'Orphaned \\end{...}'); } // Strings that look like prior KaTeX error messages or HTML entities only if (/^&#?\w/.test(tex) && / in math mode /.test(tex)) { return buildKatexFallback(tex, analysis.displayMode, 'Skipped non-TeX error text'); } // If the supposed TeX contains HTML tags, skip rendering (likely mis-detected) // 但允许数学比较符号 < 和 > (如 <0.001, x>5) // 只检测明显的 HTML 模式:${rendered} ` : `${rendered}`; return wrapper; } catch (error) { metrics.formulaErrors++; console.warn(`[MarkdownProcessorEnhanced] KaTeX rendering failed for: "${tex}"`, error); return buildKatexFallback(tex, analysis.displayMode, error); } } /** * Restore protected content * @param {string} html - HTML with placeholders * @param {Map} protectedContent - Map of protected content * @returns {string} HTML with content restored */ function restoreContent(html, protectedContent) { protectedContent.forEach((content, placeholder) => { html = html.replace(placeholder, content); }); return html; } /** * Escape HTML special characters * @param {string} text - Text to escape * @returns {string} Escaped text */ function escapeHtml(text) { if (typeof text !== 'string') return ''; const htmlEscapes = { '&': '&', '<': '<', '>': '>', '"': '"', "'": ''' }; return text.replace(/[&<>"']/g, (match) => htmlEscapes[match]); } /** * Get performance metrics * @returns {Object} Performance and error metrics */ function getMetrics() { return { ...metrics, cacheSize: renderCache.size, cacheHitRate: metrics.totalRenders > 0 ? (metrics.cacheHits / metrics.totalRenders * 100).toFixed(2) + '%' : '0%', formulaErrorRate: (metrics.formulaErrors + metrics.formulaSuccesses) > 0 ? (metrics.formulaErrors / (metrics.formulaErrors + metrics.formulaSuccesses) * 100).toFixed(2) + '%' : '0%' }; } /** * Clear cache and reset metrics */ function clearCache() { renderCache.clear(); Object.keys(metrics).forEach(key => { if (typeof metrics[key] === 'number') { metrics[key] = 0; } }); } /** * Test formula rendering capability * @param {string} formula - Formula to test * @param {boolean} displayMode - Display mode * @returns {Object} Test result */ function testFormula(formula, displayMode = false) { const startTime = performance.now(); try { const result = katex.renderToString(formula, { displayMode: displayMode, throwOnError: true }); return { success: true, result: result, renderTime: performance.now() - startTime, error: null }; } catch (error) { return { success: false, result: null, renderTime: performance.now() - startTime, error: error.message }; } } // Enhanced public interface global.MarkdownProcessorEnhanced = { // Core functions safeMarkdown: safeMarkdownEnhanced, renderWithKatexFailback: renderWithKatexEnhanced, // Utility functions processCustomSyntax: processCustomSyntax, renderFormula: renderFormula, escapeHtml: escapeHtml, // Management functions getMetrics: getMetrics, clearCache: clearCache, testFormula: testFormula, // Version info version: '2.0.0', compatibility: 'Backward compatible with MarkdownProcessor' }; // Backward compatibility if (!global.MarkdownProcessor) { global.MarkdownProcessor = { safeMarkdown: safeMarkdownEnhanced, renderWithKatexFailback: renderWithKatexEnhanced }; } })(window);