// deerflow2/frontend/src/core/utils/markdown-download/converter.ts
//
// 706 lines
// 18 KiB
// TypeScript

import {
Document as DocxDocument,
Packer,
Paragraph,
TextRun,
HeadingLevel,
ImageRun,
type ParagraphChild,
} from "docx";
import { marked } from "marked";
// ============================================================================
// Types
// ============================================================================
/**
 * Markdown token type (simplified).
 *
 * Aliased to `any`, so the token fields read elsewhere in this file are not
 * type-checked.
 */
// eslint-disable-next-line @typescript-eslint/no-explicit-any
type MarkdownToken = any;
/**
 * Options for the PDF conversion.
 */
export interface PdfOptions {
/**
 * Page margins in mm. downloadMarkdownAsPdf forwards this value unchanged as
 * the html2pdf.js `margin` option.
 * NOTE(review): the original comment gave the order as
 * [top, right, bottom, left]; html2pdf.js documents a four-element margin as
 * [top, left, bottom, right] — confirm which order callers should use.
 * @default [15, 15, 15, 15]
 */
margin?: [number, number, number, number];
/**
 * Page format.
 * @default "a4"
 */
format?: "a3" | "a4" | "a5" | "letter" | "legal";
/**
 * Page orientation.
 * @default "portrait"
 */
orientation?: "portrait" | "landscape";
/**
 * Rendering scale factor, forwarded as the html2canvas `scale` option.
 * @default 2
 */
scale?: number;
}
/**
 * Options for the DOCX conversion.
 */
export interface DocxOptions {
/**
 * Font used for code blocks and inline code spans.
 * @default "Courier New"
 */
codeFont?: string;
/**
 * Font size for code blocks, in half-points.
 * @default 22 (11pt)
 */
codeFontSize?: number;
/**
 * Resolves asset paths found in the Markdown (for example relative image
 * paths) to a URL. The result may be returned synchronously or as a promise;
 * when it is `null` or `undefined`, the converters fall back to the
 * normalized original reference.
 */
resolveAssetUrl?: (rawPath: string) => string | null | Promise<string | null>;
}
// ============================================================================
// DOCX Converter
// ============================================================================
/**
 * Converts Markdown text into a DOCX file and triggers a browser download.
 *
 * @param markdown - Markdown source text
 * @param filename - Target file name, either without an extension or ending in `.md`
 * @param options - Conversion options (code font, code font size, asset resolver)
 *
 * @example
 * ```ts
 * await downloadMarkdownAsDocx("# Hello World", "document");
 * ```
 */
export async function downloadMarkdownAsDocx(
  markdown: string,
  filename: string,
  options: DocxOptions = {},
): Promise<void> {
  // Defaults apply only when the option is undefined.
  const {
    codeFont: font = "Courier New",
    codeFontSize: fontSize = 22,
    resolveAssetUrl: assetResolver,
  } = options;

  const blockTokens = marked.lexer(markdown);
  const sectionChildren = await parseTokensToDocx(blockTokens, {
    codeFont: font,
    codeFontSize: fontSize,
    resolveAssetUrl: assetResolver,
  });

  const docxFile = new DocxDocument({
    sections: [{ children: sectionChildren }],
  });
  const packed = await Packer.toBlob(docxFile);
  downloadBlob(packed, normalizeFilename(filename, ".docx"));
}
// ============================================================================
// PDF Converter
// ============================================================================
/**
 * Converts Markdown text into a PDF file and triggers a browser download.
 *
 * The Markdown is rendered to HTML with marked, placed in a styled container
 * element and handed to html2pdf.js.
 *
 * @param markdown - Markdown source text
 * @param filename - Target file name, either without an extension or ending in `.md`
 * @param options - Page options plus an optional asset resolver for image sources
 *
 * @example
 * ```ts
 * await downloadMarkdownAsPdf("# Hello World", "document");
 * ```
 */
export async function downloadMarkdownAsPdf(
  markdown: string,
  filename: string,
  options: PdfOptions & {
    resolveAssetUrl?: (
      rawPath: string,
    ) => string | null | Promise<string | null>;
  } = {},
): Promise<void> {
  // The library is loaded first, before any option handling or asset resolution.
  const createPdfWorker = await loadHtml2Pdf();

  const {
    margin = [15, 15, 15, 15],
    format = "a4",
    orientation = "portrait",
    scale = 2,
    resolveAssetUrl,
  } = options;

  // Point image references at resolved URLs, then render the Markdown as HTML.
  const markdownWithAssets = await rewriteMarkdownImageSources(
    markdown,
    resolveAssetUrl,
  );
  const renderedHtml = await marked.parse(markdownWithAssets);
  const printable = createStyledContainer(renderedHtml);

  // html2pdf.js settings: output file, JPEG quality, html2canvas and jsPDF options.
  const settings = {
    margin,
    filename: normalizeFilename(filename, ".pdf"),
    image: { type: "jpeg" as const, quality: 0.98 },
    html2canvas: {
      scale,
      useCORS: true,
      logging: false,
      onclone: fixColorsForHtml2Canvas,
    },
    jsPDF: { unit: "mm" as const, format, orientation },
  };

  await createPdfWorker().set(settings).from(printable).save();
}
// ============================================================================
// Internal Utilities
// ============================================================================
/**
 * Loads html2pdf.js through a dynamic import and returns its default export.
 */
// eslint-disable-next-line @typescript-eslint/no-unsafe-function-type
async function loadHtml2Pdf(): Promise<Function> {
  const { default: html2pdfFactory } = await import("html2pdf.js");
  return html2pdfFactory;
}
/**
 * Builds a `<div>` holding the rendered HTML, with base typography on the
 * container and per-element styles applied to its descendants.
 *
 * NOTE(review): `htmlContent` is assigned to `innerHTML` as-is. If the
 * Markdown can come from an untrusted source, confirm it is sanitized before
 * it reaches this function.
 */
function createStyledContainer(htmlContent: string): HTMLDivElement {
  const wrapper = document.createElement("div");
  wrapper.innerHTML = htmlContent;

  // Base styles for the container itself.
  wrapper.style.cssText = `
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
font-size: 14px;
line-height: 1.6;
padding: 20px;
max-width: 800px;
color: #333333;
background-color: #ffffff;
`;

  // Inline styles for headings, code, lists, tables and so on.
  applyElementStyles(wrapper);
  return wrapper;
}
/**
 * Applies inline styles to the descendants of `container`, one selector group
 * at a time (headings, paragraphs, code, lists, blockquotes, tables, links,
 * horizontal rules).
 */
function applyElementStyles(container: HTMLElement): void {
  type Declarations = Partial<CSSStyleDeclaration>;

  // Assigns the declarations to every element matching the selector;
  // properties are set in the order they are listed.
  const styleAll = (
    selector: string,
    declarationsFor: (el: HTMLElement) => Declarations,
  ): void => {
    container.querySelectorAll<HTMLElement>(selector).forEach((el) => {
      Object.assign(el.style, declarationsFor(el));
    });
  };

  // Headings
  styleAll("h1, h2, h3, h4, h5, h6", () => ({
    marginTop: "1.5em",
    marginBottom: "0.5em",
    fontWeight: "600",
    color: "#1a1a1a",
  }));

  // Paragraphs
  styleAll("p", () => ({ marginBottom: "1em" }));

  // Code: shared look, then block (<pre>) versus inline box metrics.
  styleAll("pre, code", (el) => {
    const boxMetrics: Declarations =
      el.tagName === "PRE"
        ? { padding: "12px", borderRadius: "6px", overflow: "auto" }
        : { padding: "2px 4px", borderRadius: "3px" };
    return {
      fontFamily: "'SF Mono', 'Fira Code', Consolas, monospace",
      backgroundColor: "#f5f5f5",
      color: "#333333",
      fontSize: "13px",
      ...boxMetrics,
    };
  });

  // Lists
  styleAll("ul, ol", () => ({ marginBottom: "1em", paddingLeft: "2em" }));

  // Blockquotes
  styleAll("blockquote", () => ({
    borderLeft: "4px solid #dddddd",
    marginLeft: "0",
    paddingLeft: "16px",
    color: "#666666",
  }));

  // Tables and their cells
  styleAll("table", () => ({
    borderCollapse: "collapse",
    width: "100%",
    marginBottom: "1em",
  }));
  styleAll("th, td", () => ({ border: "1px solid #dddddd", padding: "8px" }));

  // Links
  styleAll("a", () => ({ color: "#0066cc", textDecoration: "underline" }));

  // Horizontal rules: clear the border first, then draw only the top edge.
  styleAll("hr", () => ({
    border: "none",
    borderTop: "1px solid #dddddd",
    margin: "2em 0",
  }));
}
/**
 * html2canvas `onclone` hook that strips color values html2canvas may not
 * support from the cloned document.
 *
 * It removes every stylesheet (`<link rel="stylesheet">` and `<style>`),
 * clears the inline color-related properties of every element, then forces
 * dark text on a transparent background; the body gets a white background.
 *
 * NOTE(review): because every element is reset, inline colors set on those
 * elements earlier (for example link or code-block colors) are overridden as
 * well — confirm this is intended.
 */
function fixColorsForHtml2Canvas(clonedDoc: Document): void {
  // Stylesheets may contain color functions such as lab()/oklab().
  const styleSources = clonedDoc.querySelectorAll<
    HTMLStyleElement | HTMLLinkElement
  >('link[rel="stylesheet"], style');
  styleSources.forEach((source) => {
    source.remove();
  });

  const inlineColorProps = [
    "color",
    "background-color",
    "border-color",
    "border-top-color",
    "border-bottom-color",
    "border-left-color",
    "border-right-color",
    "outline-color",
    "text-decoration-color",
    "caret-color",
    "column-rule-color",
    "accent-color",
    "fill",
    "stroke",
  ];

  // Reset the color properties of every element to safe values.
  clonedDoc.querySelectorAll<HTMLElement>("*").forEach((el) => {
    for (const prop of inlineColorProps) {
      el.style.removeProperty(prop);
    }
    el.style.color = "#333333";
    el.style.backgroundColor = "transparent";
  });

  // The body carries the page background.
  const { style: bodyStyle } = clonedDoc.body;
  bodyStyle.color = "#333333";
  bodyStyle.backgroundColor = "#ffffff";
}
/**
 * Converts block-level Markdown tokens into DOCX paragraphs.
 *
 * Handled token types: heading, paragraph, text, code, list, blockquote, hr,
 * space and image. Token types without a case here (for example `table` and
 * `html`) produce no output.
 *
 * List items and blockquotes render all of their child tokens: nested lists
 * become deeper bullet levels, and additional paragraphs, code blocks, etc.
 * are kept instead of being dropped after the first child.
 *
 * NOTE(review): ordered lists are still rendered with bullets, as before.
 *
 * @param tokens - Block tokens, e.g. the result of `marked.lexer`
 * @param options - Code font settings and the optional asset resolver
 * @returns The paragraphs in document order
 */
async function parseTokensToDocx(
  tokens: MarkdownToken[],
  options: Required<Pick<DocxOptions, "codeFont" | "codeFontSize">> &
    Pick<DocxOptions, "resolveAssetUrl">,
): Promise<Paragraph[]> {
  const paragraphs: Paragraph[] = [];
  for (const token of tokens) {
    switch (token.type) {
      case "heading": {
        const runs = await parseInlineTokens(token.tokens ?? [], options);
        paragraphs.push(
          new Paragraph({
            children: runs,
            heading: getHeadingLevel(token.depth),
            spacing: { before: 240, after: 120 },
          }),
        );
        break;
      }
      case "paragraph":
      case "text": {
        // Block-level `text` tokens (e.g. inside tight list items) carry
        // inline tokens just like paragraphs; fall back to the plain text
        // when no inline tokens are present.
        const runs = Array.isArray(token.tokens)
          ? await parseInlineTokens(token.tokens, options)
          : [new TextRun(String(token.text ?? ""))];
        paragraphs.push(
          new Paragraph({
            children: runs.length > 0 ? runs : [new TextRun("")],
            spacing: { after: 200 },
          }),
        );
        break;
      }
      case "code": {
        // One shaded paragraph per source line; empty lines get a single
        // space so the shaded paragraph still has content.
        const lines: string[] = String(token.text ?? "").split("\n");
        for (const line of lines) {
          paragraphs.push(
            new Paragraph({
              children: [
                new TextRun({
                  text: line.length > 0 ? line : " ",
                  font: options.codeFont,
                  size: options.codeFontSize,
                }),
              ],
              shading: { fill: "F5F5F5" },
            }),
          );
        }
        paragraphs.push(new Paragraph({ children: [] }));
        break;
      }
      case "list": {
        paragraphs.push(
          ...(await listTokenToDocxParagraphs(token, 0, options)),
        );
        break;
      }
      case "blockquote": {
        paragraphs.push(
          ...(await blockquoteTokenToDocxParagraphs(token, options)),
        );
        break;
      }
      case "hr": {
        paragraphs.push(
          new Paragraph({
            children: [new TextRun({ text: "─".repeat(50), color: "CCCCCC" })],
            spacing: { before: 200, after: 200 },
          }),
        );
        break;
      }
      case "space": {
        paragraphs.push(new Paragraph({ children: [] }));
        break;
      }
      case "image": {
        const imageRun = await createImageRunFromToken(token, options);
        if (imageRun) {
          paragraphs.push(
            new Paragraph({
              children: [imageRun],
              spacing: { after: 200 },
            }),
          );
        }
        break;
      }
    }
  }
  return paragraphs;
}

/** Options shared by the block-level DOCX helpers below. */
type DocxBlockOptions = Required<
  Pick<DocxOptions, "codeFont" | "codeFontSize">
> &
  Pick<DocxOptions, "resolveAssetUrl">;

/** Deepest bullet level emitted; Word numbering definitions use levels 0-8. */
const DOCX_MAX_BULLET_LEVEL = 8;

/**
 * Renders a list token: one bullet paragraph per item at `level`, nested
 * lists one level deeper, and any further block content of the item as
 * regular blocks after the bullet paragraph.
 */
async function listTokenToDocxParagraphs(
  list: MarkdownToken,
  level: number,
  options: DocxBlockOptions,
): Promise<Paragraph[]> {
  const paragraphs: Paragraph[] = [];
  const bulletLevel = Math.min(level, DOCX_MAX_BULLET_LEVEL);
  for (const item of list.items ?? []) {
    const children: MarkdownToken[] = item.tokens ?? [];
    // An item may start directly with a nested list; it then has no lead text.
    const startsWithList = children[0]?.type === "list";
    const lead = startsWithList ? undefined : children[0];
    const runs = await parseInlineTokens(lead?.tokens ?? [], options);
    paragraphs.push(
      new Paragraph({
        children: runs.length > 0 ? runs : [new TextRun("")],
        bullet: { level: bulletLevel },
        spacing: { after: 80 },
      }),
    );
    const rest = startsWithList ? children : children.slice(1);
    for (const child of rest) {
      if (child.type === "list") {
        paragraphs.push(
          ...(await listTokenToDocxParagraphs(child, level + 1, options)),
        );
      } else if (child.type !== "space") {
        // Continuation content (extra paragraphs, code blocks, quotes).
        paragraphs.push(...(await parseTokensToDocx([child], options)));
      }
    }
  }
  return paragraphs;
}

/**
 * Renders a blockquote token: every paragraph-like child becomes an indented
 * paragraph with a left border; other child blocks are rendered as regular
 * blocks. An empty quote still yields one empty quoted paragraph.
 */
async function blockquoteTokenToDocxParagraphs(
  quote: MarkdownToken,
  options: DocxBlockOptions,
): Promise<Paragraph[]> {
  const quoted = (runs: ParagraphChild[]): Paragraph =>
    new Paragraph({
      children: runs.length > 0 ? runs : [new TextRun("")],
      indent: { left: 720 },
      border: { left: { style: "single", size: 12, color: "CCCCCC" } },
      spacing: { after: 200 },
    });

  const paragraphs: Paragraph[] = [];
  for (const child of quote.tokens ?? []) {
    if (child.type === "space") continue;
    const isTextual =
      child.type === "paragraph" ||
      child.type === "text" ||
      child.type === "heading";
    if (isTextual) {
      paragraphs.push(
        quoted(await parseInlineTokens(child.tokens ?? [], options)),
      );
    } else {
      paragraphs.push(...(await parseTokensToDocx([child], options)));
    }
  }
  if (paragraphs.length === 0) {
    paragraphs.push(quoted([]));
  }
  return paragraphs;
}
/**
 * Converts inline Markdown tokens into DOCX runs.
 *
 * Formatting tokens (`strong`, `em`, `del`, `link`) are rendered from their
 * child tokens, so nested formatting such as bold text containing inline code
 * is styled correctly instead of showing raw Markdown markers. When such a
 * token has no child tokens, its `text` is used as before.
 *
 * @param tokens - Inline tokens of a block token
 * @param options - Code font settings and the optional asset resolver
 * @returns Text and image runs in source order
 */
async function parseInlineTokens(
  tokens: MarkdownToken[],
  options: Required<Pick<DocxOptions, "codeFont" | "codeFontSize">> &
    Pick<DocxOptions, "resolveAssetUrl">,
): Promise<ParagraphChild[]> {
  return collectInlineRuns(tokens, options, {});
}

/** Run formatting inherited from enclosing inline tokens. */
interface InlineRunStyle {
  bold?: boolean;
  italics?: boolean;
  strike?: boolean;
  color?: string;
  underline?: { color?: string };
}

/**
 * Walks inline tokens recursively, carrying the formatting accumulated from
 * the enclosing tokens in `style`.
 */
async function collectInlineRuns(
  tokens: MarkdownToken[],
  options: Required<Pick<DocxOptions, "codeFont" | "codeFontSize">> &
    Pick<DocxOptions, "resolveAssetUrl">,
  style: InlineRunStyle,
): Promise<ParagraphChild[]> {
  // Renders a formatting token from its children, or from its text when the
  // tokenizer supplied no children.
  const renderStyled = async (
    token: MarkdownToken,
    nestedStyle: InlineRunStyle,
  ): Promise<ParagraphChild[]> => {
    if (Array.isArray(token.tokens) && token.tokens.length > 0) {
      return collectInlineRuns(token.tokens, options, nestedStyle);
    }
    return [new TextRun({ ...nestedStyle, text: token.text })];
  };

  const runs: ParagraphChild[] = [];
  for (const token of tokens) {
    switch (token.type) {
      case "text":
        runs.push(
          new TextRun({ ...style, text: token.raw ?? token.text ?? "" }),
        );
        break;
      case "escape":
        // A backslash escape: show the escaped character, not the backslash.
        runs.push(
          new TextRun({
            ...style,
            text: String(token.raw ?? "").replace(/^\\/, ""),
          }),
        );
        break;
      case "strong":
        runs.push(...(await renderStyled(token, { ...style, bold: true })));
        break;
      case "em":
        runs.push(...(await renderStyled(token, { ...style, italics: true })));
        break;
      case "del":
        runs.push(...(await renderStyled(token, { ...style, strike: true })));
        break;
      case "codespan":
        runs.push(
          new TextRun({
            ...style,
            text: token.text,
            font: options.codeFont,
            shading: { fill: "F0F0F0" },
          }),
        );
        break;
      case "link":
        // Links are rendered as colored, underlined text only.
        runs.push(
          ...(await renderStyled(token, {
            ...style,
            color: "0066CC",
            underline: {},
          })),
        );
        break;
      case "br":
        runs.push(new TextRun({ text: "", break: 1 }));
        break;
      case "image": {
        const imageRun = await createImageRunFromToken(token, options);
        if (imageRun) {
          runs.push(imageRun);
        }
        break;
      }
      default:
        // Unknown inline tokens keep their raw source text.
        runs.push(new TextRun({ ...style, text: token.raw ?? "" }));
    }
  }
  return runs;
}
/**
 * Builds a DOCX image run for an image token by fetching the image data.
 *
 * The image is scaled down proportionally when it is wider than 560 px.
 *
 * @param token - Image token; its `href` (or, failing that, `text`) is the source
 * @param options - Carries the optional asset resolver
 * @returns The image run, or `null` when the source is empty, does not
 *   resolve to a renderable URL, cannot be fetched, has no usable image type,
 *   or any step throws
 */
async function createImageRunFromToken(
  token: MarkdownToken,
  options: Pick<DocxOptions, "resolveAssetUrl">,
): Promise<ImageRun | null> {
  const source = String(token?.href ?? token?.text ?? "").trim();
  if (source.length === 0) return null;

  const imageUrl = await resolveAssetReference(source, options.resolveAssetUrl);
  if (!imageUrl || !isRenderableImageUrl(imageUrl)) {
    return null;
  }

  try {
    const response = await fetch(imageUrl);
    if (!response.ok) return null;

    const payload = await response.blob();
    const docxType = getDocxImageType(payload.type, imageUrl);
    if (!docxType) return null;

    const data = new Uint8Array(await payload.arrayBuffer());
    const natural = await getImageDimensions(payload);

    // Only shrink; images narrower than the limit keep their size.
    const widthLimit = 560;
    const factor =
      natural.width > widthLimit ? widthLimit / natural.width : 1;
    const toPixels = (value: number): number =>
      Math.max(1, Math.round(value * factor));

    return new ImageRun({
      data,
      type: docxType,
      transformation: {
        width: toPixels(natural.width),
        height: toPixels(natural.height),
      },
    });
  } catch {
    // Best effort: an image that cannot be loaded is left out.
    return null;
  }
}
/**
 * Measures an image blob by loading it into an `Image` element through a
 * temporary object URL.
 *
 * @param blob - Image data
 * @returns The natural size, with each zero dimension replaced by 1; if the
 *   image fails to load, 600 x 400 is returned instead. The promise does not
 *   reject on load errors.
 */
async function getImageDimensions(
  blob: Blob,
): Promise<{ width: number; height: number }> {
  return await new Promise((resolve) => {
    const objectUrl = URL.createObjectURL(blob);
    const probe = new Image();

    // Release the object URL before handing back the result.
    const settle = (size: { width: number; height: number }): void => {
      URL.revokeObjectURL(objectUrl);
      resolve(size);
    };

    probe.onload = () => {
      settle({
        width: probe.naturalWidth || 1,
        height: probe.naturalHeight || 1,
      });
    };
    probe.onerror = () => {
      settle({ width: 600, height: 400 });
    };
    probe.src = objectUrl;
  });
}
/**
 * Rewrites image sources in Markdown text to their resolved URLs.
 *
 * Two forms are handled: Markdown images (`![alt](target)`) and HTML
 * `<img ... src="...">` tags. A reference is left untouched when it cannot be
 * resolved or resolves to itself. Without a resolver the text is returned
 * unchanged.
 *
 * Replacements are spliced in by match position, so `$` sequences in a
 * resolved URL or alt text are inserted literally and repeated images are
 * each rewritten in place.
 *
 * @param markdown - Markdown source text
 * @param resolveAssetUrl - Optional resolver for non-external references
 * @returns The Markdown with image sources rewritten
 */
async function rewriteMarkdownImageSources(
  markdown: string,
  resolveAssetUrl?: (rawPath: string) => string | null | Promise<string | null>,
): Promise<string> {
  if (!resolveAssetUrl) {
    return markdown;
  }

  // Markdown destinations containing whitespace or parentheses are only
  // valid inside angle brackets.
  const asDestination = (url: string): string =>
    /[\s()]/.test(url) && !/[<>\n]/.test(url) ? `<${url}>` : url;

  const withMarkdownImages = await replaceMatchesAsync(
    markdown,
    /!\[([^\]]*)\]\(([^)]+)\)/g,
    async (match) => {
      const alt = match[1] ?? "";
      const rawTarget = match[2]?.trim() ?? "";
      const resolved = await resolveAssetReference(rawTarget, resolveAssetUrl);
      if (!resolved || resolved === rawTarget) return null;
      return `![${alt}](${asDestination(resolved)})`;
    },
  );

  return replaceMatchesAsync(
    withMarkdownImages,
    /(<img\b[^>]*\bsrc\s*=\s*)(["'])([^"']+)\2/gi,
    async (match) => {
      const rawTarget = match[3]?.trim() ?? "";
      const resolved = await resolveAssetReference(rawTarget, resolveAssetUrl);
      if (!resolved || resolved === rawTarget) return null;
      return `${match[1]}${match[2]}${resolved}${match[2]}`;
    },
  );
}

/**
 * Replaces every match of a global `pattern` in `input` with the text
 * returned by `replacementFor`; a `null` result keeps the match unchanged.
 * Replacements are inserted literally, one match at a time, in order.
 */
async function replaceMatchesAsync(
  input: string,
  pattern: RegExp,
  replacementFor: (match: RegExpMatchArray) => Promise<string | null>,
): Promise<string> {
  let output = "";
  let cursor = 0;
  for (const match of input.matchAll(pattern)) {
    const start = match.index ?? 0;
    const replacement = await replacementFor(match);
    output += input.slice(cursor, start) + (replacement ?? match[0]);
    cursor = start + match[0].length;
  }
  return output + input.slice(cursor);
}
/**
 * Resolves a raw asset reference to the URL that should be used for it.
 *
 * @param rawPath - Reference as written in the Markdown
 * @param resolveAssetUrl - Optional resolver for non-external references
 * @returns `null` when the normalized reference is empty; the normalized
 *   reference itself when it is external or no resolver is given; otherwise
 *   the resolver's result, falling back to the normalized reference when the
 *   resolver yields `null` or `undefined`
 */
async function resolveAssetReference(
  rawPath: string,
  resolveAssetUrl?: (rawPath: string) => string | null | Promise<string | null>,
): Promise<string | null> {
  const reference = normalizeReference(rawPath);
  if (!reference) {
    return null;
  }
  // External references and the no-resolver case pass through untouched.
  if (isExternalReference(reference) || !resolveAssetUrl) {
    return reference;
  }
  const mapped = await resolveAssetUrl(reference);
  return mapped ?? reference;
}
/**
 * Extracts the bare destination from a raw link/image target.
 *
 * - A target that starts with an angle-bracketed destination (`<...>`)
 *   yields the bracket contents, which may contain spaces; anything after
 *   the closing bracket (such as a title) is ignored.
 * - Otherwise a stray leading `<` or trailing `>` is removed and everything
 *   from the first space or tab on (such as a title) is dropped.
 *
 * @param ref - Raw target text
 * @returns The destination, or an empty string when there is none
 */
function normalizeReference(ref: string): string {
  const trimmed = ref.trim();
  const bracketed = /^<([^<>\n]*)>/.exec(trimmed);
  if (bracketed) {
    return (bracketed[1] ?? "").trim();
  }
  return trimmed.replace(/^<|>$/g, "").split(/[ \t]/)[0] ?? "";
}
/**
 * Tells whether a reference should be used as-is instead of being passed to
 * the asset resolver.
 *
 * @param ref - Normalized reference
 * @returns `true` for an empty reference, a fragment (`#...`), a
 *   protocol-relative URL (`//...`), a `data:` or `blob:` URL, or anything
 *   that starts with a URL scheme (`name:`)
 */
function isExternalReference(ref: string): boolean {
  if (!ref) {
    return true;
  }
  const passthroughPrefixes = ["#", "//", "data:", "blob:"];
  if (passthroughPrefixes.some((prefix) => ref.startsWith(prefix))) {
    return true;
  }
  return /^[a-zA-Z][a-zA-Z0-9+.-]*:/.test(ref);
}
/**
 * Tells whether a resolved URL is worth fetching as an image.
 *
 * @param url - Resolved URL
 * @returns `true` for `data:image/` URLs, URLs ending in a known image
 *   extension (optionally followed by a query or fragment), any http(s) URL,
 *   and root-relative paths
 */
function isRenderableImageUrl(url: string): boolean {
  const isImageDataUrl = url.startsWith("data:image/");
  const hasImageExtension =
    /\.(png|jpe?g|gif|webp|bmp|ico|avif|tiff?)([?#].*)?$/i.test(url);
  const isHttpUrl = /^https?:\/\//i.test(url);
  const isRootRelative = url.startsWith("/");
  return isImageDataUrl || hasImageExtension || isHttpUrl || isRootRelative;
}
/**
 * Picks the DOCX image type for fetched image data.
 *
 * The MIME type is checked first, then the source URL. Formats that cannot
 * be embedded as one of the four supported types are reported as `null`
 * instead of being mislabelled as PNG:
 * - an `image/*` MIME type that is not PNG, JPEG, GIF or BMP (e.g. WebP, SVG)
 * - with no decisive MIME type, a URL with a known unsupported extension
 *
 * When neither the MIME type nor the URL gives any hint, `"png"` is still
 * assumed as a best-effort guess.
 *
 * @param mimeType - MIME type reported for the data (may be empty)
 * @param src - URL the data was loaded from
 * @returns The DOCX image type, or `null` for a recognisably unsupported format
 */
function getDocxImageType(
  mimeType: string,
  src: string,
): "png" | "jpg" | "gif" | "bmp" | null {
  const mime = mimeType.trim().toLowerCase();
  if (mime.includes("png")) return "png";
  if (mime.includes("jpeg") || mime.includes("jpg")) return "jpg";
  if (mime.includes("gif")) return "gif";
  if (mime.includes("bmp")) return "bmp";
  // The server declared an image format that none of the checks above accept.
  if (mime.startsWith("image/")) return null;

  const lower = src.toLowerCase();
  if (lower.includes(".png")) return "png";
  if (lower.includes(".jpg") || lower.includes(".jpeg")) return "jpg";
  if (lower.includes(".gif")) return "gif";
  if (lower.includes(".bmp")) return "bmp";
  // The URL names a format that cannot be embedded.
  if (/\.(webp|svg|avif|ico|tiff?)(?:[?#]|$)/.test(lower)) return null;

  return "png";
}
/**
 * Maps a Markdown heading depth to the matching DOCX heading level.
 *
 * @param depth - Heading depth, 1 through 6
 * @returns The heading level, or `undefined` for any other depth
 */
function getHeadingLevel(
  depth: number,
): (typeof HeadingLevel)[keyof typeof HeadingLevel] | undefined {
  switch (depth) {
    case 1:
      return HeadingLevel.HEADING_1;
    case 2:
      return HeadingLevel.HEADING_2;
    case 3:
      return HeadingLevel.HEADING_3;
    case 4:
      return HeadingLevel.HEADING_4;
    case 5:
      return HeadingLevel.HEADING_5;
    case 6:
      return HeadingLevel.HEADING_6;
    default:
      return undefined;
  }
}
/**
 * Builds the download file name: a trailing `.md` (any letter case) is
 * removed and the given extension is appended.
 *
 * @param filename - Name without an extension, or ending in `.md`
 * @param extension - Extension to append, including the leading dot
 */
function normalizeFilename(filename: string, extension: string): string {
  const stem = filename.replace(/\.md$/i, "");
  return `${stem}${extension}`;
}
/**
 * Triggers a browser download of `blob` under the given file name by
 * clicking a temporary anchor element that points at an object URL.
 *
 * NOTE(review): the object URL is revoked synchronously right after the
 * click — confirm downloads complete reliably on all target browsers.
 */
function downloadBlob(blob: Blob, filename: string): void {
  const objectUrl = URL.createObjectURL(blob);

  const anchor = document.createElement("a");
  anchor.href = objectUrl;
  anchor.download = filename;

  // The anchor is attached only for the duration of the click.
  const host = document.body;
  host.appendChild(anchor);
  anchor.click();
  host.removeChild(anchor);

  URL.revokeObjectURL(objectUrl);
}