deerflow2/backend/packages/harness/deerflow/utils/file_conversion.py

48 lines
1.2 KiB
Python

"""File conversion utilities.
Converts document files (PDF, PPT, Excel, Word) to Markdown using markitdown.
No FastAPI or HTTP dependencies — pure utility functions.
"""
import logging
from pathlib import Path
logger = logging.getLogger(__name__)
# File extensions that should be converted to markdown
CONVERTIBLE_EXTENSIONS = {
".pdf",
".ppt",
".pptx",
".xls",
".xlsx",
".doc",
".docx",
}
async def convert_file_to_markdown(file_path: Path) -> Path | None:
"""Convert a file to markdown using markitdown.
Args:
file_path: Path to the file to convert.
Returns:
Path to the markdown file if conversion was successful, None otherwise.
"""
try:
from markitdown import MarkItDown
md = MarkItDown()
result = md.convert(str(file_path))
# Save as .md file with same name
md_path = file_path.with_suffix(".md")
md_path.write_text(result.text_content, encoding="utf-8")
logger.info(f"Converted {file_path.name} to markdown: {md_path.name}")
return md_path
except Exception as e:
logger.error(f"Failed to convert {file_path.name} to markdown: {e}")
return None