"""File conversion utilities.

Converts document files (PDF, PPT, Excel, Word) to Markdown using markitdown,
and extracts a heading outline from the resulting Markdown files.
No FastAPI or HTTP dependencies — pure utility functions.
"""

import asyncio
import logging
import re
from pathlib import Path

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Suffixes (lowercase, with leading dot) of the document formats that should
# be converted to Markdown.
CONVERTIBLE_EXTENSIONS = {
    # PDF
    ".pdf",
    # PowerPoint
    ".ppt", ".pptx",
    # Excel
    ".xls", ".xlsx",
    # Word
    ".doc", ".docx",
}


def _convert_to_markdown_sync(file_path: Path) -> Path:
    """Run the blocking markitdown conversion and write the ``.md`` sibling.

    Args:
        file_path: Path to the file to convert.

    Returns:
        Path of the written Markdown file (same location and stem as
        ``file_path``, with the suffix replaced by ``.md``).

    Raises:
        Exception: Anything raised by importing markitdown, by the conversion
            itself, or by writing the output file. The async wrapper turns
            these into a ``None`` result.
    """
    # Imported inside the guarded call path so that a missing markitdown
    # install is reported as a failed conversion rather than an import error
    # of this module.
    from markitdown import MarkItDown

    result = MarkItDown().convert(str(file_path))

    # Save as .md file with the same name, next to the source file.
    md_path = file_path.with_suffix(".md")
    md_path.write_text(result.text_content, encoding="utf-8")
    return md_path


async def convert_file_to_markdown(file_path: Path) -> Path | None:
    """Convert a file to markdown using markitdown.

    The conversion and the file write are blocking calls, so they are run in
    a worker thread via ``asyncio.to_thread`` instead of directly inside the
    coroutine; this keeps the event loop free while the document is processed.

    Args:
        file_path: Path to the file to convert.

    Returns:
        Path to the markdown file if conversion was successful, None otherwise.
        Failures are logged (with traceback) and never raised to the caller.
    """
    try:
        md_path = await asyncio.to_thread(_convert_to_markdown_sync, file_path)
    except Exception as e:
        # Best-effort API: any failure is reported as "no markdown produced".
        logger.exception("Failed to convert %s to markdown: %s", file_path.name, e)
        return None

    logger.info("Converted %s to markdown: %s", file_path.name, md_path.name)
    return md_path


# Regex for bold-only lines that look like section headings.
# Targets SEC filing structural headings that pymupdf4llm renders as **bold**
# rather than # Markdown headings (because they use same font size as body text,
# distinguished only by bold+caps formatting).
#
# Pattern requires ALL of:
#   1. Entire line is a single **...** block (no surrounding prose)
#   2. Starts with a recognised structural keyword:
#        - ITEM / PART / SECTION (with optional number/letter after)
#        - SCHEDULE, EXHIBIT, APPENDIX, ANNEX, CHAPTER
#      All-caps addresses, boilerplate ("CURRENT REPORT", "SIGNATURES",
#      "WASHINGTON, DC 20549") do NOT start with these keywords and are excluded.
#
# Chinese headings (第三节...) are already captured as standard # headings
# by pymupdf4llm, so they don't need this pattern.
#
# NOTE(review): the trailing character class only allows A-Z, digits, space,
# '.', ',' and '-', so a bold heading containing an apostrophe or '&'
# (e.g. "ITEM 7. MANAGEMENT'S DISCUSSION ...") does not match — confirm
# whether that is intended.
_BOLD_HEADING_RE = re.compile(r"^\*\*((ITEM|PART|SECTION|SCHEDULE|EXHIBIT|APPENDIX|ANNEX|CHAPTER)\b[A-Z0-9 .,\-]*)\*\*\s*$")

# A heading title that is wrapped, in its entirety, in bold markers
# (e.g. "**Overview**"); group 1 is the text between the markers.
_BOLD_WRAPPED_TITLE_RE = re.compile(r"\*\*(.+?)\*\*")

# Maximum number of outline entries injected into the agent context.
# Keeps prompt size bounded even for very long documents.
MAX_OUTLINE_ENTRIES = 50


def _heading_title(stripped: str) -> str:
    """Return the heading title carried by an already-stripped line.

    Args:
        stripped: One line of the Markdown file with surrounding whitespace
            removed.

    Returns:
        The title text, or ``""`` when the line is not a heading or the
        heading has no text.
    """
    # Style 1: standard Markdown heading (line starts with one or more '#').
    if stripped.startswith("#"):
        title = stripped.lstrip("#").strip()
        # Strip any inline **...** wrapping (e.g. "## **Overview**" → "Overview").
        if unwrapped := _BOLD_WRAPPED_TITLE_RE.fullmatch(title):
            title = unwrapped.group(1).strip()
        return title

    # Style 2: bold-only structural heading (entire line is **KEYWORD ...**).
    if bold := _BOLD_HEADING_RE.match(stripped):
        return bold.group(1).strip()

    return ""


def extract_outline(md_path: Path) -> list[dict]:
    """Extract document outline (headings) from a Markdown file.

    Recognises two heading styles produced by pymupdf4llm:
      1. Standard Markdown headings: lines starting with one or more '#'
      2. Bold-only structural headings: **ITEM 1. BUSINESS**, **PART II**, etc.
         (SEC filings use bold+caps for section headings with the same font size
         as body text, so pymupdf4llm cannot promote them to # headings)

    Headings whose title is empty (e.g. a bare ``###``) are skipped.

    Args:
        md_path: Path to the .md file.

    Returns:
        List of dicts with keys: title (str), line (int, 1-based).
        At most MAX_OUTLINE_ENTRIES headings are returned. Only when the file
        contains at least one further heading beyond that limit is a sentinel
        entry ``{"truncated": True}`` appended as the last element, so callers
        can render a "showing first N headings" hint without re-scanning the
        file. A file with exactly MAX_OUTLINE_ENTRIES headings is not marked
        as truncated.
        Returns an empty list if the file cannot be read or has no headings.
    """
    outline: list[dict] = []
    try:
        with md_path.open(encoding="utf-8") as f:
            for lineno, line in enumerate(f, 1):
                title = _heading_title(line.strip())
                if not title:
                    continue

                # The limit was already reached and here is one more heading:
                # the outline really is truncated, so flag it and stop reading.
                if len(outline) >= MAX_OUTLINE_ENTRIES:
                    outline.append({"truncated": True})
                    break

                outline.append({"title": title, "line": lineno})
    except Exception:
        # Documented best-effort contract: an unreadable or undecodable file
        # yields no outline at all (partial results are discarded).
        return []

    return outline


def _get_pdf_converter() -> str:
|
|
"""Read pdf_converter setting from app config, defaulting to 'auto'."""
|
|
try:
|
|
from deerflow.config.app_config import get_app_config
|
|
|
|
cfg = get_app_config()
|
|
uploads_cfg = getattr(cfg, "uploads", None)
|
|
if uploads_cfg is not None:
|
|
return str(getattr(uploads_cfg, "pdf_converter", "auto"))
|
|
except Exception:
|
|
pass
|
|
return "auto"
|