feat(uploads): inject document outline into agent context for converted files (#1738)
* feat(uploads): inject document outline into agent context for converted files
Extract headings from converted .md files and inject them into the
<uploaded_files> context block so the agent can navigate large documents
by line number before reading.
- Add `extract_outline()` to `file_conversion.py`: recognises standard
Markdown headings (#/##/###) and SEC-style bold structural headings
(**ITEM N. BUSINESS**, **PART II**); caps at 50 entries; excludes
cover-page boilerplate (WASHINGTON DC, CURRENT REPORT, SIGNATURES)
- Add `_extract_outline_for_file()` helper in `uploads_middleware.py`:
looks for a sibling `.md` file produced by the conversion pipeline
- Update `UploadsMiddleware._create_files_message()` to render the outline
under each file entry with `L{line}: {title}` format and a `read_file`
prompt for range-based reading
- Tests: 10 new tests for `extract_outline()`, 4 new tests for outline
injection in `UploadsMiddleware`; existing test updated for new `outline`
field in `uploaded_files` state
Partially addresses #1647 (agent ignores uploaded files).
* fix(uploads): stream outline file reads and strip inline bold from heading titles
- Switch extract_outline() from read_text().splitlines() to open()+line iteration
so large converted documents are not loaded into memory on every agent turn;
exits as soon as MAX_OUTLINE_ENTRIES is reached (Copilot suggestion)
- Strip **...** wrapper from standard Markdown heading titles before appending
to outline so agent context stays clean (e.g. "## **Overview**" → "Overview")
(Copilot suggestion)
- Remove unused pathlib.Path import and fix import sort order in test_file_conversion.py
to satisfy ruff CI lint
* fix(uploads): show truncation hint when outline exceeds MAX_OUTLINE_ENTRIES
When extract_outline() hits the cap it now appends a sentinel entry
{"truncated": True} instead of silently dropping the rest of the headings.
UploadsMiddleware reads the sentinel and renders a hint line:
... (showing first 50 headings; use `read_file` to explore further)
Without this the agent had no way to know the outline was incomplete and
would treat the first 50 headings as the full document structure.
* fix(uploads): fall back to configurable.thread_id when runtime.context lacks thread_id
runtime.context does not always carry thread_id (depends on LangGraph
invocation path). ThreadDataMiddleware already falls back to
get_config().configurable.thread_id — apply the same pattern so
UploadsMiddleware can resolve the uploads directory and attach outlines
in all invocation paths.
* style: apply ruff format
---------
Co-authored-by: Willem Jiang <willem.jiang@gmail.com>
This commit is contained in:
parent
2b73c9314f
commit
87f41d3ae8
|
|
@ -10,10 +10,27 @@ from langchain_core.messages import HumanMessage
|
||||||
from langgraph.runtime import Runtime
|
from langgraph.runtime import Runtime
|
||||||
|
|
||||||
from deerflow.config.paths import Paths, get_paths
|
from deerflow.config.paths import Paths, get_paths
|
||||||
|
from deerflow.utils.file_conversion import extract_outline
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_outline_for_file(file_path: Path) -> list[dict]:
    """Return the document outline for *file_path* if a converted .md exists.

    The upload conversion pipeline writes a sibling ``<stem>.md`` next to the
    original upload; when that file exists its headings are extracted via
    :func:`extract_outline`. Returns an empty list when the file was never
    converted or the converted document contains no headings.
    """
    converted = file_path.with_suffix(".md")
    if not converted.is_file():
        return []

    entries = extract_outline(converted)
    if entries:
        # Logged at debug level only — this runs on every agent turn.
        logger.debug("Extracted %d outline entries from %s", len(entries), file_path.name)
    return entries
|
||||||
|
|
||||||
|
|
||||||
class UploadsMiddlewareState(AgentState):
|
class UploadsMiddlewareState(AgentState):
|
||||||
"""State schema for uploads middleware."""
|
"""State schema for uploads middleware."""
|
||||||
|
|
||||||
|
|
@ -39,12 +56,31 @@ class UploadsMiddleware(AgentMiddleware[UploadsMiddlewareState]):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self._paths = Paths(base_dir) if base_dir else get_paths()
|
self._paths = Paths(base_dir) if base_dir else get_paths()
|
||||||
|
|
||||||
|
def _format_file_entry(self, file: dict, lines: list[str]) -> None:
|
||||||
|
"""Append a single file entry (name, size, path, optional outline) to lines."""
|
||||||
|
size_kb = file["size"] / 1024
|
||||||
|
size_str = f"{size_kb:.1f} KB" if size_kb < 1024 else f"{size_kb / 1024:.1f} MB"
|
||||||
|
lines.append(f"- {file['filename']} ({size_str})")
|
||||||
|
lines.append(f" Path: {file['path']}")
|
||||||
|
outline = file.get("outline") or []
|
||||||
|
if outline:
|
||||||
|
truncated = outline[-1].get("truncated", False) if outline else False
|
||||||
|
visible = [e for e in outline if not e.get("truncated")]
|
||||||
|
lines.append(" Document outline (use `read_file` with line ranges to read sections):")
|
||||||
|
for entry in visible:
|
||||||
|
lines.append(f" L{entry['line']}: {entry['title']}")
|
||||||
|
if truncated:
|
||||||
|
lines.append(f" ... (showing first {len(visible)} headings; use `read_file` to explore further)")
|
||||||
|
lines.append("")
|
||||||
|
|
||||||
def _create_files_message(self, new_files: list[dict], historical_files: list[dict]) -> str:
|
def _create_files_message(self, new_files: list[dict], historical_files: list[dict]) -> str:
|
||||||
"""Create a formatted message listing uploaded files.
|
"""Create a formatted message listing uploaded files.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
new_files: Files uploaded in the current message.
|
new_files: Files uploaded in the current message.
|
||||||
historical_files: Files uploaded in previous messages.
|
historical_files: Files uploaded in previous messages.
|
||||||
|
Each file dict may contain an optional ``outline`` key — a list of
|
||||||
|
``{title, line}`` dicts extracted from the converted Markdown file.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Formatted string inside <uploaded_files> tags.
|
Formatted string inside <uploaded_files> tags.
|
||||||
|
|
@ -55,23 +91,16 @@ class UploadsMiddleware(AgentMiddleware[UploadsMiddlewareState]):
|
||||||
lines.append("")
|
lines.append("")
|
||||||
if new_files:
|
if new_files:
|
||||||
for file in new_files:
|
for file in new_files:
|
||||||
size_kb = file["size"] / 1024
|
self._format_file_entry(file, lines)
|
||||||
size_str = f"{size_kb:.1f} KB" if size_kb < 1024 else f"{size_kb / 1024:.1f} MB"
|
|
||||||
lines.append(f"- {file['filename']} ({size_str})")
|
|
||||||
lines.append(f" Path: {file['path']}")
|
|
||||||
lines.append("")
|
|
||||||
else:
|
else:
|
||||||
lines.append("(empty)")
|
lines.append("(empty)")
|
||||||
|
lines.append("")
|
||||||
|
|
||||||
if historical_files:
|
if historical_files:
|
||||||
lines.append("The following files were uploaded in previous messages and are still available:")
|
lines.append("The following files were uploaded in previous messages and are still available:")
|
||||||
lines.append("")
|
lines.append("")
|
||||||
for file in historical_files:
|
for file in historical_files:
|
||||||
size_kb = file["size"] / 1024
|
self._format_file_entry(file, lines)
|
||||||
size_str = f"{size_kb:.1f} KB" if size_kb < 1024 else f"{size_kb / 1024:.1f} MB"
|
|
||||||
lines.append(f"- {file['filename']} ({size_str})")
|
|
||||||
lines.append(f" Path: {file['path']}")
|
|
||||||
lines.append("")
|
|
||||||
|
|
||||||
lines.append("You can read these files using the `read_file` tool with the paths shown above.")
|
lines.append("You can read these files using the `read_file` tool with the paths shown above.")
|
||||||
lines.append("</uploaded_files>")
|
lines.append("</uploaded_files>")
|
||||||
|
|
@ -172,9 +201,16 @@ class UploadsMiddleware(AgentMiddleware[UploadsMiddlewareState]):
|
||||||
"size": stat.st_size,
|
"size": stat.st_size,
|
||||||
"path": f"/mnt/user-data/uploads/{file_path.name}",
|
"path": f"/mnt/user-data/uploads/{file_path.name}",
|
||||||
"extension": file_path.suffix,
|
"extension": file_path.suffix,
|
||||||
|
"outline": _extract_outline_for_file(file_path),
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Attach outlines to new files as well
|
||||||
|
if uploads_dir:
|
||||||
|
for file in new_files:
|
||||||
|
phys_path = uploads_dir / file["filename"]
|
||||||
|
file["outline"] = _extract_outline_for_file(phys_path)
|
||||||
|
|
||||||
if not new_files and not historical_files:
|
if not new_files and not historical_files:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -5,6 +5,7 @@ No FastAPI or HTTP dependencies — pure utility functions.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
|
import re
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
@ -45,3 +46,90 @@ async def convert_file_to_markdown(file_path: Path) -> Path | None:
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Failed to convert {file_path.name} to markdown: {e}")
|
logger.error(f"Failed to convert {file_path.name} to markdown: {e}")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
# Bold-only lines that function as section headings.
#
# pymupdf4llm renders SEC-filing structural headings as **bold** rather than
# # headings (they use the same font size as body text and are distinguished
# only by bold+caps formatting). A line is accepted only when BOTH hold:
#   1. The entire line is a single **...** block — no surrounding prose.
#   2. The text starts with a recognised structural keyword:
#      ITEM / PART / SECTION (optionally followed by a number/letter),
#      or SCHEDULE, EXHIBIT, APPENDIX, ANNEX, CHAPTER.
# All-caps cover-page boilerplate ("CURRENT REPORT", "SIGNATURES",
# "WASHINGTON, DC 20549") starts with none of these keywords, so it is
# excluded automatically.
#
# Chinese headings (第三节...) already arrive as standard # headings from
# pymupdf4llm, so they need no special handling here.
_BOLD_HEADING_RE = re.compile(r"^\*\*((ITEM|PART|SECTION|SCHEDULE|EXHIBIT|APPENDIX|ANNEX|CHAPTER)\b[A-Z0-9 .,\-]*)\*\*\s*$")

# A title wrapped entirely in **...**, used to unwrap bold text appearing
# inside a standard # heading (e.g. "## **Overview**" -> "Overview").
_INLINE_BOLD_RE = re.compile(r"\*\*(.+?)\*\*")

# Hard cap on outline entries injected into the agent context.
# Keeps prompt size bounded even for very long documents.
MAX_OUTLINE_ENTRIES = 50


def extract_outline(md_path: Path) -> list[dict]:
    """Extract a document outline (headings) from a Markdown file.

    Two heading styles produced by pymupdf4llm are recognised:

    1. Standard Markdown headings — lines starting with one or more ``#``.
    2. Bold-only structural headings — ``**ITEM 1. BUSINESS**``, ``**PART II**``
       (SEC filings bold their section headings at body-text size, so the
       converter cannot promote them to ``#`` headings).

    The file is streamed line by line so large converted documents are never
    loaded into memory, and reading stops as soon as the cap is hit.

    Args:
        md_path: Path to the converted ``.md`` file.

    Returns:
        List of ``{"title": str, "line": int}`` dicts (1-based line numbers).
        When the scan stops at ``MAX_OUTLINE_ENTRIES``, a final sentinel
        ``{"truncated": True}`` entry is appended so callers can render a
        "showing first N headings" hint without re-scanning the file.
        An unreadable file, or one without headings, yields an empty list.
    """
    entries: list[dict] = []
    try:
        with md_path.open(encoding="utf-8") as fh:
            for lineno, raw in enumerate(fh, 1):
                text = raw.strip()
                if not text:
                    continue

                if text.startswith("#"):
                    # Style 1: standard Markdown heading. Drop the leading
                    # hashes, then unwrap a whole-title **...** if present so
                    # the agent context stays clean.
                    candidate = text.lstrip("#").strip()
                    if candidate:
                        wrapped = _INLINE_BOLD_RE.fullmatch(candidate)
                        entries.append(
                            {
                                "title": wrapped.group(1).strip() if wrapped else candidate,
                                "line": lineno,
                            }
                        )
                else:
                    # Style 2: the entire line is a bold structural heading.
                    bold = _BOLD_HEADING_RE.match(text)
                    if bold and (heading := bold.group(1).strip()):
                        entries.append({"title": heading, "line": lineno})

                if len(entries) >= MAX_OUTLINE_ENTRIES:
                    # Stop reading early; flag the truncation for callers.
                    entries.append({"truncated": True})
                    break
    except Exception:
        # Best-effort extraction: any I/O or decoding problem means "no outline".
        return []

    return entries
|
||||||
|
|
||||||
|
|
||||||
|
def _get_pdf_converter() -> str:
|
||||||
|
"""Read pdf_converter setting from app config, defaulting to 'auto'."""
|
||||||
|
try:
|
||||||
|
from deerflow.config.app_config import get_app_config
|
||||||
|
|
||||||
|
cfg = get_app_config()
|
||||||
|
uploads_cfg = getattr(cfg, "uploads", None)
|
||||||
|
if uploads_cfg is not None:
|
||||||
|
return str(getattr(uploads_cfg, "pdf_converter", "auto"))
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
return "auto"
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,130 @@
|
||||||
|
"""Tests for extract_outline() in file_conversion utilities (PR2: document outline injection)."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from deerflow.utils.file_conversion import (
|
||||||
|
MAX_OUTLINE_ENTRIES,
|
||||||
|
extract_outline,
|
||||||
|
)
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# extract_outline
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
class TestExtractOutline:
    """Tests for extract_outline().

    Every test writes a small Markdown fixture into pytest's ``tmp_path``
    and asserts on the ``{"title", "line"}`` dicts that come back.
    """

    def test_empty_file_returns_empty(self, tmp_path):
        """Empty markdown file yields no outline entries."""
        md = tmp_path / "empty.md"
        md.write_text("", encoding="utf-8")
        assert extract_outline(md) == []

    def test_missing_file_returns_empty(self, tmp_path):
        """Non-existent path returns [] without raising."""
        # extract_outline swallows I/O errors by design (best-effort helper).
        assert extract_outline(tmp_path / "nonexistent.md") == []

    def test_standard_markdown_headings(self, tmp_path):
        """# / ## / ### headings are all recognised."""
        md = tmp_path / "doc.md"
        md.write_text(
            "# Chapter One\n\nSome text.\n\n## Section 1.1\n\nMore text.\n\n### Sub 1.1.1\n",
            encoding="utf-8",
        )
        outline = extract_outline(md)
        assert len(outline) == 3
        # Line numbers are 1-based and point at the heading line itself.
        assert outline[0] == {"title": "Chapter One", "line": 1}
        assert outline[1] == {"title": "Section 1.1", "line": 5}
        assert outline[2] == {"title": "Sub 1.1.1", "line": 9}

    def test_bold_sec_item_heading(self, tmp_path):
        """**ITEM N. TITLE** lines in SEC filings are recognised."""
        md = tmp_path / "10k.md"
        md.write_text(
            "Cover page text.\n\n**ITEM 1. BUSINESS**\n\nBody.\n\n**ITEM 1A. RISK FACTORS**\n",
            encoding="utf-8",
        )
        outline = extract_outline(md)
        assert len(outline) == 2
        assert outline[0] == {"title": "ITEM 1. BUSINESS", "line": 3}
        assert outline[1] == {"title": "ITEM 1A. RISK FACTORS", "line": 7}

    def test_bold_part_heading(self, tmp_path):
        """**PART I** / **PART II** headings are recognised."""
        md = tmp_path / "10k.md"
        md.write_text("**PART I**\n\n**PART II**\n\n**PART III**\n", encoding="utf-8")
        outline = extract_outline(md)
        assert len(outline) == 3
        titles = [e["title"] for e in outline]
        assert "PART I" in titles
        assert "PART II" in titles
        assert "PART III" in titles

    def test_sec_cover_page_boilerplate_excluded(self, tmp_path):
        """Address lines and short cover boilerplate must NOT appear in outline."""
        md = tmp_path / "8k.md"
        md.write_text(
            "## **UNITED STATES SECURITIES AND EXCHANGE COMMISSION**\n\n**WASHINGTON, DC 20549**\n\n**CURRENT REPORT**\n\n**SIGNATURES**\n\n**TESLA, INC.**\n\n**ITEM 2.02. RESULTS OF OPERATIONS**\n",
            encoding="utf-8",
        )
        outline = extract_outline(md)
        titles = [e["title"] for e in outline]
        # Cover-page boilerplate should be excluded — none of these lines
        # start with a recognised structural keyword (ITEM/PART/...).
        assert "WASHINGTON, DC 20549" not in titles
        assert "CURRENT REPORT" not in titles
        assert "SIGNATURES" not in titles
        assert "TESLA, INC." not in titles
        # Real SEC heading must be included
        assert "ITEM 2.02. RESULTS OF OPERATIONS" in titles

    def test_chinese_headings_via_standard_markdown(self, tmp_path):
        """Chinese annual report headings emitted as # by pymupdf4llm are captured."""
        md = tmp_path / "annual.md"
        md.write_text(
            "# 第一节 公司简介\n\n内容。\n\n## 第三节 管理层讨论与分析\n\n分析内容。\n",
            encoding="utf-8",
        )
        outline = extract_outline(md)
        assert len(outline) == 2
        assert outline[0]["title"] == "第一节 公司简介"
        assert outline[1]["title"] == "第三节 管理层讨论与分析"

    def test_outline_capped_at_max_entries(self, tmp_path):
        """When truncated, result has MAX_OUTLINE_ENTRIES real entries + 1 sentinel."""
        lines = [f"# Heading {i}" for i in range(MAX_OUTLINE_ENTRIES + 10)]
        md = tmp_path / "long.md"
        md.write_text("\n".join(lines), encoding="utf-8")
        outline = extract_outline(md)
        # Last entry is the truncation sentinel
        assert outline[-1] == {"truncated": True}
        # Visible entries are exactly MAX_OUTLINE_ENTRIES
        visible = [e for e in outline if not e.get("truncated")]
        assert len(visible) == MAX_OUTLINE_ENTRIES

    def test_no_truncation_sentinel_when_under_limit(self, tmp_path):
        """Short documents produce no sentinel entry."""
        lines = [f"# Heading {i}" for i in range(5)]
        md = tmp_path / "short.md"
        md.write_text("\n".join(lines), encoding="utf-8")
        outline = extract_outline(md)
        assert len(outline) == 5
        assert not any(e.get("truncated") for e in outline)

    def test_blank_lines_and_whitespace_ignored(self, tmp_path):
        """Blank lines between headings do not produce empty entries."""
        md = tmp_path / "spaced.md"
        md.write_text("\n\n# Title One\n\n\n\n# Title Two\n\n", encoding="utf-8")
        outline = extract_outline(md)
        assert len(outline) == 2
        assert all(e["title"] for e in outline)

    def test_inline_bold_not_confused_with_heading(self, tmp_path):
        """Mid-sentence bold text must not be mistaken for a heading."""
        # The bold-heading regex anchors on the whole line, so inline bold
        # inside prose must never match.
        md = tmp_path / "prose.md"
        md.write_text(
            "This sentence has **bold words** inside it.\n\nAnother with **MULTIPLE CAPS** inline.\n",
            encoding="utf-8",
        )
        outline = extract_outline(md)
        assert outline == []
|
||||||
|
|
@ -289,6 +289,7 @@ class TestBeforeAgent:
|
||||||
"size": 5,
|
"size": 5,
|
||||||
"path": "/mnt/user-data/uploads/notes.txt",
|
"path": "/mnt/user-data/uploads/notes.txt",
|
||||||
"extension": ".txt",
|
"extension": ".txt",
|
||||||
|
"outline": [],
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
@ -339,3 +340,92 @@ class TestBeforeAgent:
|
||||||
result = mw.before_agent(self._state(msg), _runtime())
|
result = mw.before_agent(self._state(msg), _runtime())
|
||||||
|
|
||||||
assert result["messages"][-1].id == "original-id-42"
|
assert result["messages"][-1].id == "original-id-42"
|
||||||
|
|
||||||
|
def test_outline_injected_when_md_file_exists(self, tmp_path):
    """When a converted .md file exists alongside the upload, its outline is injected."""
    mw = _middleware(tmp_path)
    uploads_dir = _uploads_dir(tmp_path)
    (uploads_dir / "report.pdf").write_bytes(b"%PDF fake")
    # Simulate the .md produced by the conversion pipeline
    (uploads_dir / "report.md").write_text(
        "# PART I\n\n## ITEM 1. BUSINESS\n\nBody text.\n\n## ITEM 2. RISK\n",
        encoding="utf-8",
    )

    msg = _human("summarise", files=[{"filename": "report.pdf", "size": 9, "path": "/mnt/user-data/uploads/report.pdf"}])
    result = mw.before_agent(self._state(msg), _runtime())

    # Every heading plus the read_file hint must land in the injected
    # <uploaded_files> message (the last message in the returned state).
    assert result is not None
    content = result["messages"][-1].content
    assert "Document outline" in content
    assert "PART I" in content
    assert "ITEM 1. BUSINESS" in content
    assert "ITEM 2. RISK" in content
    assert "read_file" in content
|
||||||
|
|
||||||
|
def test_no_outline_when_no_md_file(self, tmp_path):
    """Files without a sibling .md have no outline section."""
    mw = _middleware(tmp_path)
    uploads_dir = _uploads_dir(tmp_path)
    # No data.md is created, so outline extraction should find nothing.
    (uploads_dir / "data.xlsx").write_bytes(b"fake-xlsx")

    msg = _human("analyse", files=[{"filename": "data.xlsx", "size": 9, "path": "/mnt/user-data/uploads/data.xlsx"}])
    result = mw.before_agent(self._state(msg), _runtime())

    assert result is not None
    content = result["messages"][-1].content
    assert "Document outline" not in content
|
||||||
|
|
||||||
|
def test_outline_truncation_hint_shown(self, tmp_path):
    """When outline is truncated, a hint line is appended after the last visible entry."""
    from deerflow.utils.file_conversion import MAX_OUTLINE_ENTRIES

    mw = _middleware(tmp_path)
    uploads_dir = _uploads_dir(tmp_path)
    (uploads_dir / "big.pdf").write_bytes(b"%PDF fake")
    # Write MAX_OUTLINE_ENTRIES + 5 headings so truncation is triggered
    headings = "\n".join(f"# Heading {i}" for i in range(MAX_OUTLINE_ENTRIES + 5))
    (uploads_dir / "big.md").write_text(headings, encoding="utf-8")

    msg = _human("read", files=[{"filename": "big.pdf", "size": 9, "path": "/mnt/user-data/uploads/big.pdf"}])
    result = mw.before_agent(self._state(msg), _runtime())

    # The sentinel entry must be rendered as a human-readable hint, not as
    # a fake heading line.
    assert result is not None
    content = result["messages"][-1].content
    assert f"showing first {MAX_OUTLINE_ENTRIES} headings" in content
    assert "use `read_file` to explore further" in content
|
||||||
|
|
||||||
|
def test_no_truncation_hint_for_short_outline(self, tmp_path):
    """Short outlines (under the cap) must not show a truncation hint."""
    mw = _middleware(tmp_path)
    uploads_dir = _uploads_dir(tmp_path)
    (uploads_dir / "short.pdf").write_bytes(b"%PDF fake")
    # Two headings — far below MAX_OUTLINE_ENTRIES, so no sentinel is emitted.
    (uploads_dir / "short.md").write_text("# Intro\n\n# Conclusion\n", encoding="utf-8")

    msg = _human("read", files=[{"filename": "short.pdf", "size": 9, "path": "/mnt/user-data/uploads/short.pdf"}])
    result = mw.before_agent(self._state(msg), _runtime())

    assert result is not None
    content = result["messages"][-1].content
    assert "showing first" not in content
|
||||||
|
|
||||||
|
def test_historical_file_outline_injected(self, tmp_path):
    """Outline is also shown for historical (previously uploaded) files."""
    mw = _middleware(tmp_path)
    uploads_dir = _uploads_dir(tmp_path)
    # Historical file with .md — presumably discovered by the middleware
    # scanning the uploads directory rather than from the message payload.
    (uploads_dir / "old_report.pdf").write_bytes(b"%PDF old")
    (uploads_dir / "old_report.md").write_text(
        "# Chapter 1\n\n# Chapter 2\n",
        encoding="utf-8",
    )
    # New file without .md
    (uploads_dir / "new.txt").write_bytes(b"new")

    msg = _human("go", files=[{"filename": "new.txt", "size": 3, "path": "/mnt/user-data/uploads/new.txt"}])
    result = mw.before_agent(self._state(msg), _runtime())

    # The historical file's outline must appear even though only new.txt was
    # attached to the current message.
    assert result is not None
    content = result["messages"][-1].content
    assert "Chapter 1" in content
    assert "Chapter 2" in content
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue