fix(artifacts): 修复文件名差异导致 Artifact not found

This commit is contained in:
肖应宇 2026-04-17 13:50:44 +08:00
parent c667faad65
commit 33705637ea
4 changed files with 72 additions and 1 deletions

View File

@ -1,5 +1,7 @@
import logging import logging
import mimetypes import mimetypes
import re
import unicodedata
import zipfile import zipfile
from pathlib import Path from pathlib import Path
from urllib.parse import quote from urllib.parse import quote
@ -19,6 +21,9 @@ ACTIVE_CONTENT_MIME_TYPES = {
"image/svg+xml", "image/svg+xml",
} }
# Matches one dash-like character together with any whitespace on either side.
# The character class covers ASCII hyphen-minus plus U+2010 (hyphen),
# U+2011 (non-breaking hyphen), U+2012 (figure dash), U+2013 (en dash),
# U+2014 (em dash) and U+2212 (minus sign).
_DASH_VARIANTS_RE = re.compile(r"\s*[-\u2010\u2011\u2012\u2013\u2014\u2212]\s*")
# Matches a run of one or more whitespace characters.
_WHITESPACE_RE = re.compile(r"\s+")
def _build_content_disposition(disposition_type: str, filename: str) -> str: def _build_content_disposition(disposition_type: str, filename: str) -> str:
"""Build an RFC 5987 encoded Content-Disposition header value.""" """Build an RFC 5987 encoded Content-Disposition header value."""
@ -32,6 +37,31 @@ def _build_attachment_headers(filename: str, extra_headers: dict[str, str] | Non
return headers return headers
def _canonicalize_filename_for_lookup(filename: str) -> str:
    """Return the canonical key used for conservative compatibility lookup.

    Two filenames that differ only in Unicode compatibility forms, in which
    dash character they use, in whitespace around dashes, or in the amount
    of inner whitespace produce the same key.

    Args:
        filename: A bare file name (no directory components expected).

    Returns:
        The NFKC-normalized, stripped name with every dash variant (and the
        whitespace around it) replaced by ``-`` and every remaining
        whitespace run replaced by a single space.
    """
    # NFKC first so compatibility characters (e.g. full-width forms) are
    # folded before the dash and whitespace patterns are applied.
    folded = unicodedata.normalize("NFKC", filename).strip()
    dashes_unified = _DASH_VARIANTS_RE.sub("-", folded)
    return _WHITESPACE_RE.sub(" ", dashes_unified)
def _find_compat_filename_match(missing_path: Path) -> Path | None:
    """Find a same-directory file whose canonicalized name uniquely matches.

    Only regular files directly inside ``missing_path.parent`` are
    considered, and names are compared through
    ``_canonicalize_filename_for_lookup``.

    Args:
        missing_path: The path that was requested but does not exist.

    Returns:
        The single sibling file whose canonical name equals that of
        ``missing_path``; ``None`` when the parent is not a directory, when
        the directory cannot be listed, or when zero or several files match
        (an ambiguous match is never guessed).
    """
    parent = missing_path.parent
    if not parent.is_dir():
        return None

    target_name = _canonicalize_filename_for_lookup(missing_path.name)
    match: Path | None = None
    try:
        for candidate in parent.iterdir():
            if not candidate.is_file():
                continue
            if _canonicalize_filename_for_lookup(candidate.name) != target_name:
                continue
            if match is not None:
                # A second hit makes the lookup ambiguous; stop scanning.
                return None
            match = candidate
    except OSError as exc:
        # The directory may disappear or become unreadable between the
        # is_dir() check and the listing. Treat that as "no match" so the
        # caller reports a missing artifact instead of crashing.
        logging.getLogger(__name__).warning(
            "Could not list %s for compatibility lookup: %s", parent, exc
        )
        return None
    return match
def is_text_file_by_content(path: Path, sample_size: int = 8192) -> bool: def is_text_file_by_content(path: Path, sample_size: int = 8192) -> bool:
"""Check if file is text by examining content for null bytes.""" """Check if file is text by examining content for null bytes."""
try: try:
@ -157,7 +187,15 @@ async def get_artifact(thread_id: str, path: str, request: Request, download: bo
logger.info(f"Resolving artifact path: thread_id={thread_id}, requested_path={path}, actual_path={actual_path}") logger.info(f"Resolving artifact path: thread_id={thread_id}, requested_path={path}, actual_path={actual_path}")
if not actual_path.exists(): if not actual_path.exists():
raise HTTPException(status_code=404, detail=f"Artifact not found: {path}") compat_path = _find_compat_filename_match(actual_path)
if compat_path is None:
raise HTTPException(status_code=404, detail=f"Artifact not found: {path}")
logger.info(
"Artifact compatibility fallback applied: requested_path=%s, resolved_path=%s",
actual_path,
compat_path,
)
actual_path = compat_path
if not actual_path.is_file(): if not actual_path.is_file():
raise HTTPException(status_code=400, detail=f"Path is not a file: {path}") raise HTTPException(status_code=400, detail=f"Path is not a file: {path}")

View File

@ -56,6 +56,11 @@ def _normalize_presented_filepath(
except ValueError as exc: except ValueError as exc:
raise ValueError(f"Only files in {OUTPUTS_VIRTUAL_PREFIX} can be presented: {filepath}") from exc raise ValueError(f"Only files in {OUTPUTS_VIRTUAL_PREFIX} can be presented: {filepath}") from exc
if not actual_path.exists():
raise ValueError(f"File does not exist: {filepath}")
if not actual_path.is_file():
raise ValueError(f"Path is not a file: {filepath}")
return f"{OUTPUTS_VIRTUAL_PREFIX}/{relative_path.as_posix()}" return f"{OUTPUTS_VIRTUAL_PREFIX}/{relative_path.as_posix()}"

View File

@ -117,3 +117,16 @@ def test_get_artifact_pdf_with_no_null_bytes_and_non_utf8_content_is_served_inli
assert bytes(response.body) == binary_content assert bytes(response.body) == binary_content
assert response.media_type == "application/pdf" assert response.media_type == "application/pdf"
assert response.headers.get("content-disposition", "").startswith("inline;") assert response.headers.get("content-disposition", "").startswith("inline;")
def test_get_artifact_compat_fallback_for_dash_spacing(tmp_path, monkeypatch) -> None:
    """A request with spaces around the dash is served from the file stored without them."""
    stored_file = tmp_path / "xhs-note-唯-疲劳端茶.md"
    stored_file.write_text("ok", encoding="utf-8")

    # The resolver is patched to point at a path that does not exist on disk
    # and differs from the stored file only by the spaces around the dash.
    requested_path = tmp_path / "xhs-note-唯 - 疲劳端茶.md"
    monkeypatch.setattr(
        artifacts_router,
        "resolve_thread_virtual_path",
        lambda _thread_id, _path: requested_path,
    )

    virtual_path = "mnt/user-data/outputs/xhs-note-唯 - 疲劳端茶.md"
    response = asyncio.run(
        artifacts_router.get_artifact("thread-1", virtual_path, _make_request())
    )

    assert bytes(response.body).decode("utf-8") == "ok"
    assert response.media_type == "text/markdown"

View File

@ -66,3 +66,18 @@ def test_present_files_rejects_paths_outside_outputs(tmp_path):
assert "artifacts" not in result.update assert "artifacts" not in result.update
assert result.update["messages"][0].content == f"Error: Only files in /mnt/user-data/outputs can be presented: {leaked_path}" assert result.update["messages"][0].content == f"Error: Only files in /mnt/user-data/outputs can be presented: {leaked_path}"
def test_present_files_rejects_nonexistent_file_in_outputs(tmp_path):
    """Presenting a path inside outputs that was never created yields an error message."""
    outputs_dir = tmp_path.joinpath("threads", "thread-1", "user-data", "outputs")
    outputs_dir.mkdir(parents=True)

    # Deliberately never written to disk.
    missing_path = outputs_dir / "missing.md"

    runtime = _make_runtime(str(outputs_dir))
    result = present_file_tool_module.present_file_tool.func(
        runtime=runtime,
        filepaths=[str(missing_path)],
        tool_call_id="tc-4",
    )

    update = result.update
    assert "artifacts" not in update
    assert update["messages"][0].content == f"Error: File does not exist: {missing_path}"