fix(artifacts): 修复文件名差异导致 Artifact not found

This commit is contained in:
肖应宇 2026-04-17 13:50:44 +08:00
parent c667faad65
commit 33705637ea
4 changed files with 72 additions and 1 deletions

View File

@ -1,5 +1,7 @@
import logging
import mimetypes
import re
import unicodedata
import zipfile
from pathlib import Path
from urllib.parse import quote
@ -19,6 +21,9 @@ ACTIVE_CONTENT_MIME_TYPES = {
"image/svg+xml",
}
# Matches one dash-like character together with any whitespace on either side
# of it. The character class covers the ASCII hyphen-minus, the code points
# U+2010 through U+2014, and U+2212 (minus sign).
_DASH_VARIANTS_RE = re.compile(r"\s*[-\u2010\u2011\u2012\u2013\u2014\u2212]\s*")
# Matches a run of one or more whitespace characters.
_WHITESPACE_RE = re.compile(r"\s+")
def _build_content_disposition(disposition_type: str, filename: str) -> str:
"""Build an RFC 5987 encoded Content-Disposition header value."""
@ -32,6 +37,31 @@ def _build_attachment_headers(filename: str, extra_headers: dict[str, str] | Non
return headers
def _canonicalize_filename_for_lookup(filename: str) -> str:
    """Return the canonical spelling of *filename* used for compatibility lookup.

    The name is NFKC-normalized and stripped of leading/trailing whitespace,
    then every match of ``_DASH_VARIANTS_RE`` is replaced by a plain ``-`` and
    every match of ``_WHITESPACE_RE`` by a single space, in that order.
    """
    canonical = unicodedata.normalize("NFKC", filename)
    canonical = canonical.strip()
    dashes_unified = _DASH_VARIANTS_RE.sub("-", canonical)
    return _WHITESPACE_RE.sub(" ", dashes_unified)
def _find_compat_filename_match(missing_path: Path) -> Path | None:
    """Find a same-directory file whose canonicalized name uniquely matches.

    Args:
        missing_path: Path that was requested but does not exist on disk.

    Returns:
        The single regular file in ``missing_path.parent`` whose canonicalized
        name equals the canonicalized name of ``missing_path``. ``None`` is
        returned when the parent is not a directory, when no file matches,
        when more than one file matches (ambiguous), or when the directory
        cannot be scanned.
    """
    parent = missing_path.parent
    if not parent.is_dir():
        return None

    target_name = _canonicalize_filename_for_lookup(missing_path.name)
    unique_match = None
    try:
        for candidate in parent.iterdir():
            if not candidate.is_file():
                continue
            if _canonicalize_filename_for_lookup(candidate.name) != target_name:
                continue
            if unique_match is not None:
                # A second hit makes the lookup ambiguous; stop scanning.
                return None
            unique_match = candidate
    except OSError:
        # This is a best-effort fallback: if the directory vanished after the
        # is_dir() check or cannot be read, report "no match" so the caller
        # keeps its normal not-found handling. A partial scan cannot prove
        # uniqueness, so any match found so far is discarded as well.
        return None
    return unique_match
def is_text_file_by_content(path: Path, sample_size: int = 8192) -> bool:
"""Check if file is text by examining content for null bytes."""
try:
@ -157,7 +187,15 @@ async def get_artifact(thread_id: str, path: str, request: Request, download: bo
logger.info(f"Resolving artifact path: thread_id={thread_id}, requested_path={path}, actual_path={actual_path}")
if not actual_path.exists():
raise HTTPException(status_code=404, detail=f"Artifact not found: {path}")
compat_path = _find_compat_filename_match(actual_path)
if compat_path is None:
raise HTTPException(status_code=404, detail=f"Artifact not found: {path}")
logger.info(
"Artifact compatibility fallback applied: requested_path=%s, resolved_path=%s",
actual_path,
compat_path,
)
actual_path = compat_path
if not actual_path.is_file():
raise HTTPException(status_code=400, detail=f"Path is not a file: {path}")

View File

@ -56,6 +56,11 @@ def _normalize_presented_filepath(
except ValueError as exc:
raise ValueError(f"Only files in {OUTPUTS_VIRTUAL_PREFIX} can be presented: {filepath}") from exc
if not actual_path.exists():
raise ValueError(f"File does not exist: {filepath}")
if not actual_path.is_file():
raise ValueError(f"Path is not a file: {filepath}")
return f"{OUTPUTS_VIRTUAL_PREFIX}/{relative_path.as_posix()}"

View File

@ -117,3 +117,16 @@ def test_get_artifact_pdf_with_no_null_bytes_and_non_utf8_content_is_served_inli
assert bytes(response.body) == binary_content
assert response.media_type == "application/pdf"
assert response.headers.get("content-disposition", "").startswith("inline;")
def test_get_artifact_compat_fallback_for_dash_spacing(tmp_path, monkeypatch) -> None:
    """A request spelled with spaces around the dash is served from the file stored without them."""
    stored_file = tmp_path / "xhs-note-唯-疲劳端茶.md"
    stored_file.write_text("ok", encoding="utf-8")
    spaced_request = tmp_path / "xhs-note-唯 - 疲劳端茶.md"

    def _resolve_to_spaced_request(_thread_id, _path):
        # Always resolve to the spaced spelling, which does not exist on disk.
        return spaced_request

    monkeypatch.setattr(artifacts_router, "resolve_thread_virtual_path", _resolve_to_spaced_request)

    pending_call = artifacts_router.get_artifact(
        "thread-1",
        "mnt/user-data/outputs/xhs-note-唯 - 疲劳端茶.md",
        _make_request(),
    )
    response = asyncio.run(pending_call)

    assert bytes(response.body).decode("utf-8") == "ok"
    assert response.media_type == "text/markdown"

View File

@ -66,3 +66,18 @@ def test_present_files_rejects_paths_outside_outputs(tmp_path):
assert "artifacts" not in result.update
assert result.update["messages"][0].content == f"Error: Only files in /mnt/user-data/outputs can be presented: {leaked_path}"
def test_present_files_rejects_nonexistent_file_in_outputs(tmp_path):
    """Presenting a path inside the outputs directory that does not exist yields an error message and no artifacts."""
    outputs_root = tmp_path.joinpath("threads", "thread-1", "user-data", "outputs")
    outputs_root.mkdir(parents=True)
    absent_file = outputs_root / "missing.md"

    call_kwargs = {
        "runtime": _make_runtime(str(outputs_root)),
        "filepaths": [str(absent_file)],
        "tool_call_id": "tc-4",
    }
    result = present_file_tool_module.present_file_tool.func(**call_kwargs)

    assert "artifacts" not in result.update
    first_message = result.update["messages"][0]
    assert first_message.content == f"Error: File does not exist: {absent_file}"