diff --git a/backend/packages/harness/deerflow/agents/lead_agent/prompt.py b/backend/packages/harness/deerflow/agents/lead_agent/prompt.py index 717ff9f9..e9164ffb 100644 --- a/backend/packages/harness/deerflow/agents/lead_agent/prompt.py +++ b/backend/packages/harness/deerflow/agents/lead_agent/prompt.py @@ -266,7 +266,9 @@ You: "Deploying to staging..." [proceed] **File Management:** - Uploaded files are automatically listed in the section before each request -- Use `read_file` tool to read uploaded files using their paths from the list +- Mentioned files are listed in the section when references are present +- Treat "files the user sent" as the conversation-level union of uploaded + mentioned files (deduplicated by file path) +- Use `read_file` tool to read listed files using their paths from the file-context sections - For PDF, PPT, Excel, and Word files, converted Markdown versions (*.md) are available alongside originals - All temporary work happens in `/mnt/user-data/workspace` - Final deliverables must be copied to `/mnt/user-data/outputs` and presented using `present_files` tool diff --git a/backend/packages/harness/deerflow/agents/memory/prompt.py b/backend/packages/harness/deerflow/agents/memory/prompt.py index 47b35e2a..320246cd 100644 --- a/backend/packages/harness/deerflow/agents/memory/prompt.py +++ b/backend/packages/harness/deerflow/agents/memory/prompt.py @@ -343,11 +343,15 @@ def format_conversation_for_update(messages: list[Any]) -> str: text_parts.append(text_val) content = " ".join(text_parts) if text_parts else str(content) - # Strip uploaded_files tags from human messages to avoid persisting - # ephemeral file path info into long-term memory. Skip the turn entirely - # when nothing remains after stripping (upload-only message). + # Strip file-context tags from human messages to avoid persisting + # ephemeral file path info into long-term memory. Skip the turn entirely + # when nothing remains after stripping (file-context-only message). if role == "human": - content = re.sub(r"[\s\S]*?\n*", "", str(content)).strip() + content = re.sub( + r"<(?:uploaded_files|mentioned_files|sent_files_semantics)>[\s\S]*?\n*", + "", + str(content), + ).strip() if not content: continue diff --git a/backend/packages/harness/deerflow/agents/memory/updater.py b/backend/packages/harness/deerflow/agents/memory/updater.py index 6e3f9481..405a90c0 100644 --- a/backend/packages/harness/deerflow/agents/memory/updater.py +++ b/backend/packages/harness/deerflow/agents/memory/updater.py @@ -213,6 +213,7 @@ _UPLOAD_SENTENCE_RE = re.compile( r"|/mnt/user-data/uploads/" r"|" r"|" + r"|" r")[^.!?]*[.!?]?\s*", re.IGNORECASE, ) diff --git a/backend/packages/harness/deerflow/agents/middlewares/memory_middleware.py b/backend/packages/harness/deerflow/agents/middlewares/memory_middleware.py index b63e8be7..a0fc7c60 100644 --- a/backend/packages/harness/deerflow/agents/middlewares/memory_middleware.py +++ b/backend/packages/harness/deerflow/agents/middlewares/memory_middleware.py @@ -15,7 +15,7 @@ from deerflow.config.memory_config import get_memory_config logger = logging.getLogger(__name__) _UPLOAD_BLOCK_RE = re.compile( - r"<(?:uploaded_files|mentioned_files)>[\s\S]*?\n*", + r"<(?:uploaded_files|mentioned_files|sent_files_semantics)>[\s\S]*?\n*", re.IGNORECASE, ) _CORRECTION_PATTERNS = ( diff --git a/backend/packages/harness/deerflow/agents/middlewares/uploads_middleware.py b/backend/packages/harness/deerflow/agents/middlewares/uploads_middleware.py index 4459e9e4..7878995e 100644 --- a/backend/packages/harness/deerflow/agents/middlewares/uploads_middleware.py +++ b/backend/packages/harness/deerflow/agents/middlewares/uploads_middleware.py @@ -145,6 +145,72 @@ class UploadsMiddleware(AgentMiddleware[UploadsMiddlewareState]): return "\n".join(lines) + def _merge_sent_files(self, uploaded_files: list[dict], mention_files: list[dict]) -> list[dict]: + """Build conversation-level sent-files view (uploads ∪ mentions, deduped by path).""" + + merged: dict[str, dict] = {} + + def _upsert(file: dict, source: str) -> None: + path = file.get("path") or "" + if not path: + return + entry = merged.get(path) + if entry is None: + entry = { + "filename": file.get("filename") or Path(path).name, + "path": path, + "size": int(file.get("size") or 0), + "sent_sources": set(), + } + merged[path] = entry + entry["sent_sources"].add(source) + entry["size"] = max(entry["size"], int(file.get("size") or 0)) + if source == "mention" and file.get("ref_source"): + entry["ref_source"] = file["ref_source"] + + for file in uploaded_files: + _upsert(file, "upload") + for file in mention_files: + _upsert(file, "mention") + + ordered = sorted( + merged.values(), + key=lambda f: (str(f.get("filename", "")).lower(), str(f.get("path", "")).lower()), + ) + for file in ordered: + sources = file.get("sent_sources") or set() + if "upload" in sources and "mention" in sources: + file["sent_source_label"] = "upload+mention" + elif "upload" in sources: + file["sent_source_label"] = "upload" + else: + file["sent_source_label"] = "mention" + return ordered + + def _create_sent_files_summary(self, sent_files: list[dict]) -> str: + """Create policy block describing unified 'sent files' semantics.""" + lines = [ + "", + "Conversation attachment semantics:", + "- Treat uploaded files and mentioned files as one unified concept of files the user has sent.", + "- For questions like 'what files did I send' or 'how many files did I send', use the conversation-level union of uploaded + mentioned files.", + "- Count unique files by path (deduplicated).", + "", + "Conversation-level sent files (deduplicated):", + ] + if sent_files: + for file in sent_files: + size_kb = file["size"] / 1024 + size_str = f"{size_kb:.1f} KB" if size_kb < 1024 else f"{size_kb / 1024:.1f} MB" + lines.append( + f"- {file['filename']} ({size_str}, source: {file['sent_source_label']})" + ) + lines.append(f" Path: {file['path']}") + else: + lines.append("- (none)") + lines.append("") + return "\n".join(lines) + def _mentioned_files_from_kwargs(self, message: HumanMessage) -> list[dict]: """Extract mention references from additional_kwargs.files. @@ -186,7 +252,7 @@ class UploadsMiddleware(AgentMiddleware[UploadsMiddlewareState]): return references def _create_mentions_message(self, mention_files: list[dict]) -> str: - lines = ["", "The following files were referenced in this message:", ""] + lines = ["", "The following files were referenced by the user in this conversation:", ""] for file in mention_files: size_kb = file["size"] / 1024 size_str = f"{size_kb:.1f} KB" if size_kb < 1024 else f"{size_kb / 1024:.1f} MB" @@ -199,6 +265,21 @@ class UploadsMiddleware(AgentMiddleware[UploadsMiddlewareState]): lines.append("") return "\n".join(lines) + def _mentioned_files_from_messages(self, messages: list) -> list[dict]: + """Extract mention references across conversation messages.""" + references: list[dict] = [] + seen: set[tuple[str, str]] = set() + for message in messages: + if not isinstance(message, HumanMessage): + continue + for file in self._mentioned_files_from_kwargs(message): + key = (file["filename"], file["path"]) + if key in seen: + continue + seen.add(key) + references.append(file) + return references + def _files_from_kwargs(self, message: HumanMessage, uploads_dir: Path | None = None) -> list[dict] | None: """Extract file info from message additional_kwargs.files. @@ -282,7 +363,7 @@ class UploadsMiddleware(AgentMiddleware[UploadsMiddlewareState]): # Get newly uploaded files from the current message's additional_kwargs.files new_files = self._files_from_kwargs(last_message, uploads_dir) or [] - mention_files = self._mentioned_files_from_kwargs(last_message) + mention_files = self._mentioned_files_from_messages(messages) # Collect historical files from the uploads directory (all except the new ones) new_filenames = {f["filename"] for f in new_files} @@ -311,13 +392,18 @@ class UploadsMiddleware(AgentMiddleware[UploadsMiddlewareState]): file["outline"] = outline file["outline_preview"] = preview - if not new_files and not historical_files and not mention_files: + sent_files = self._merge_sent_files(new_files + historical_files, mention_files) + + if not new_files and not historical_files and not mention_files and not sent_files: return None logger.debug(f"New files: {[f['filename'] for f in new_files]}, historical: {[f['filename'] for f in historical_files]}") # Create context message(s) and prepend to the last human message content. - message_parts = [self._create_files_message(new_files, historical_files)] + message_parts = [ + self._create_files_message(new_files, historical_files), + self._create_sent_files_summary(sent_files), + ] if mention_files: message_parts.append(self._create_mentions_message(mention_files)) files_message = "\n\n".join(message_parts) diff --git a/backend/tests/test_memory_updater.py b/backend/tests/test_memory_updater.py index 48fdfd89..4eae812f 100644 --- a/backend/tests/test_memory_updater.py +++ b/backend/tests/test_memory_updater.py @@ -510,6 +510,22 @@ class TestFormatConversationForUpdate: assert "raw user text" in result assert "structured text" in result + def test_strips_uploaded_mentioned_and_sent_semantics_tags(self): + msg = MagicMock() + msg.type = "human" + msg.content = ( + "\nfile list\n\n" + "\nsummary\n\n" + "\nmentions\n\n" + "actual question" + ) + + result = format_conversation_for_update([msg]) + assert "actual question" in result + assert "uploaded_files" not in result + assert "mentioned_files" not in result + assert "sent_files_semantics" not in result + # --------------------------------------------------------------------------- # update_memory - structured LLM response handling diff --git a/backend/tests/test_uploads_middleware_core_logic.py b/backend/tests/test_uploads_middleware_core_logic.py index 5b8fa1c4..e1089454 100644 --- a/backend/tests/test_uploads_middleware_core_logic.py +++ b/backend/tests/test_uploads_middleware_core_logic.py @@ -278,10 +278,91 @@ class TestBeforeAgent: assert result is not None content = result["messages"][-1].content assert "" in content - assert "referenced in this message" in content + assert "referenced by the user in this conversation" in content assert "/mnt/user-data/uploads/saten-ruiko.jpg" in content assert "Do not re-upload them." in content + def test_injects_sent_files_semantics_for_upload_and_mention_union(self, tmp_path): + mw = _middleware(tmp_path) + uploads_dir = _uploads_dir(tmp_path) + (uploads_dir / "uploaded.txt").write_bytes(b"u") + msg = _human( + "how many files did I send?", + files=[ + { + "filename": "uploaded.txt", + "size": 1, + "path": "/mnt/user-data/uploads/uploaded.txt", + }, + { + "filename": "mentioned.jpg", + "size": 0, + "path": "/mnt/user-data/outputs/mentioned.jpg", + "ref_kind": "mention", + "ref_source": "artifact", + }, + ], + ) + result = mw.before_agent(self._state(msg), _runtime()) + + assert result is not None + content = result["messages"][-1].content + assert "" in content + assert "union of uploaded + mentioned files" in content + assert "uploaded.txt (0.0 KB, source: upload)" in content + assert "mentioned.jpg (0.0 KB, source: mention)" in content + + def test_sent_files_union_dedupes_same_file_path_and_marks_both(self, tmp_path): + mw = _middleware(tmp_path) + uploads_dir = _uploads_dir(tmp_path) + (uploads_dir / "same.txt").write_bytes(b"x") + msg = _human( + "count files", + files=[ + { + "filename": "same.txt", + "size": 1, + "path": "/mnt/user-data/uploads/same.txt", + }, + { + "filename": "same.txt", + "size": 1, + "path": "/mnt/user-data/uploads/same.txt", + "ref_kind": "mention", + "ref_source": "upload", + }, + ], + ) + result = mw.before_agent(self._state(msg), _runtime()) + + assert result is not None + content = result["messages"][-1].content + assert "same.txt (0.0 KB, source: upload+mention)" in content + assert content.count("same.txt (0.0 KB, source: upload+mention)") == 1 + + def test_historical_mentions_are_included_for_follow_up_count_question(self, tmp_path): + mw = _middleware(tmp_path) + prev = _human( + "analyse this", + files=[ + { + "filename": "history.png", + "size": 0, + "path": "/mnt/user-data/outputs/history.png", + "ref_kind": "mention", + "ref_source": "artifact", + } + ], + ) + current = _human("我总共发送了多少个附件?") + result = mw.before_agent(self._state(prev, current), _runtime()) + + assert result is not None + content = result["messages"][-1].content + assert "" in content + assert "history.png" in content + assert "source: mention" in content + def test_mentioned_files_do_not_enter_uploaded_files_state(self, tmp_path): mw = _middleware(tmp_path) msg = _human( diff --git a/frontend/src/core/messages/utils.ts b/frontend/src/core/messages/utils.ts index f1eae285..c8f88768 100644 --- a/frontend/src/core/messages/utils.ts +++ b/frontend/src/core/messages/utils.ts @@ -337,12 +337,17 @@ export interface FileInMessage { } /** - * Strip tag from message content. - * Returns the content with the tag removed. + * Strip internal file-context tags from message content. + * Returns the content with these tags removed: + * - ... + * - ... + * - ... */ export function stripUploadedFilesTag(content: string): string { return content .replace(/[\s\S]*?<\/uploaded_files>/g, "") + .replace(/[\s\S]*?<\/mentioned_files>/g, "") + .replace(/[\s\S]*?<\/sent_files_semantics>/g, "") .trim(); }