# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
# SPDX-License-Identifier: MIT

import hashlib
import logging
import re
import time
from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional, Sequence, Set

from langchain_milvus.vectorstores import Milvus as LangchainMilvus
from langchain_openai import OpenAIEmbeddings
from openai import OpenAI
from pymilvus import CollectionSchema, DataType, FieldSchema, MilvusClient

from src.config.loader import get_bool_env, get_int_env, get_str_env
from src.rag.retriever import Chunk, Document, Resource, Retriever

logger = logging.getLogger(__name__)


class DashscopeEmbeddings:
    """OpenAI-compatible embeddings wrapper.

    Exposes the two methods the retriever relies on (``embed_query`` and
    ``embed_documents``) on top of an ``openai.OpenAI`` client.
    """

    def __init__(self, **kwargs: Any) -> None:
        """Build the client from ``api_key``, ``base_url``, ``model`` and
        ``encoding_format`` keyword arguments; other keys are ignored."""
        self._client: OpenAI = OpenAI(
            api_key=kwargs.get("api_key", ""), base_url=kwargs.get("base_url", "")
        )
        self._model: str = kwargs.get("model", "")
        self._encoding_format: str = kwargs.get("encoding_format", "float")

    def _embed(self, texts: Sequence[str]) -> List[List[float]]:
        """Internal helper performing the embedding API call.

        Non-string items are coerced with ``str()``; an empty input returns
        ``[]`` without calling the API.
        """
        clean_texts = [t if isinstance(t, str) else str(t) for t in texts]
        if not clean_texts:
            return []
        resp = self._client.embeddings.create(
            model=self._model,
            input=clean_texts,
            encoding_format=self._encoding_format,
        )
        return [d.embedding for d in resp.data]

    def embed_query(self, text: str) -> List[float]:
        """Return embedding for a given text (``[]`` if nothing came back)."""
        embeddings = self._embed([text])
        return embeddings[0] if embeddings else []

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Return embeddings for multiple documents (LangChain interface)."""
        return self._embed(texts)


class MilvusRetriever(Retriever):
    """Retriever implementation backed by a Milvus vector store.

    Responsibilities:
        * Initialize / lazily connect to Milvus (local Lite or remote server).
        * Provide methods for inserting content chunks & querying similarity.
        * Optionally surface example markdown resources found in the project.

    Environment variables (selected):
        MILVUS_URI: Connection URI or local *.db path for Milvus Lite.
        MILVUS_COLLECTION: Target collection name (default: documents).
        MILVUS_TOP_K: Result set size (default: 10).
        MILVUS_EMBEDDING_PROVIDER: openai | dashscope (default: openai).
        MILVUS_EMBEDDING_MODEL: Embedding model name.
        MILVUS_EMBEDDING_DIM: Override embedding dimensionality.
        MILVUS_AUTO_LOAD_EXAMPLES: Load example *.md files if true.
        MILVUS_EXAMPLES_DIR: Folder containing example markdown files.
    """

    def __init__(self) -> None:
        """Read configuration from the environment and build the embedder.

        No connection to Milvus is made here; ``self.client`` stays ``None``
        until a method calls ``_connect``.

        Raises:
            ValueError: If ``MILVUS_EMBEDDING_PROVIDER`` is not supported.
        """
        # --- Connection / collection configuration ---
        self.uri: str = get_str_env("MILVUS_URI", "http://localhost:19530")
        self.user: str = get_str_env("MILVUS_USER")
        self.password: str = get_str_env("MILVUS_PASSWORD")
        self.collection_name: str = get_str_env("MILVUS_COLLECTION", "documents")

        # --- Search configuration ---
        # Non-numeric values silently fall back to 10.
        top_k_raw = get_str_env("MILVUS_TOP_K", "10")
        self.top_k: int = int(top_k_raw) if top_k_raw.isdigit() else 10

        # --- Vector field names ---
        self.vector_field: str = get_str_env("MILVUS_VECTOR_FIELD", "embedding")
        self.id_field: str = get_str_env("MILVUS_ID_FIELD", "id")
        self.content_field: str = get_str_env("MILVUS_CONTENT_FIELD", "content")
        self.title_field: str = get_str_env("MILVUS_TITLE_FIELD", "title")
        self.url_field: str = get_str_env("MILVUS_URL_FIELD", "url")
        self.metadata_field: str = get_str_env("MILVUS_METADATA_FIELD", "metadata")

        # --- Embedding configuration ---
        # NOTE: ``embedding_model`` holds the model *name* here and is replaced
        # by the embeddings object in ``_init_embedding_model`` below.
        self.embedding_model = get_str_env("MILVUS_EMBEDDING_MODEL")
        self.embedding_api_key = get_str_env("MILVUS_EMBEDDING_API_KEY")
        self.embedding_base_url = get_str_env("MILVUS_EMBEDDING_BASE_URL")
        self.embedding_dim: int = self._get_embedding_dimension(self.embedding_model)
        self.embedding_provider = get_str_env("MILVUS_EMBEDDING_PROVIDER", "openai")

        # --- Examples / auto-load configuration ---
        self.auto_load_examples: bool = get_bool_env("MILVUS_AUTO_LOAD_EXAMPLES", True)
        self.examples_dir: str = get_str_env("MILVUS_EXAMPLES_DIR", "examples")

        # chunk size
        self.chunk_size: int = get_int_env("MILVUS_CHUNK_SIZE", 4000)

        # --- Embedding model initialization ---
        self._init_embedding_model()

        # Client (MilvusClient or LangchainMilvus) created lazily
        self.client: Any = None

    def _init_embedding_model(self) -> None:
        """Initialize the embedding model based on configuration.

        Replaces ``self.embedding_model`` (the model name) with an embeddings
        object for the configured provider.

        Raises:
            ValueError: If the provider is neither ``openai`` nor ``dashscope``.
        """
        kwargs = {
            "api_key": self.embedding_api_key,
            "model": self.embedding_model,
            "base_url": self.embedding_base_url,
            "encoding_format": "float",
            "dimensions": self.embedding_dim,
        }
        provider = self.embedding_provider.lower()
        if provider == "openai":
            self.embedding_model = OpenAIEmbeddings(**kwargs)
        elif provider == "dashscope":
            self.embedding_model = DashscopeEmbeddings(**kwargs)
        else:
            raise ValueError(
                f"Unsupported embedding provider: {self.embedding_provider}. "
                "Supported providers: openai, dashscope"
            )

    def _get_embedding_dimension(self, model_name: str) -> int:
        """Return embedding dimension for the supplied model name.

        A positive ``MILVUS_EMBEDDING_DIM`` takes precedence; otherwise a small
        lookup table is used, defaulting to 1536 for unknown models.
        """
        # Common OpenAI embedding model dimensions
        embedding_dims = {
            "text-embedding-ada-002": 1536,
            "text-embedding-v4": 2048,
        }

        # Check if user has explicitly set the dimension
        explicit_dim = get_int_env("MILVUS_EMBEDDING_DIM", 0)
        if explicit_dim > 0:
            return explicit_dim

        # Return the dimension for the specified model
        return embedding_dims.get(model_name, 1536)  # Default to 1536

    def _create_collection_schema(self) -> CollectionSchema:
        """Build and return the Milvus ``CollectionSchema`` for this collection.

        The schema declares id, vector, content, title and url fields. Extra
        metadata keys (e.g. ``source``, ``file``) are not declared explicitly;
        they are accepted through ``enable_dynamic_field=True``.
        """
        fields = [
            FieldSchema(
                name=self.id_field,
                dtype=DataType.VARCHAR,
                max_length=512,
                is_primary=True,
                auto_id=False,
            ),
            FieldSchema(
                name=self.vector_field,
                dtype=DataType.FLOAT_VECTOR,
                dim=self.embedding_dim,
            ),
            FieldSchema(
                name=self.content_field, dtype=DataType.VARCHAR, max_length=65535
            ),
            FieldSchema(name=self.title_field, dtype=DataType.VARCHAR, max_length=512),
            FieldSchema(name=self.url_field, dtype=DataType.VARCHAR, max_length=1024),
        ]

        schema = CollectionSchema(
            fields=fields,
            description=f"Collection for DeerFlow RAG documents: {self.collection_name}",
            enable_dynamic_field=True,  # Allow additional dynamic metadata fields
        )
        return schema

    def _ensure_collection_exists(self) -> None:
        """Ensure the configured collection exists (create if missing).

        For Milvus Lite we create the collection manually; for the remote
        (LangChain) client we rely on LangChain's internal logic. Failures are
        logged as warnings and not raised.
        """
        if self._is_milvus_lite():
            # For Milvus Lite, use MilvusClient
            try:
                # Check if collection exists
                collections = self.client.list_collections()
                if self.collection_name not in collections:
                    # Create collection
                    schema = self._create_collection_schema()
                    self.client.create_collection(
                        collection_name=self.collection_name,
                        schema=schema,
                        index_params={
                            "field_name": self.vector_field,
                            "index_type": "IVF_FLAT",
                            "metric_type": "IP",
                            "params": {"nlist": 1024},
                        },
                    )
                    logger.info("Created Milvus collection: %s", self.collection_name)
            except Exception as e:
                logger.warning("Could not ensure collection exists: %s", e)
        else:
            # For LangChain Milvus, collection creation is handled automatically
            logger.warning(
                "Could not ensure collection exists: %s", self.collection_name
            )

    def _load_example_files(self) -> None:
        """Load example markdown files into the collection (idempotent).

        Each markdown file is split into chunks and inserted only if no chunk
        with the derived document id has been previously stored. Errors are
        logged, never raised.
        """
        try:
            # Get the project root directory
            current_file = Path(__file__)
            project_root = current_file.parent.parent.parent  # Go up to project root
            examples_path = project_root / self.examples_dir

            if not examples_path.exists():
                logger.info("Examples directory not found: %s", examples_path)
                return

            logger.info("Loading example files from: %s", examples_path)

            # Find all markdown files
            md_files = list(examples_path.glob("*.md"))
            if not md_files:
                logger.info("No markdown files found in examples directory")
                return

            # Check if files are already loaded
            existing_docs = self._get_existing_document_ids()
            loaded_count = 0

            for md_file in md_files:
                doc_id = self._generate_doc_id(md_file)

                # Skip if already loaded. Files split into several chunks are
                # stored under "<doc_id>_chunk_<i>" ids, so the bare doc_id alone
                # would never match and the file would be re-inserted every run.
                if doc_id in existing_docs or f"{doc_id}_chunk_0" in existing_docs:
                    continue

                try:
                    # Read and process the file
                    content = md_file.read_text(encoding="utf-8")
                    title = self._extract_title_from_markdown(content, md_file.name)

                    # Split content into chunks if it's too long
                    chunks = self._split_content(content)

                    # Insert each chunk
                    for i, chunk in enumerate(chunks):
                        chunk_id = f"{doc_id}_chunk_{i}" if len(chunks) > 1 else doc_id
                        self._insert_document_chunk(
                            doc_id=chunk_id,
                            content=chunk,
                            title=title,
                            url=f"milvus://{self.collection_name}/{md_file.name}",
                            metadata={"source": "examples", "file": md_file.name},
                        )

                    loaded_count += 1
                    logger.debug("Loaded example markdown: %s", md_file.name)
                except Exception as e:
                    logger.warning("Error loading %s: %s", md_file.name, e)

            logger.info(
                "Successfully loaded %d example files into Milvus", loaded_count
            )
        except Exception as e:
            logger.error("Error loading example files: %s", e)

    def _generate_doc_id(self, file_path: Path) -> str:
        """Return a stable identifier derived from name, size & mtime hash."""
        # The id changes whenever the file's size or mtime changes, so an edited
        # file is treated as a new document.
        file_stat = file_path.stat()
        content_hash = hashlib.md5(
            f"{file_path.name}_{file_stat.st_size}_{file_stat.st_mtime}".encode()
        ).hexdigest()[:8]
        return f"example_{file_path.stem}_{content_hash}"

    def _extract_title_from_markdown(self, content: str, filename: str) -> str:
        """Extract the first level-1 heading; else derive from file name.

        Args:
            content: Document text to scan for a ``# heading`` line.
            filename: File name used for the fallback title.

        Returns:
            The heading text, or the file name without its extension with
            underscores turned into spaces and title-cased.
        """
        for line in content.split("\n"):
            stripped = line.strip()
            if stripped.startswith("# "):
                return stripped[2:].strip()

        # Fallback to filename without extension. ``Path.stem`` drops only the
        # final suffix, unlike ``str.replace(".md", "")`` which removed ".md"
        # anywhere in the name and left other extensions (".txt") in the title.
        return Path(filename).stem.replace("_", " ").title()

    def _split_content(self, content: str) -> List[str]:
        """Split long markdown text into paragraph-based chunks.

        Paragraphs (separated by blank lines) are packed greedily until
        ``self.chunk_size`` characters would be exceeded. A single paragraph
        longer than ``chunk_size`` is kept whole as its own chunk.
        """
        if len(content) <= self.chunk_size:
            return [content]

        chunks: List[str] = []
        paragraphs = content.split("\n\n")
        current_chunk = ""

        for paragraph in paragraphs:
            if len(current_chunk) + len(paragraph) <= self.chunk_size:
                current_chunk += paragraph + "\n\n"
            else:
                if current_chunk:
                    chunks.append(current_chunk.strip())
                current_chunk = paragraph + "\n\n"

        if current_chunk:
            chunks.append(current_chunk.strip())

        return chunks

    def _get_existing_document_ids(self) -> Set[str]:
        """Return set of existing document identifiers in the collection.

        Only implemented for Milvus Lite (at most 10000 ids are fetched); for
        the LangChain client, or on any error, an empty set is returned.
        """
        try:
            if self._is_milvus_lite():
                results = self.client.query(
                    collection_name=self.collection_name,
                    filter="",
                    output_fields=[self.id_field],
                    limit=10000,
                )
                return {
                    result.get(self.id_field, "")
                    for result in results
                    if result.get(self.id_field)
                }
            else:
                # For LangChain Milvus, we can't easily query all IDs
                # Return empty set to allow re-insertion (LangChain will handle duplicates)
                return set()
        except Exception:
            return set()

    def _insert_document_chunk(
        self, doc_id: str, content: str, title: str, url: str, metadata: Dict[str, Any]
    ) -> None:
        """Insert a single content chunk into Milvus.

        Args:
            doc_id: Primary key for the chunk.
            content: Chunk text; it is embedded before insertion.
            title: Document title stored alongside the chunk.
            url: Resource URI stored alongside the chunk.
            metadata: Extra key/value pairs stored with the chunk.

        Raises:
            RuntimeError: If embedding or insertion fails (original error is
                chained as ``__cause__``).
        """
        try:
            # Generate embedding
            embedding = self._get_embedding(content)

            if self._is_milvus_lite():
                # For Milvus Lite, use MilvusClient
                data = [
                    {
                        self.id_field: doc_id,
                        self.vector_field: embedding,
                        self.content_field: content,
                        self.title_field: title,
                        self.url_field: url,
                        **metadata,  # Add metadata fields
                    }
                ]
                self.client.insert(collection_name=self.collection_name, data=data)
            else:
                # For LangChain Milvus, use add_texts
                self.client.add_texts(
                    texts=[content],
                    metadatas=[
                        {
                            self.id_field: doc_id,
                            self.title_field: title,
                            self.url_field: url,
                            **metadata,
                        }
                    ],
                )
        except Exception as e:
            raise RuntimeError(f"Failed to insert document chunk: {str(e)}") from e

    def _connect(self) -> None:
        """Create the underlying Milvus client.

        Callers guard this with ``if not self.client`` so it runs once per
        instance; calling it directly replaces the existing client.

        Raises:
            ConnectionError: If the client cannot be created.
        """
        try:
            # Check if using Milvus Lite (file-based) vs server-based Milvus
            if self._is_milvus_lite():
                # Use MilvusClient for Milvus Lite (local file database)
                self.client = MilvusClient(self.uri)
                # Ensure collection exists
                self._ensure_collection_exists()
            else:
                connection_args = {
                    "uri": self.uri,
                }
                # Add user/password only if provided
                if self.user:
                    connection_args["user"] = self.user
                if self.password:
                    connection_args["password"] = self.password

                # Create LangChain client (it will handle collection creation automatically)
                self.client = LangchainMilvus(
                    embedding_function=self.embedding_model,
                    collection_name=self.collection_name,
                    connection_args=connection_args,
                    # optional (if collection already exists with different schema, be careful)
                    drop_old=False,
                )
        except Exception as e:
            raise ConnectionError(f"Failed to connect to Milvus: {str(e)}") from e

    def _is_milvus_lite(self) -> bool:
        """Return True if the URI points to a local Milvus Lite file.

        Milvus Lite uses local file paths (often ``*.db``) without an HTTP/HTTPS
        scheme. We treat any path not containing a protocol and not starting
        with an HTTP(S) prefix as a Lite instance.
        """
        return self.uri.endswith(".db") or (
            not self.uri.startswith(("http://", "https://")) and "://" not in self.uri
        )

    def _get_embedding(self, text: str) -> List[float]:
        """Return embedding for a given text.

        Raises:
            RuntimeError: If ``text`` is not a non-blank string, the embedding
                call fails, or the result is not a non-empty list.
        """
        try:
            # Validate input
            if not isinstance(text, str):
                raise ValueError(f"Text must be a string, got {type(text)}")

            if not text.strip():
                raise ValueError("Text cannot be empty or only whitespace")

            # Unified embedding interface (OpenAIEmbeddings or DashscopeEmbeddings wrapper)
            embeddings = self.embedding_model.embed_query(text=text.strip())

            # Validate output
            if not isinstance(embeddings, list) or not embeddings:
                raise ValueError(f"Invalid embedding format: {type(embeddings)}")

            return embeddings
        except Exception as e:
            raise RuntimeError(f"Failed to generate embedding: {str(e)}") from e

    def list_resources(self, query: Optional[str] = None) -> List[Resource]:
        """List available resource summaries.

        Strategy:
            1. If connected to Milvus Lite: query stored example documents.
            2. If LangChain client: run a similarity search restricted to
               example documents, passing ``query`` through unchanged.
            3. If connecting or querying fails: return local markdown example
               files instead.

        Args:
            query: Optional search text to bias resource ordering (used by the
                LangChain backend only).

        Returns:
            List of ``Resource`` objects.
        """
        resources: List[Resource] = []

        # Ensure connection established
        if not self.client:
            try:
                self._connect()
            except Exception:
                # Fall back to only local examples if connection fails
                return self._list_local_markdown_resources()

        try:
            if self._is_milvus_lite():
                # Query limited metadata for ingested example documents.
                results = self.client.query(
                    collection_name=self.collection_name,
                    filter="source == 'examples'",
                    output_fields=[self.id_field, self.title_field, self.url_field],
                    limit=100,
                )
                for r in results:
                    resources.append(
                        Resource(
                            uri=r.get(self.url_field, "")
                            or f"milvus://{r.get(self.id_field, '')}",
                            title=r.get(self.title_field, "")
                            or r.get(self.id_field, "Unnamed"),
                            description="Stored Milvus document",
                        )
                    )
            else:
                # ``query`` is forwarded as-is (it may be None); any failure in
                # the search is handled by the fallback below.
                docs: Iterable[Any] = self.client.similarity_search(
                    query,
                    k=100,
                    expr="source == 'examples'",  # Limit to 100 results
                )
                # URIs already emitted; several chunks of one file share a URI.
                seen_uris: Set[str] = set()
                for d in docs:
                    meta = getattr(d, "metadata", {}) or {}
                    url = meta.get(self.url_field, "")
                    id_uri = f"milvus://{meta.get(self.id_field, '')}"
                    # check if the resource is in the list of resources
                    if url in seen_uris or id_uri in seen_uris:
                        continue
                    uri = url or id_uri
                    resources.append(
                        Resource(
                            uri=uri,
                            title=meta.get(self.title_field, "")
                            or meta.get(self.id_field, "Unnamed"),
                            description="Stored Milvus document",
                        )
                    )
                    seen_uris.add(uri)
            logger.info(
                "Succeed listed %d resources from Milvus collection: %s",
                len(resources),
                self.collection_name,
            )
        except Exception:
            logger.warning(
                "Failed to query Milvus for resources, falling back to local examples."
            )
            # Fall back to only local examples if connection fails
            return self._list_local_markdown_resources()
        return resources

    def _list_local_markdown_resources(self) -> List[Resource]:
        """Return local example markdown files as ``Resource`` objects.

        These are surfaced even when not ingested so users can choose to load
        them. Controlled by directory presence only (lightweight). Files that
        cannot be read are skipped.
        """
        current_file = Path(__file__)
        project_root = current_file.parent.parent.parent  # up to project root
        examples_path = project_root / self.examples_dir
        if not examples_path.exists():
            return []

        md_files = list(examples_path.glob("*.md"))
        resources: list[Resource] = []
        for md_file in md_files:
            try:
                content = md_file.read_text(encoding="utf-8", errors="ignore")
                title = self._extract_title_from_markdown(content, md_file.name)
                uri = f"milvus://{self.collection_name}/{md_file.name}"
                resources.append(
                    Resource(
                        uri=uri,
                        title=title,
                        description="Local markdown example (not yet ingested)",
                    )
                )
            except Exception:
                continue
        return resources

    @staticmethod
    def _matches_resources(
        doc_id: str, url: str, resources: Sequence[Resource]
    ) -> bool:
        """Return True if a search hit passes the optional resource filter.

        With no resources every hit passes. Otherwise the hit's url or id must
        be a non-empty substring of some resource URI; empty values never
        match (``"" in uri`` is always true and would disable the filter).
        """
        if not resources:
            return True
        return any(
            (url and url in resource.uri) or (doc_id and doc_id in resource.uri)
            for resource in resources
        )

    def query_relevant_documents(
        self, query: str, resources: Optional[List[Resource]] = None
    ) -> List[Document]:
        """Perform vector similarity search returning rich ``Document`` objects.

        Args:
            query: Natural language query string.
            resources: Optional subset filter of ``Resource`` objects; if
                provided, only documents whose id/url appear in the list will
                be included.

        Returns:
            List of aggregated ``Document`` objects; each contains one or more
            ``Chunk`` instances (one per matched piece of content).

        Raises:
            RuntimeError: On underlying search errors.
        """
        resources = resources or []
        try:
            if not self.client:
                self._connect()

            # Get embeddings for the query (also validates that it is non-blank)
            query_embedding = self._get_embedding(query)

            documents: Dict[str, Document] = {}

            # For Milvus Lite, use MilvusClient directly
            if self._is_milvus_lite():
                # Perform vector search
                search_results = self.client.search(
                    collection_name=self.collection_name,
                    data=[query_embedding],
                    anns_field=self.vector_field,
                    param={"metric_type": "IP", "params": {"nprobe": 10}},
                    limit=self.top_k,
                    output_fields=[
                        self.id_field,
                        self.content_field,
                        self.title_field,
                        self.url_field,
                    ],
                )

                for result_list in search_results:
                    for result in result_list:
                        entity = result.get("entity", {})
                        doc_id = entity.get(self.id_field, "")
                        content = entity.get(self.content_field, "")
                        title = entity.get(self.title_field, "")
                        url = entity.get(self.url_field, "")
                        score = result.get("distance", 0.0)

                        # Skip if resource filtering is requested and this doc
                        # is not in the list
                        if not self._matches_resources(doc_id, url, resources):
                            continue

                        # Create or update document
                        if doc_id not in documents:
                            documents[doc_id] = Document(
                                id=doc_id, url=url, title=title, chunks=[]
                            )

                        # Add chunk to document
                        chunk = Chunk(content=content, similarity=score)
                        documents[doc_id].chunks.append(chunk)

                return list(documents.values())

            else:
                # For LangChain Milvus, use similarity search
                search_results = self.client.similarity_search_with_score(
                    query=query, k=self.top_k
                )

                for doc, score in search_results:
                    metadata = doc.metadata or {}
                    doc_id = metadata.get(self.id_field, "")
                    title = metadata.get(self.title_field, "")
                    url = metadata.get(self.url_field, "")
                    content = doc.page_content

                    # Skip if resource filtering is requested and this doc is
                    # not in the list
                    if not self._matches_resources(doc_id, url, resources):
                        continue

                    # Create or update document
                    if doc_id not in documents:
                        documents[doc_id] = Document(
                            id=doc_id, url=url, title=title, chunks=[]
                        )

                    # Add chunk to document
                    chunk = Chunk(content=content, similarity=score)
                    documents[doc_id].chunks.append(chunk)

                return list(documents.values())

        except Exception as e:
            raise RuntimeError(
                f"Failed to query documents from Milvus: {str(e)}"
            ) from e

    def create_collection(self) -> None:
        """Public hook ensuring collection exists (explicit initialization)."""
        if not self.client:
            self._connect()
        else:
            # If we're using Milvus Lite, ensure collection exists
            if self._is_milvus_lite():
                self._ensure_collection_exists()

    def load_examples(self, force_reload: bool = False) -> None:
        """Load example markdown files, optionally clearing existing ones.

        Args:
            force_reload: If True existing example documents are deleted first.
        """
        if not self.client:
            self._connect()

        if force_reload:
            # Clear existing examples
            self._clear_example_documents()

        self._load_example_files()

    def _clear_example_documents(self) -> None:
        """Delete previously ingested example documents (Milvus Lite only).

        Failures are logged as warnings and not raised.
        """
        try:
            if self._is_milvus_lite():
                # For Milvus Lite, delete documents with source='examples'
                # Note: Milvus doesn't support direct delete by filter in all versions
                # So we'll query and delete by IDs
                results = self.client.query(
                    collection_name=self.collection_name,
                    filter="source == 'examples'",
                    output_fields=[self.id_field],
                    limit=10000,
                )

                if results:
                    doc_ids = [result[self.id_field] for result in results]
                    self.client.delete(
                        collection_name=self.collection_name, ids=doc_ids
                    )
                    logger.info("Cleared %d existing example documents", len(doc_ids))
            else:
                # For LangChain Milvus, we can't easily delete by metadata
                logger.info(
                    "Clearing existing examples not supported for LangChain Milvus client"
                )
        except Exception as e:
            logger.warning("Could not clear existing examples: %s", e)

    def get_loaded_examples(self) -> List[Dict[str, str]]:
        """Return metadata for previously ingested example documents.

        Returns:
            One dict per stored example chunk with ``id``, ``title``, ``file``
            and ``url`` keys. Empty for the LangChain client or on any error.
        """
        try:
            if not self.client:
                self._connect()

            if self._is_milvus_lite():
                results = self.client.query(
                    collection_name=self.collection_name,
                    filter="source == 'examples'",
                    output_fields=[
                        self.id_field,
                        self.title_field,
                        self.url_field,
                        "source",
                        "file",
                    ],
                    limit=1000,
                )

                return [
                    {
                        "id": result.get(self.id_field, ""),
                        "title": result.get(self.title_field, ""),
                        "file": result.get("file", ""),
                        "url": result.get(self.url_field, ""),
                    }
                    for result in results
                ]
            else:
                # For LangChain Milvus, we can't easily filter by metadata
                logger.info(
                    "Getting loaded examples not supported for LangChain Milvus client"
                )
                return []
        except Exception as e:
            logger.error("Error getting loaded examples: %s", e)
            return []

    def close(self) -> None:
        """Release underlying client resources (idempotent).

        Errors while closing are ignored; the client reference is always
        dropped so a second call is a no-op.
        """
        # ``getattr`` guards against __del__ running on a half-built instance.
        client = getattr(self, "client", None)
        if not client:
            return
        try:
            # For Milvus Lite (MilvusClient), close the connection
            if self._is_milvus_lite() and hasattr(client, "close"):
                client.close()
            # For LangChain Milvus, no explicit close method needed
        except Exception:
            # Ignore errors during cleanup
            pass
        finally:
            self.client = None

    def _sanitize_filename(self, filename: str, max_length: int = 200) -> str:
        """Sanitize filename for safe use in doc_id and URI construction.

        Args:
            filename: Original filename to sanitize.
            max_length: Maximum allowed length for the filename (default: 200).

        Returns:
            Sanitized filename safe for storage and URI construction.
        """
        # Extract basename to remove any path components
        sanitized = Path(filename).name

        # Remove or replace problematic characters
        # Keep alphanumeric, dots, hyphens, underscores; replace others with underscore
        sanitized = re.sub(r"[^\w.\-]", "_", sanitized)

        # Collapse multiple underscores
        sanitized = re.sub(r"_+", "_", sanitized)

        # Remove leading/trailing underscores and dots
        sanitized = sanitized.strip("_.")

        # Ensure we have a valid filename
        if not sanitized:
            sanitized = "unnamed_file"

        # Truncate if too long, preserving extension
        if len(sanitized) > max_length:
            # Try to preserve extension
            parts = sanitized.rsplit(".", 1)
            if len(parts) == 2 and len(parts[1]) <= 10:
                ext = "." + parts[1]
                base = parts[0][: max_length - len(ext)]
                sanitized = base + ext
            else:
                sanitized = sanitized[:max_length]

        return sanitized

    def _check_duplicate_file(self, filename: str) -> bool:
        """Check if a file with the same name has been uploaded before.

        ``filename`` is interpolated into the filter expression, so callers
        should pass a name produced by ``_sanitize_filename``. Any error during
        the check is treated as "not a duplicate".
        """
        try:
            if self._is_milvus_lite():
                results = self.client.query(
                    collection_name=self.collection_name,
                    filter=f"file == '{filename}' and source == 'uploaded'",
                    output_fields=[self.id_field],
                    limit=1,
                )
                return len(results) > 0
            else:
                # For LangChain Milvus, perform a search with metadata filter
                docs = self.client.similarity_search(
                    "",
                    k=1,
                    expr=f"file == '{filename}' and source == 'uploaded'",
                )
                return len(docs) > 0
        except Exception:
            # If check fails, allow upload to proceed
            return False

    def ingest_file(self, file_content: bytes, filename: str, **kwargs: Any) -> Resource:
        """Ingest a file into the Milvus vector store for RAG retrieval.

        This method processes an uploaded file, splits it into chunks if
        necessary, generates embeddings, and stores them in the configured
        Milvus collection.

        Args:
            file_content: Raw bytes of the file to ingest. Must be valid UTF-8
                encoded text content (e.g., markdown or plain text files).
            filename: Original filename. Used for title extraction, metadata
                storage, and URI construction. The filename is sanitized to
                remove special characters and path separators before use.
            **kwargs: Reserved for future use. Currently unused but accepted
                for forward compatibility (e.g., custom metadata, chunking
                options).

        Returns:
            Resource: Object containing:
                - uri: Milvus URI in format ``milvus://{collection}/{filename}``
                - title: Extracted from first markdown heading or derived from
                  filename
                - description: "Uploaded file" or "Uploaded file (new version)"

        Raises:
            ValueError: If file_content cannot be decoded as UTF-8 text. This
                typically occurs when attempting to upload binary files
                (images, PDFs, etc.) which are not supported.
            RuntimeError: If document chunk insertion fails due to embedding
                generation errors, Milvus connection issues, or storage
                failures.
            ConnectionError: If unable to establish connection to Milvus server.

        Supported file types:
            - Markdown files (.md): Title extracted from first ``# heading``
            - Plain text files (.txt): Title derived from filename

        Duplicate handling:
            Files with the same name can be uploaded multiple times. Each
            upload creates a new document with a unique ID (includes
            timestamp). The description field indicates if this is a new
            version of an existing file. Old versions are retained in storage.

        Example:
            >>> retriever = MilvusRetriever()
            >>> with open("document.md", "rb") as f:
            ...     resource = retriever.ingest_file(f.read(), "document.md")
            >>> print(resource.uri)
            milvus://documents/document.md
        """
        # Check connection
        if not self.client:
            self._connect()

        # Sanitize filename to prevent issues with special characters and path traversal
        safe_filename = self._sanitize_filename(filename)
        if safe_filename != filename:
            logger.debug(
                "Filename sanitized: '%s' -> '%s'", filename, safe_filename
            )

        # Decode content (only UTF-8 text files supported)
        try:
            content = file_content.decode("utf-8")
        except UnicodeDecodeError as exc:
            raise ValueError(
                "Only UTF-8 encoded text files are supported (e.g., .md, .txt). "
                "Binary files such as images, PDFs, or Word documents cannot be processed."
            ) from exc

        # Check for existing file with same name
        is_duplicate = self._check_duplicate_file(safe_filename)
        if is_duplicate:
            logger.info(
                "File '%s' was previously uploaded. Creating new version.",
                safe_filename,
            )

        # Generate unique doc_id using filename, content length, and timestamp
        # Timestamp ensures uniqueness even for identical re-uploads
        timestamp = int(time.time() * 1000)  # millisecond precision
        content_hash = hashlib.md5(
            f"{safe_filename}_{len(content)}_{timestamp}".encode()
        ).hexdigest()[:8]
        base_name = (
            safe_filename.rsplit(".", 1)[0] if "." in safe_filename else safe_filename
        )
        doc_id = f"uploaded_{base_name}_{content_hash}"

        title = self._extract_title_from_markdown(content, safe_filename)
        chunks = self._split_content(content)

        # Insert chunks
        for i, chunk in enumerate(chunks):
            chunk_id = f"{doc_id}_chunk_{i}" if len(chunks) > 1 else doc_id
            self._insert_document_chunk(
                doc_id=chunk_id,
                content=chunk,
                title=title,
                url=f"milvus://{self.collection_name}/{safe_filename}",
                metadata={
                    "source": "uploaded",
                    "file": safe_filename,
                    "timestamp": timestamp,
                },
            )

        description = "Uploaded file (new version)" if is_duplicate else "Uploaded file"
        return Resource(
            uri=f"milvus://{self.collection_name}/{safe_filename}",
            title=title,
            description=description,
        )

    def __del__(self) -> None:  # pragma: no cover - best-effort cleanup
        """Best-effort cleanup when instance is garbage collected."""
        self.close()


# Backwards compatibility export (original class name kept for external imports)
class MilvusProvider(MilvusRetriever):
    """Backward compatible alias for ``MilvusRetriever`` (original name)."""

    pass


def load_examples() -> None:
    """Load example markdown files into Milvus when enabled by environment.

    Runs only if ``RAG_PROVIDER`` is ``milvus`` and
    ``MILVUS_AUTO_LOAD_EXAMPLES`` is true (it defaults to False here).
    """
    auto_load_examples = get_bool_env("MILVUS_AUTO_LOAD_EXAMPLES", False)
    rag_provider = get_str_env("RAG_PROVIDER", "")
    if rag_provider == "milvus" and auto_load_examples:
        provider = MilvusProvider()
        provider.load_examples()