# src/tools/search_postprocessor.py
import base64
import logging
import re
from typing import Any, Dict, List
from urllib.parse import urlparse

logger = logging.getLogger(__name__)


class SearchResultPostProcessor:
    """Post-process raw search results before they are returned to callers.

    Pipeline applied by :meth:`process_results`:
      1. drop duplicate results (keyed on ``url`` / ``image_url``),
      2. drop low-scoring ``"page"`` results (when a threshold is configured),
      3. strip inline base64 images (data URIs) from textual fields,
      4. truncate overly long content (when a length limit is configured),
      5. sort by descending relevance ``score``.

    Input dicts are never mutated; every kept result is a shallow copy.
    """

    # Matches inline base64-encoded images (data URIs) embedded in text.
    base64_pattern = r"data:image/[^;]+;base64,[a-zA-Z0-9+/=]+"
    # Compiled once at class-creation time so the per-result cleaning loops
    # do not pay the re-module pattern-cache lookup on every substitution.
    _base64_re = re.compile(base64_pattern)
    # Class-level logger (same singleton as the module-level logger) keeps
    # the class self-contained.
    _logger = logging.getLogger(__name__)

    def __init__(self, min_score_threshold: float, max_content_length_per_page: int):
        """
        Initialize the post-processor.

        Args:
            min_score_threshold: Minimum relevance score threshold; a falsy
                or non-positive value disables score filtering.
            max_content_length_per_page: Maximum content length per result;
                a falsy or non-positive value disables truncation.
        """
        self.min_score_threshold = min_score_threshold
        self.max_content_length_per_page = max_content_length_per_page

    def process_results(self, results: List[Dict]) -> List[Dict]:
        """
        Process search results.

        Args:
            results: Original search result list.

        Returns:
            Deduplicated, filtered, cleaned and (optionally) truncated
            results, sorted by descending score. Missing scores sort as 0.
        """
        if not results:
            return []

        # Combined processing in a single loop for efficiency.
        cleaned_results: List[Dict] = []
        seen_urls: set = set()

        for result in results:
            # 1. Remove duplicates
            cleaned_result = self._remove_duplicates(result, seen_urls)
            if not cleaned_result:
                continue

            # 2. Filter low quality results
            if self._is_low_quality(cleaned_result):
                continue

            # 3. Clean base64 images from content; an empty dict means the
            # result became unusable (e.g. pure-base64 image URL) — drop it.
            cleaned_result = self._remove_base64_images(cleaned_result)
            if not cleaned_result:
                continue

            # 4. When max_content_length_per_page is set, truncate long content
            if (
                self.max_content_length_per_page
                and self.max_content_length_per_page > 0
            ):
                cleaned_result = self._truncate_long_content(cleaned_result)

            if cleaned_result:
                cleaned_results.append(cleaned_result)

        # 5. Sort (by score descending)
        sorted_results = sorted(
            cleaned_results, key=lambda x: x.get("score", 0), reverse=True
        )

        self._logger.info(
            "Search result post-processing: %d -> %d", len(results), len(sorted_results)
        )
        return sorted_results

    def _is_low_quality(self, result: Dict) -> bool:
        """Return True when a "page" result scores below the configured threshold.

        Filtering only applies to "page"-type results and only when the
        threshold is truthy and positive (mirrors the constructor contract).
        """
        return bool(
            result.get("type") == "page"
            and self.min_score_threshold
            and self.min_score_threshold > 0
            and result.get("score", 0) < self.min_score_threshold
        )

    def _remove_base64_images(self, result: Dict) -> Dict:
        """Remove base64 encoded images from content, dispatching on type.

        Returns a cleaned copy, or an empty dict when the result is no
        longer usable after cleaning.
        """
        result_type = result.get("type")
        if result_type == "page":
            return self.processPage(result)
        if result_type == "image":
            return self.processImage(result)
        # For other types, keep as is.
        return result.copy()

    def _clean_text_field(self, text: str, label: str, url: str) -> str:
        """Strip base64 data URIs from *text* (replaced with a space).

        Logs at debug level when the substitution removed a significant
        share (>20%) of the original text.
        """
        cleaned = self._base64_re.sub(" ", text)
        if len(cleaned) < len(text) * 0.8:
            self._logger.debug(
                "Removed base64 images from search %s: %s", label, url
            )
        return cleaned

    def processPage(self, result: Dict) -> Dict:
        """Process page type result: clean base64 images from its text fields."""
        cleaned_result = result.copy()
        url = result.get("url", "unknown")

        if "content" in cleaned_result:
            cleaned_result["content"] = self._clean_text_field(
                cleaned_result["content"], "content", url
            )

        if "raw_content" in cleaned_result:
            cleaned_result["raw_content"] = self._clean_text_field(
                cleaned_result["raw_content"], "raw content", url
            )

        return cleaned_result

    def processImage(self, result: Dict) -> Dict:
        """Process image type result - clean up base64 data and long fields.

        Returns an empty dict when the cleaned ``image_url`` is empty or no
        longer an http(s) URL, signalling the caller to drop the result.
        """
        cleaned_result = result.copy()

        # Remove base64 encoded data from image_url if present.
        image_url = cleaned_result.get("image_url")
        if isinstance(image_url, str) and "data:image" in image_url:
            cleaned_image_url = self._base64_re.sub(" ", image_url)
            if len(cleaned_image_url) == 0 or not cleaned_image_url.startswith(
                "http"
            ):
                self._logger.debug(
                    "Removed base64 data from image_url and the cleaned_image_url is empty or not start with http, origin image_url: %s",
                    result.get("image_url", "unknown"),
                )
                return {}
            cleaned_result["image_url"] = cleaned_image_url
            self._logger.debug(
                "Removed base64 data from image_url: %s",
                result.get("image_url", "unknown"),
            )

        # Truncate very long image descriptions.
        description = cleaned_result.get("image_description")
        if (
            isinstance(description, str)
            and self.max_content_length_per_page
            and len(description) > self.max_content_length_per_page
        ):
            cleaned_result["image_description"] = (
                description[: self.max_content_length_per_page] + "..."
            )
            self._logger.info(
                "Truncated long image description from search result: %s",
                result.get("image_url", "unknown"),
            )

        return cleaned_result

    def _truncate_long_content(self, result: Dict) -> Dict:
        """Truncate long ``content`` / ``raw_content`` fields.

        ``raw_content`` is allowed to be twice the configured limit. Fields
        that are absent or None are left untouched (the original crashed on
        a present-but-None field).
        """
        truncated_result = result.copy()
        limit = self.max_content_length_per_page

        content = truncated_result.get("content")
        if content is not None and len(content) > limit:
            truncated_result["content"] = content[:limit] + "..."
            self._logger.info(
                "Truncated long content from search result: %s",
                result.get("url", "unknown"),
            )

        raw_content = truncated_result.get("raw_content")
        if raw_content is not None and len(raw_content) > limit * 2:
            truncated_result["raw_content"] = raw_content[: limit * 2] + "..."
            self._logger.info(
                "Truncated long raw content from search result: %s",
                result.get("url", "unknown"),
            )

        return truncated_result

    def _remove_duplicates(self, result: Dict, seen_urls: set) -> Dict:
        """Return a copy of *result* unless its URL has already been seen.

        The key is ``url``, falling back to ``image_url``. Results without
        any URL are always kept. *seen_urls* is updated in place; an empty
        dict marks a duplicate for the caller to skip.
        """
        url = result.get("url", result.get("image_url", ""))
        if not url:
            # Keep results with empty URLs.
            return result.copy()
        if url not in seen_urls:
            seen_urls.add(url)
            return result.copy()  # Copy so callers never mutate the original.
        return {}  # Return empty dict for duplicates