deerflow2/src/citations/formatter.py

398 lines
11 KiB
Python

# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
# SPDX-License-Identifier: MIT
"""
Citation formatter for generating citation sections and inline references.
"""
import re
from typing import Any, Dict, List
from .models import Citation
class CitationFormatter:
    """
    Formats citations for display in reports.

    Supports multiple citation styles:
    - numbered: [1], [2], etc.
    - superscript: ¹, ², etc.
    - footnote: [^1], [^2], etc.
    - inline: (Author, Year) or (Source)

    Note: inline markers are built from the citation number alone, so the
    'inline' style (like any unrecognized style) currently renders the same
    bracketed marker as 'numbered'.
    """

    # ASCII digit -> Unicode superscript glyph. Every digit needs an entry:
    # a missing or empty glyph would make multi-digit markers ambiguous
    # (e.g. 10 rendering the same as 1). Escapes are used instead of literal
    # glyphs so the table survives lossy re-encoding of the source file.
    SUPERSCRIPT_MAP = {
        "0": "\u2070",  # ⁰
        "1": "\u00b9",  # ¹
        "2": "\u00b2",  # ²
        "3": "\u00b3",  # ³
        "4": "\u2074",  # ⁴
        "5": "\u2075",  # ⁵
        "6": "\u2076",  # ⁶
        "7": "\u2077",  # ⁷
        "8": "\u2078",  # ⁸
        "9": "\u2079",  # ⁹
    }

    def __init__(self, style: str = "numbered"):
        """
        Initialize the formatter.

        Args:
            style: Citation style ('numbered', 'superscript', 'footnote', 'inline')
        """
        self.style = style

    def format_inline_marker(self, number: int) -> str:
        """
        Format an inline citation marker.

        Args:
            number: The citation number

        Returns:
            Formatted marker string: superscript digits for 'superscript',
            ``[^n]`` for 'footnote', and ``[n]`` for every other style.
        """
        if self.style == "superscript":
            # Characters without a superscript glyph (e.g. a minus sign)
            # are passed through unchanged.
            return "".join(self.SUPERSCRIPT_MAP.get(c, c) for c in str(number))
        if self.style == "footnote":
            return f"[^{number}]"
        # 'numbered' and any other style value
        return f"[{number}]"

    def format_reference(self, citation: Citation) -> str:
        """
        Format a single reference for the citations section.

        The result is a multi-line string: number and title first, then one
        line per available metadata field, the URL, and a description
        snippet truncated to 200 characters.

        Args:
            citation: The citation to format

        Returns:
            Formatted reference string
        """
        metadata = citation.metadata

        # Number and title are always present
        parts = [f"[{citation.number}] **{metadata.title}**"]

        # Author if available
        if metadata.author:
            parts.append(f" *{metadata.author}*")

        # Domain/source
        if metadata.domain:
            parts.append(f" Source: {metadata.domain}")

        # Published date if available
        if metadata.published_date:
            parts.append(f" Published: {metadata.published_date}")

        # URL is always emitted
        parts.append(f" URL: {metadata.url}")

        # Description/snippet, with an ellipsis only when text was cut off
        if metadata.description:
            snippet = metadata.description[:200]
            if len(metadata.description) > 200:
                snippet += "..."
            parts.append(f" > {snippet}")

        return "\n".join(parts)

    def format_simple_reference(self, citation: Citation) -> str:
        """
        Format a simple reference (title + URL).

        Args:
            citation: The citation to format

        Returns:
            Simple reference string (a Markdown list item with a link)
        """
        return f"- [{citation.metadata.title}]({citation.metadata.url})"

    def format_rich_reference(self, citation: Citation) -> str:
        """
        Format a rich reference with metadata as an HTML-comment annotation.

        Args:
            citation: The citation to format

        Returns:
            Rich reference string: the Markdown link, followed by a second
            line holding the annotations when at least one is available.
        """
        metadata = citation.metadata
        parts = [f"- [{metadata.title}]({metadata.url})"]

        annotations = []
        if metadata.domain:
            annotations.append(f"domain: {metadata.domain}")
        if metadata.relevance_score > 0:
            annotations.append(f"relevance: {metadata.relevance_score:.2f}")
        if metadata.accessed_at:
            # Only the first 10 characters are kept.
            # NOTE(review): presumably an ISO timestamp, so this is the
            # YYYY-MM-DD date part -- confirm against the Citation model.
            annotations.append(f"accessed: {metadata.accessed_at[:10]}")

        if annotations:
            parts.append(f" <!-- {', '.join(annotations)} -->")

        return "\n".join(parts)

    def format_citations_section(
        self, citations: List[Citation], include_metadata: bool = True
    ) -> str:
        """
        Format the full citations section for a report.

        Args:
            citations: List of citations to include
            include_metadata: Whether to include rich metadata

        Returns:
            Formatted citations section markdown, or an empty string when
            there are no citations.
        """
        if not citations:
            return ""

        lines = ["## Key Citations", ""]
        for citation in citations:
            if include_metadata:
                lines.append(self.format_rich_reference(citation))
            else:
                lines.append(self.format_simple_reference(citation))
            lines.append("")  # Empty line between citations

        return "\n".join(lines)

    def format_footnotes_section(self, citations: List[Citation]) -> str:
        """
        Format citations as footnotes (for footnote style).

        Args:
            citations: List of citations

        Returns:
            Footnotes section markdown (a horizontal rule followed by one
            ``[^n]: Title - URL`` line per citation), or an empty string
            when there are no citations.
        """
        if not citations:
            return ""

        lines = ["", "---", ""]
        for citation in citations:
            lines.append(
                f"[^{citation.number}]: {citation.metadata.title} - {citation.metadata.url}"
            )

        return "\n".join(lines)

    def add_citation_markers_to_text(
        self, text: str, citations: List[Citation], url_to_number: Dict[str, int]
    ) -> str:
        """
        Add citation markers to text where URLs are referenced.

        Every Markdown link whose URL is a key of ``url_to_number`` gets the
        corresponding inline marker appended right after the link; other
        links are left unchanged.

        Args:
            text: The text to process
            citations: Available citations (not consulted by this method;
                kept for interface compatibility)
            url_to_number: Mapping from URL to citation number

        Returns:
            Text with citation markers added
        """

        def replace_link(match):
            full_match = match.group(0)
            url = match.group(2)
            if url in url_to_number:
                marker = self.format_inline_marker(url_to_number[url])
                return f"{full_match}{marker}"
            return full_match

        # Find all markdown links and add citation numbers
        pattern = r"\[([^\]]+)\]\(([^)]+)\)"
        return re.sub(pattern, replace_link, text)

    @staticmethod
    def build_citation_data_json(citations: List[Citation]) -> str:
        """
        Build a JSON block containing citation data for frontend use.

        Args:
            citations: List of citations

        Returns:
            JSON string with a 'citations' list (each citation's
            ``to_dict()``) and a 'count'. Non-ASCII characters are emitted
            as-is rather than escaped.
        """
        import json

        data = {
            "citations": [c.to_dict() for c in citations],
            "count": len(citations),
        }
        return json.dumps(data, ensure_ascii=False)
def parse_citations_from_report(
    report: str, section_patterns: List[str] = None
) -> Dict[str, Any]:
    """
    Extract citation information from report, supporting multiple formats.

    Supports various citation formats:
    - Markdown: [Title](URL)
    - Numbered: [1] Title - URL
    - Footnote: [^1]: Title - URL
    - HTML: <a href="URL">Title</a>

    Only text inside a citation section is scanned. A section starts at a
    header matching one of ``section_patterns`` and runs until the next line
    that begins with ``##`` (or the end of the report).

    Args:
        report: The report markdown text
        section_patterns: Custom section header patterns (optional). Each is
            a regular expression matched case-insensitively; it may contain
            its own capture groups.

    Returns:
        Dictionary with 'citations' list and 'count' of unique citations.
        Citations are de-duplicated by URL, keeping the first occurrence.
    """
    if section_patterns is None:
        section_patterns = [
            r"(?:##\s*Key Citations|##\s*References|##\s*Sources|##\s*Bibliography)",
        ]

    # Body of a section: consecutive lines, none of which starts with "##".
    # Each repetition consumes exactly one line, so the lookahead is always
    # evaluated at the start of a line; this stops at a following heading
    # whether or not a blank line precedes it. The body is captured in a
    # *named* group so caller-supplied patterns that contain capture groups
    # cannot shift the group index.
    body_suffix = r"\s*\n(?P<_citation_section_body>(?:(?!##).*\n?)*)"

    citations = []

    # 1. Find citation section(s) and extract citations
    for pattern in section_patterns:
        section_matches = re.finditer(
            pattern + body_suffix,
            report,
            re.IGNORECASE | re.MULTILINE,
        )
        for section_match in section_matches:
            section = section_match.group("_citation_section_body")

            # 2. Extract citations in various formats
            citations.extend(_extract_markdown_links(section))
            citations.extend(_extract_numbered_citations(section))
            citations.extend(_extract_footnote_citations(section))
            citations.extend(_extract_html_links(section))

    # 3. Deduplicate by URL (first occurrence wins; entries without a URL
    #    are dropped)
    unique_citations = {}
    for citation in citations:
        url = citation.get("url", "")
        if url and url not in unique_citations:
            unique_citations[url] = citation

    return {
        "citations": list(unique_citations.values()),
        "count": len(unique_citations),
    }
def _extract_markdown_links(text: str) -> List[Dict[str, str]]:
"""
Extract Markdown links [title](url).
Args:
text: Text to extract from
Returns:
List of citation dictionaries with title, url, and format
"""
citations = []
pattern = r"\[([^\]]+)\]\(([^)]+)\)"
for match in re.finditer(pattern, text):
title, url = match.groups()
if url.startswith(("http://", "https://")):
citations.append({
"title": title.strip(),
"url": url.strip(),
"format": "markdown",
})
return citations
def _extract_numbered_citations(text: str) -> List[Dict[str, str]]:
"""
Extract numbered citations [1] Title - URL.
Args:
text: Text to extract from
Returns:
List of citation dictionaries
"""
citations = []
# Match: [number] title - URL
pattern = r"\[\d+\]\s+([^-\n]+?)\s*-\s*(https?://[^\s\n]+)"
for match in re.finditer(pattern, text):
title, url = match.groups()
citations.append({
"title": title.strip(),
"url": url.strip(),
"format": "numbered",
})
return citations
def _extract_footnote_citations(text: str) -> List[Dict[str, str]]:
"""
Extract footnote citations [^1]: Title - URL.
Args:
text: Text to extract from
Returns:
List of citation dictionaries
"""
citations = []
# Match: [^number]: title - URL
pattern = r"\[\^(\d+)\]:\s+([^-\n]+?)\s*-\s*(https?://[^\s\n]+)"
for match in re.finditer(pattern, text):
_, title, url = match.groups()
citations.append({
"title": title.strip(),
"url": url.strip(),
"format": "footnote",
})
return citations
def _extract_html_links(text: str) -> List[Dict[str, str]]:
"""
Extract HTML links <a href="url">title</a>.
Args:
text: Text to extract from
Returns:
List of citation dictionaries
"""
citations = []
pattern = r'<a\s+(?:[^>]*?\s)?href=(["\'])([^"\']+)\1[^>]*>([^<]+)</a>'
for match in re.finditer(pattern, text, re.IGNORECASE):
_, url, title = match.groups()
if url.startswith(("http://", "https://")):
citations.append({
"title": title.strip(),
"url": url.strip(),
"format": "html",
})
return citations