deerflow2/src/citations/formatter.py

# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
# SPDX-License-Identifier: MIT

"""
Citation formatter for generating citation sections and inline references.
"""

import re
from typing import Any, Dict, List

from .models import Citation


class CitationFormatter:
    """
    Renders citations as markdown for use in reports.

    The style passed at construction selects the inline marker form:
    - numbered: [1], [2], etc.
    - superscript: ¹, ², etc.
    - footnote: [^1], [^2], etc.
    - inline: (Author, Year) or (Source)

    Only 'superscript' and 'footnote' are special-cased by
    format_inline_marker; any other style value, 'inline' included,
    currently produces the numbered form.
    """

    # Decimal digit -> Unicode superscript glyph, used by the 'superscript' style.
    SUPERSCRIPT_MAP = dict(zip("0123456789", "⁰¹²³⁴⁵⁶⁷⁸⁹"))

    def __init__(self, style: str = "numbered"):
        """
        Create a formatter bound to one citation style.

        Args:
            style: Citation style ('numbered', 'superscript', 'footnote', 'inline')
        """
        self.style = style

    def format_inline_marker(self, number: int) -> str:
        """
        Build the marker that is placed inline next to cited text.

        Args:
            number: The citation number

        Returns:
            Marker string in the formatter's style
        """
        if self.style == "footnote":
            return f"[^{number}]"

        if self.style == "superscript":
            glyphs = self.SUPERSCRIPT_MAP
            # Characters without a superscript glyph are passed through as-is.
            return "".join([glyphs.get(char, char) for char in str(number)])

        # Every remaining style falls back to the numbered form.
        return f"[{number}]"

    def format_reference(self, citation: Citation) -> str:
        """
        Render one detailed, multi-line reference entry.

        Args:
            citation: The citation to format

        Returns:
            Reference text, one metadata field per line
        """
        meta = citation.metadata

        # The first line always carries the number and the bold title.
        lines = [f"[{citation.number}] **{meta.title}**"]

        # Optional fields are emitted only when they are truthy.
        if meta.author:
            lines.append(f"   *{meta.author}*")
        if meta.domain:
            lines.append(f"   Source: {meta.domain}")
        if meta.published_date:
            lines.append(f"   Published: {meta.published_date}")

        # The URL line is unconditional.
        lines.append(f"   URL: {meta.url}")

        description = meta.description
        if description:
            # Quote at most 200 characters and flag truncation with an ellipsis.
            ellipsis = "..." if len(description) > 200 else ""
            lines.append(f"   > {description[:200]}{ellipsis}")

        return "\n".join(lines)

    def format_simple_reference(self, citation: Citation) -> str:
        """
        Render a citation as a bare markdown list item (title + URL).

        Args:
            citation: The citation to format

        Returns:
            Single-line markdown link item
        """
        meta = citation.metadata
        return f"- [{meta.title}]({meta.url})"

    def format_rich_reference(self, citation: Citation) -> str:
        """
        Render a markdown list item followed by an HTML-comment annotation.

        The annotation line lists domain, relevance (two decimals, only
        when greater than zero) and the first 10 characters of the access
        timestamp; it is omitted when none of these are available.

        Args:
            citation: The citation to format

        Returns:
            Link item, optionally followed by an annotation line
        """
        meta = citation.metadata
        link_line = f"- [{meta.title}]({meta.url})"

        notes = []
        if meta.domain:
            notes.append(f"domain: {meta.domain}")
        if meta.relevance_score > 0:
            notes.append(f"relevance: {meta.relevance_score:.2f}")
        if meta.accessed_at:
            notes.append(f"accessed: {meta.accessed_at[:10]}")

        if not notes:
            return link_line

        joined_notes = ", ".join(notes)
        return f"{link_line}\n  <!-- {joined_notes} -->"

    def format_citations_section(
        self, citations: List[Citation], include_metadata: bool = True
    ) -> str:
        """
        Build the complete "Key Citations" section of a report.

        Args:
            citations: List of citations to include
            include_metadata: Whether to include rich metadata

        Returns:
            Section markdown, or an empty string when there are no citations
        """
        if not citations:
            return ""

        if include_metadata:
            render = self.format_rich_reference
        else:
            render = self.format_simple_reference

        section = ["## Key Citations", ""]
        for entry in citations:
            # Each reference is followed by a blank separator line.
            section.extend((render(entry), ""))

        return "\n".join(section)

    def format_footnotes_section(self, citations: List[Citation]) -> str:
        """
        Build a footnote block: a horizontal rule followed by one
        "[^n]: title - url" line per citation.

        Args:
            citations: List of citations

        Returns:
            Footnotes markdown, or an empty string when there are no citations
        """
        if not citations:
            return ""

        preamble = ["", "---", ""]
        footnotes = [
            f"[^{entry.number}]: {entry.metadata.title} - {entry.metadata.url}"
            for entry in citations
        ]
        return "\n".join(preamble + footnotes)

    def add_citation_markers_to_text(
        self, text: str, citations: List[Citation], url_to_number: Dict[str, int]
    ) -> str:
        """
        Append a citation marker after every markdown link whose URL is known.

        Args:
            text: The text to process
            citations: Available citations (not consulted by this method)
            url_to_number: Mapping from URL to citation number

        Returns:
            Text with markers appended to the matching links
        """
        link_regex = re.compile(r"\[([^\]]+)\]\(([^)]+)\)")

        def _append_marker(found):
            link = found.group(0)
            target = found.group(2)

            # Links to URLs without a citation number are left untouched.
            if target not in url_to_number:
                return link
            return f"{link}{self.format_inline_marker(url_to_number[target])}"

        return link_regex.sub(_append_marker, text)

    @staticmethod
    def build_citation_data_json(citations: List[Citation]) -> str:
        """
        Serialize citations into a JSON payload for frontend use.

        Args:
            citations: List of citations

        Returns:
            JSON string with a "citations" list and a "count"
        """
        import json

        serialized = [entry.to_dict() for entry in citations]
        payload = {"citations": serialized, "count": len(citations)}

        # Non-ASCII characters are kept literal instead of \u-escaped.
        return json.dumps(payload, ensure_ascii=False)


def parse_citations_from_report(
    report: str, section_patterns: List[str] = None
) -> Dict[str, Any]:
    """
    Extract citation information from report, supporting multiple formats.

    Every citation section in the report is located by its header, and the
    section body is scanned for citations in these formats:
    - Markdown: [Title](URL)
    - Numbered: [1] Title - URL
    - Footnote: [^1]: Title - URL
    - HTML: <a href="URL">Title</a>

    A section body starts on the line after its header and ends just before
    the next line that begins with "##" (which includes deeper headings such
    as "###"), or at the end of the report. Links that appear under a later
    heading are therefore never attributed to the citation section, whether
    or not a blank line precedes that heading.

    Args:
        report: The report markdown text
        section_patterns: Regex patterns (matched case-insensitively) for
            citation section headers. When None, the headers "## Key
            Citations", "## References", "## Sources" and "## Bibliography"
            are recognised.

    Returns:
        Dictionary with 'citations' list and 'count' of unique citations.
        Citations are de-duplicated by URL; the first occurrence wins.
    """
    if section_patterns is None:
        section_patterns = [
            r"(?:##\s*Key Citations|##\s*References|##\s*Sources|##\s*Bibliography)",
        ]

    # Header remainder + body. The body is consumed one line per iteration,
    # and the negative lookahead is evaluated at the START of each line, so a
    # "##" heading stops the section even when it directly follows a content
    # line. (Looking ahead for "\n##" at line starts only detected headings
    # that were preceded by an empty line.)
    section_body = r"\s*\n((?:(?!##).*\n?)*)"

    citations = []

    # 1. Find each citation section and extract citations in every format
    for pattern in section_patterns:
        section_matches = re.finditer(
            pattern + section_body,
            report,
            re.IGNORECASE | re.MULTILINE,
        )

        for section_match in section_matches:
            section = section_match.group(1)

            citations.extend(_extract_markdown_links(section))
            citations.extend(_extract_numbered_citations(section))
            citations.extend(_extract_footnote_citations(section))
            citations.extend(_extract_html_links(section))

    # 2. Deduplicate by URL, keeping the first citation seen for each URL
    unique_citations = {}
    for citation in citations:
        url = citation.get("url", "")
        if url:
            unique_citations.setdefault(url, citation)

    return {
        "citations": list(unique_citations.values()),
        "count": len(unique_citations),
    }


def _extract_markdown_links(text: str) -> List[Dict[str, str]]:
    """
    Extract Markdown links [title](url).

    Only absolute http:// and https:// destinations are returned. Whitespace
    around the destination is ignored, and an optional link title following
    the destination (``[title](url "tooltip")``) is not part of the URL.

    Args:
        text: Text to extract from

    Returns:
        List of citation dictionaries with title, url, and format
    """
    citations = []
    pattern = r"\[([^\]]+)\]\(([^)]+)\)"

    for match in re.finditer(pattern, text):
        title, target = match.groups()

        # The destination is the first whitespace-delimited token; splitting
        # also discards padding so the scheme check sees the real URL.
        tokens = target.split(None, 1)
        if not tokens:
            continue

        url = tokens[0]
        if url.startswith(("http://", "https://")):
            citations.append({
                "title": title.strip(),
                "url": url,
                "format": "markdown",
            })

    return citations


def _extract_numbered_citations(text: str) -> List[Dict[str, str]]:
    """
    Extract numbered citations [1] Title - URL.

    The title is everything on the line between the number and the hyphen
    that immediately precedes the http(s) URL, so titles may themselves
    contain hyphens (e.g. "State-of-the-art survey").

    Args:
        text: Text to extract from

    Returns:
        List of citation dictionaries
    """
    citations = []
    # Match: [number] title - URL
    # The title is matched lazily and may contain "-"; the separator is the
    # first hyphen that is directly followed by an http(s) URL.
    pattern = r"\[\d+\]\s+([^\n]+?)\s*-\s*(https?://[^\s\n]+)"

    for match in re.finditer(pattern, text):
        title, url = match.groups()
        citations.append({
            "title": title.strip(),
            "url": url.strip(),
            "format": "numbered",
        })

    return citations


def _extract_footnote_citations(text: str) -> List[Dict[str, str]]:
    """
    Extract footnote citations [^1]: Title - URL.

    The title is everything on the line between the footnote label and the
    hyphen that immediately precedes the http(s) URL, so titles may
    themselves contain hyphens (e.g. "Large-scale study").

    Args:
        text: Text to extract from

    Returns:
        List of citation dictionaries
    """
    citations = []
    # Match: [^number]: title - URL
    # The title is matched lazily and may contain "-"; the separator is the
    # first hyphen that is directly followed by an http(s) URL.
    pattern = r"\[\^\d+\]:\s+([^\n]+?)\s*-\s*(https?://[^\s\n]+)"

    for match in re.finditer(pattern, text):
        title, url = match.groups()
        citations.append({
            "title": title.strip(),
            "url": url.strip(),
            "format": "footnote",
        })

    return citations


def _extract_html_links(text: str) -> List[Dict[str, str]]:
    """
    Collect HTML anchors of the form <a href="url">title</a>.

    The href may be single- or double-quoted, tag matching is
    case-insensitive, and the link text must not contain nested tags.
    Anchors whose href does not start with http:// or https:// are skipped.

    Args:
        text: Text to scan for anchor tags

    Returns:
        List of citation dictionaries
    """
    anchor_regex = re.compile(
        r'<a\s+(?:[^>]*?\s)?href=(["\'])([^"\']+)\1[^>]*>([^<]+)</a>',
        re.IGNORECASE,
    )

    # Group 2 is the href value, group 3 the visible link text.
    return [
        {
            "title": found.group(3).strip(),
            "url": found.group(2).strip(),
            "format": "html",
        }
        for found in anchor_regex.finditer(text)
        if found.group(2).startswith(("http://", "https://"))
    ]