# deerflow2/src/eval/evaluator.py

# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
# SPDX-License-Identifier: MIT
"""
Combined report evaluator orchestrating both automated metrics and LLM evaluation.
"""

import logging
from dataclasses import dataclass
from typing import Any, Dict, Optional

from .llm_judge import EvaluationResult, LLMJudge
from .metrics import ReportMetrics, compute_metrics, get_word_count_target

logger = logging.getLogger(__name__)


@dataclass
class CombinedEvaluation:
    """Combined evaluation results from metrics and LLM judge."""

    metrics: ReportMetrics
    llm_evaluation: Optional[EvaluationResult]
    final_score: float
    grade: str
    summary: str

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary format."""
        return {
            "metrics": self.metrics.to_dict(),
            "llm_evaluation": (
                self.llm_evaluation.to_dict() if self.llm_evaluation else None
            ),
            "final_score": self.final_score,
            "grade": self.grade,
            "summary": self.summary,
        }


def score_to_grade(score: float) -> str:
    """Convert numeric score to letter grade."""
    if score >= 9.0:
        return "A+"
    elif score >= 8.5:
        return "A"
    elif score >= 8.0:
        return "A-"
    elif score >= 7.5:
        return "B+"
    elif score >= 7.0:
        return "B"
    elif score >= 6.5:
        return "B-"
    elif score >= 6.0:
        return "C+"
    elif score >= 5.5:
        return "C"
    elif score >= 5.0:
        return "C-"
    elif score >= 4.0:
        return "D"
    else:
        return "F"


class ReportEvaluator:
    """
    Combined report evaluator using both automated metrics and LLM-as-Judge.

    This evaluator provides comprehensive report quality assessment by:

    1. Computing automated metrics (fast, deterministic)
    2. Running LLM-based evaluation (nuanced, contextual)
    3. Combining both for a final score and grade
    """

    def __init__(self, llm: Any = None, use_llm: bool = True):
        """
        Initialize the evaluator.

        Args:
            llm: Optional LLM instance for LLM-as-Judge evaluation
            use_llm: Whether to use LLM evaluation (can be disabled for speed)
        """
        self.use_llm = use_llm
        self.llm_judge = LLMJudge(llm=llm) if use_llm else None

    def _compute_metrics_score(
        self, metrics: ReportMetrics, report_style: str
    ) -> float:
        """
        Convert automated metrics to a 0-10 score.

        Scoring breakdown:
        - Section coverage: 30%
        - Citation quality: 25%
        - Word count compliance: 20%
        - Source diversity: 15%
        - Image inclusion: 10%
        """
        score = 0.0

        section_score = metrics.section_coverage_score * 10
        score += section_score * 0.30

        citation_score = min(metrics.citation_count / 10, 1.0) * 10
        score += citation_score * 0.25

        target = get_word_count_target(report_style)
        if target:
            if target["min"] <= metrics.word_count <= target["max"]:
                word_score = 10.0
            elif metrics.word_count < target["min"]:
                word_score = (metrics.word_count / target["min"]) * 8
            else:
                excess_ratio = metrics.word_count / target["max"]
                word_score = max(10 - (excess_ratio - 1) * 5, 5)
            score += word_score * 0.20

        diversity_score = min(metrics.unique_sources / 5, 1.0) * 10
        score += diversity_score * 0.15

        image_score = min(metrics.image_count / 3, 1.0) * 10
        score += image_score * 0.10

        return round(score, 2)

    def _generate_summary(
        self,
        metrics: ReportMetrics,
        llm_eval: Optional[EvaluationResult],
        final_score: float,
        grade: str,
    ) -> str:
        """Generate a human-readable evaluation summary."""
        lines = [f"Report Grade: {grade} ({final_score}/10)", ""]

        lines.append("**Automated Metrics:**")
        lines.append(f"- Word Count: {metrics.word_count}")
        lines.append(f"- Citations: {metrics.citation_count}")
        lines.append(f"- Unique Sources: {metrics.unique_sources}")
        lines.append(f"- Images: {metrics.image_count}")
        lines.append(
            f"- Section Coverage: {metrics.section_coverage_score * 100:.0f}%"
        )
        if metrics.sections_missing:
            lines.append(f"- Missing Sections: {', '.join(metrics.sections_missing)}")

        if llm_eval:
            lines.append("")
            lines.append("**LLM Evaluation:**")
            for criterion, score in llm_eval.scores.items():
                lines.append(f"- {criterion.replace('_', ' ').title()}: {score}/10")
            if llm_eval.strengths:
                lines.append("")
                lines.append("**Strengths:**")
                for strength in llm_eval.strengths[:3]:
                    lines.append(f"- {strength}")
            if llm_eval.weaknesses:
                lines.append("")
                lines.append("**Areas for Improvement:**")
                for weakness in llm_eval.weaknesses[:3]:
                    lines.append(f"- {weakness}")

        return "\n".join(lines)

    async def evaluate(
        self,
        report: str,
        query: str,
        report_style: str = "default",
    ) -> CombinedEvaluation:
        """
        Evaluate a report using both metrics and LLM.

        Args:
            report: The report text to evaluate
            query: The original research query
            report_style: The style of report

        Returns:
            CombinedEvaluation with full results
        """
        metrics = compute_metrics(report, report_style)
        metrics_score = self._compute_metrics_score(metrics, report_style)

        llm_eval = None
        if self.use_llm and self.llm_judge:
            try:
                llm_eval = await self.llm_judge.evaluate(report, query, report_style)
            except Exception as e:
                logger.warning(f"LLM evaluation failed, using metrics only: {e}")
        if llm_eval and llm_eval.overall_score > 0:
            final_score = (metrics_score * 0.4) + (llm_eval.weighted_score * 0.6)
        else:
            final_score = metrics_score
        final_score = round(final_score, 2)

        grade = score_to_grade(final_score)
        summary = self._generate_summary(metrics, llm_eval, final_score, grade)

        return CombinedEvaluation(
            metrics=metrics,
            llm_evaluation=llm_eval,
            final_score=final_score,
            grade=grade,
            summary=summary,
        )

    def evaluate_sync(
        self,
        report: str,
        query: str,
        report_style: str = "default",
    ) -> CombinedEvaluation:
        """Synchronous version of evaluate."""
        import asyncio

        return asyncio.run(self.evaluate(report, query, report_style))

    def evaluate_metrics_only(
        self,
        report: str,
        report_style: str = "default",
    ) -> Dict[str, Any]:
        """
        Quick evaluation using only automated metrics (no LLM).

        Args:
            report: The report text to evaluate
            report_style: The style of report

        Returns:
            Dictionary with metrics and score
        """
        metrics = compute_metrics(report, report_style)
        metrics_score = self._compute_metrics_score(metrics, report_style)
        grade = score_to_grade(metrics_score)

        return {
            "metrics": metrics.to_dict(),
            "score": metrics_score,
            "grade": grade,
        }
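

if __name__ == "__main__":
    # Minimal usage sketch: runs the metrics-only path so no LLM client is
    # needed. The sample report text below is made up purely for illustration.
    sample_report = (
        "# Key Points\n"
        "- Example finding with a citation [1].\n\n"
        "## Overview\n"
        "A short overview paragraph.\n\n"
        "[1]: https://example.com\n"
    )
    evaluator = ReportEvaluator(use_llm=False)
    result = evaluator.evaluate_metrics_only(sample_report, report_style="default")
    print(f"Score: {result['score']}, Grade: {result['grade']}")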