# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
# SPDX-License-Identifier: MIT

"""
Combined report evaluator orchestrating both automated metrics and LLM evaluation.
"""

import logging
from dataclasses import dataclass
from typing import Any, Dict, Optional

from .llm_judge import EvaluationResult, LLMJudge
from .metrics import ReportMetrics, compute_metrics, get_word_count_target

logger = logging.getLogger(__name__)


@dataclass
class CombinedEvaluation:
    """Combined evaluation results from metrics and LLM judge."""

    metrics: ReportMetrics
    llm_evaluation: Optional[EvaluationResult]
    final_score: float
    grade: str
    summary: str

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary format."""
        return {
            "metrics": self.metrics.to_dict(),
            "llm_evaluation": (
                self.llm_evaluation.to_dict() if self.llm_evaluation else None
            ),
            "final_score": self.final_score,
            "grade": self.grade,
            "summary": self.summary,
        }


def score_to_grade(score: float) -> str:
    """Convert numeric score to letter grade."""
    if score >= 9.0:
        return "A+"
    elif score >= 8.5:
        return "A"
    elif score >= 8.0:
        return "A-"
    elif score >= 7.5:
        return "B+"
    elif score >= 7.0:
        return "B"
    elif score >= 6.5:
        return "B-"
    elif score >= 6.0:
        return "C+"
    elif score >= 5.5:
        return "C"
    elif score >= 5.0:
        return "C-"
    elif score >= 4.0:
        return "D"
    else:
        return "F"


class ReportEvaluator:
    """
    Combined report evaluator using both automated metrics and LLM-as-Judge.

    This evaluator provides comprehensive report quality assessment by:
    1. Computing automated metrics (fast, deterministic)
    2. Running LLM-based evaluation (nuanced, contextual)
    3. Combining both for a final score and grade
    """

    def __init__(self, llm: Any = None, use_llm: bool = True):
        """
        Initialize the evaluator.

        Args:
            llm: Optional LLM instance for LLM-as-Judge evaluation
            use_llm: Whether to use LLM evaluation (can be disabled for speed)
        """
        self.use_llm = use_llm
        self.llm_judge = LLMJudge(llm=llm) if use_llm else None

    def _compute_metrics_score(
        self, metrics: ReportMetrics, report_style: str
    ) -> float:
        """
        Convert automated metrics to a 0-10 score.
        Scoring breakdown:
        - Section coverage: 30%
        - Citation quality: 25%
        - Word count compliance: 20%
        - Source diversity: 15%
        - Image inclusion: 10%
        """
        score = 0.0

        # Section coverage (30%): fraction of expected sections present
        section_score = metrics.section_coverage_score * 10
        score += section_score * 0.30

        # Citation quality (25%): saturates at 10 citations
        citation_score = min(metrics.citation_count / 10, 1.0) * 10
        score += citation_score * 0.25

        # Word count compliance (20%): full marks inside the target range,
        # scaled penalty for under- or over-length reports
        target = get_word_count_target(report_style)
        if target:
            if target["min"] <= metrics.word_count <= target["max"]:
                word_score = 10.0
            elif metrics.word_count < target["min"]:
                word_score = (metrics.word_count / target["min"]) * 8
            else:
                excess_ratio = metrics.word_count / target["max"]
                word_score = max(10 - (excess_ratio - 1) * 5, 5)
            score += word_score * 0.20

        # Source diversity (15%): saturates at 5 unique sources
        diversity_score = min(metrics.unique_sources / 5, 1.0) * 10
        score += diversity_score * 0.15

        # Image inclusion (10%): saturates at 3 images
        image_score = min(metrics.image_count / 3, 1.0) * 10
        score += image_score * 0.10

        return round(score, 2)

    def _generate_summary(
        self,
        metrics: ReportMetrics,
        llm_eval: Optional[EvaluationResult],
        final_score: float,
        grade: str,
    ) -> str:
        """Generate a human-readable evaluation summary."""
        lines = [f"Report Grade: {grade} ({final_score}/10)", ""]

        lines.append("**Automated Metrics:**")
        lines.append(f"- Word Count: {metrics.word_count}")
        lines.append(f"- Citations: {metrics.citation_count}")
        lines.append(f"- Unique Sources: {metrics.unique_sources}")
        lines.append(f"- Images: {metrics.image_count}")
        lines.append(
            f"- Section Coverage: {metrics.section_coverage_score * 100:.0f}%"
        )
        if metrics.sections_missing:
            lines.append(f"- Missing Sections: {', '.join(metrics.sections_missing)}")

        if llm_eval:
            lines.append("")
            lines.append("**LLM Evaluation:**")
            for criterion, score in llm_eval.scores.items():
                lines.append(f"- {criterion.replace('_', ' ').title()}: {score}/10")

            if llm_eval.strengths:
                lines.append("")
                lines.append("**Strengths:**")
                for strength in llm_eval.strengths[:3]:
                    lines.append(f"- {strength}")

            if llm_eval.weaknesses:
                lines.append("")
                lines.append("**Areas for Improvement:**")
                for weakness in llm_eval.weaknesses[:3]:
                    lines.append(f"- {weakness}")

        return "\n".join(lines)

    async def evaluate(
        self,
        report: str,
        query: str,
        report_style: str = "default",
    ) -> CombinedEvaluation:
        """
        Evaluate a report using both metrics and LLM.

        Args:
            report: The report text to evaluate
            query: The original research query
            report_style: The style of report

        Returns:
            CombinedEvaluation with full results
        """
        metrics = compute_metrics(report, report_style)
        metrics_score = self._compute_metrics_score(metrics, report_style)

        llm_eval = None
        if self.use_llm and self.llm_judge:
            try:
                llm_eval = await self.llm_judge.evaluate(report, query, report_style)
            except Exception as e:
                logger.warning(f"LLM evaluation failed, using metrics only: {e}")

        # Blend scores: LLM judgment carries 60% when available,
        # otherwise fall back to the automated metrics score alone.
        if llm_eval and llm_eval.overall_score > 0:
            final_score = (metrics_score * 0.4) + (llm_eval.weighted_score * 0.6)
        else:
            final_score = metrics_score

        final_score = round(final_score, 2)
        grade = score_to_grade(final_score)
        summary = self._generate_summary(metrics, llm_eval, final_score, grade)

        return CombinedEvaluation(
            metrics=metrics,
            llm_evaluation=llm_eval,
            final_score=final_score,
            grade=grade,
            summary=summary,
        )

    def evaluate_sync(
        self,
        report: str,
        query: str,
        report_style: str = "default",
    ) -> CombinedEvaluation:
        """Synchronous version of evaluate."""
        import asyncio

        return asyncio.run(self.evaluate(report, query, report_style))

    def evaluate_metrics_only(
        self,
        report: str,
        report_style: str = "default",
    ) -> Dict[str, Any]:
        """
        Quick evaluation using only automated metrics (no LLM).
        Args:
            report: The report text to evaluate
            report_style: The style of report

        Returns:
            Dictionary with metrics and score
        """
        metrics = compute_metrics(report, report_style)
        metrics_score = self._compute_metrics_score(metrics, report_style)
        grade = score_to_grade(metrics_score)

        return {
            "metrics": metrics.to_dict(),
            "score": metrics_score,
            "grade": grade,
        }
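
if __name__ == "__main__":
    # Minimal usage sketch (illustrative only). Run as a module, e.g.
    # `python -m <package>.evaluator`, so the relative imports above resolve.
    # The report text below is a hypothetical placeholder, not real pipeline
    # output; in practice, pass an LLM instance and call `evaluate` or
    # `evaluate_sync` for the full combined metrics + LLM-as-Judge assessment.
    _sample_report = (
        "# Key Findings\n\n"
        "An example finding supported by a citation [1].\n\n"
        "## References\n\n"
        "[1] https://example.com/source\n"
    )
    _evaluator = ReportEvaluator(use_llm=False)  # metrics-only, no LLM required
    _result = _evaluator.evaluate_metrics_only(_sample_report, report_style="default")
    print(_result["grade"], _result["score"])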