# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
# SPDX-License-Identifier: MIT

"""
Combined report evaluator orchestrating both automated metrics and LLM evaluation.
"""

import logging
from dataclasses import dataclass
from typing import Any, Dict, Optional

from .llm_judge import EvaluationResult, LLMJudge
from .metrics import ReportMetrics, compute_metrics, get_word_count_target

logger = logging.getLogger(__name__)


@dataclass
class CombinedEvaluation:
    """Combined evaluation results from metrics and LLM judge."""

    metrics: ReportMetrics
    llm_evaluation: Optional[EvaluationResult]
    final_score: float
    grade: str
    summary: str

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary format."""
        return {
            "metrics": self.metrics.to_dict(),
            "llm_evaluation": (
                self.llm_evaluation.to_dict() if self.llm_evaluation else None
            ),
            "final_score": self.final_score,
            "grade": self.grade,
            "summary": self.summary,
        }


def score_to_grade(score: float) -> str:
    """Convert numeric score to letter grade."""
    if score >= 9.0:
        return "A+"
    elif score >= 8.5:
        return "A"
    elif score >= 8.0:
        return "A-"
    elif score >= 7.5:
        return "B+"
    elif score >= 7.0:
        return "B"
    elif score >= 6.5:
        return "B-"
    elif score >= 6.0:
        return "C+"
    elif score >= 5.5:
        return "C"
    elif score >= 5.0:
        return "C-"
    elif score >= 4.0:
        return "D"
    else:
        return "F"
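

# For reference, a few mappings implied by the thresholds above:
#   score_to_grade(9.2) -> "A+", score_to_grade(7.3) -> "B", score_to_grade(3.9) -> "F"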


class ReportEvaluator:
    """
    Combined report evaluator using both automated metrics and LLM-as-Judge.

    This evaluator provides comprehensive report quality assessment by:
    1. Computing automated metrics (fast, deterministic)
    2. Running LLM-based evaluation (nuanced, contextual)
    3. Combining both for a final score and grade
    """

    def __init__(self, llm: Any = None, use_llm: bool = True):
        """
        Initialize the evaluator.

        Args:
            llm: Optional LLM instance for LLM-as-Judge evaluation
            use_llm: Whether to use LLM evaluation (can be disabled for speed)
        """
        self.use_llm = use_llm
        self.llm_judge = LLMJudge(llm=llm) if use_llm else None

    def _compute_metrics_score(
        self, metrics: ReportMetrics, report_style: str
    ) -> float:
        """
        Convert automated metrics to a 0-10 score.

        Scoring breakdown:
        - Section coverage: 30%
        - Citation quality: 25%
        - Word count compliance: 20%
        - Source diversity: 15%
        - Image inclusion: 10%
        """
        score = 0.0

        section_score = metrics.section_coverage_score * 10
        score += section_score * 0.30

        citation_score = min(metrics.citation_count / 10, 1.0) * 10
        score += citation_score * 0.25

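        # Word-count compliance: full credit inside the target band, a linear ramp
        # capped at 8/10 when under the minimum, and a deduction of five points per
        # 100% overshoot (floored at 5/10) when over the maximum.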
        target = get_word_count_target(report_style)
        if target:
            if target["min"] <= metrics.word_count <= target["max"]:
                word_score = 10.0
            elif metrics.word_count < target["min"]:
                word_score = (metrics.word_count / target["min"]) * 8
            else:
                excess_ratio = metrics.word_count / target["max"]
                word_score = max(10 - (excess_ratio - 1) * 5, 5)
            score += word_score * 0.20

        diversity_score = min(metrics.unique_sources / 5, 1.0) * 10
        score += diversity_score * 0.15

        image_score = min(metrics.image_count / 3, 1.0) * 10
        score += image_score * 0.10

        return round(score, 2)
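
    # Worked example for _compute_metrics_score with made-up numbers: full section
    # coverage, 8 citations, an in-range word count, 4 unique sources and 2 images
    # gives 0.30*10 + 0.25*8 + 0.20*10 + 0.15*8 + 0.10*6.67, roughly 8.87.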

    def _generate_summary(
        self,
        metrics: ReportMetrics,
        llm_eval: Optional[EvaluationResult],
        final_score: float,
        grade: str,
    ) -> str:
        """Generate a human-readable evaluation summary."""
        lines = [f"Report Grade: {grade} ({final_score}/10)", ""]

        lines.append("**Automated Metrics:**")
        lines.append(f"- Word Count: {metrics.word_count}")
        lines.append(f"- Citations: {metrics.citation_count}")
        lines.append(f"- Unique Sources: {metrics.unique_sources}")
        lines.append(f"- Images: {metrics.image_count}")
        lines.append(
            f"- Section Coverage: {metrics.section_coverage_score * 100:.0f}%"
        )

        if metrics.sections_missing:
            lines.append(f"- Missing Sections: {', '.join(metrics.sections_missing)}")

        if llm_eval:
            lines.append("")
            lines.append("**LLM Evaluation:**")
            for criterion, score in llm_eval.scores.items():
                lines.append(f"- {criterion.replace('_', ' ').title()}: {score}/10")

            if llm_eval.strengths:
                lines.append("")
                lines.append("**Strengths:**")
                for strength in llm_eval.strengths[:3]:
                    lines.append(f"- {strength}")

            if llm_eval.weaknesses:
                lines.append("")
                lines.append("**Areas for Improvement:**")
                for weakness in llm_eval.weaknesses[:3]:
                    lines.append(f"- {weakness}")

        return "\n".join(lines)

    async def evaluate(
        self,
        report: str,
        query: str,
        report_style: str = "default",
    ) -> CombinedEvaluation:
        """
        Evaluate a report using both metrics and LLM.

        Args:
            report: The report text to evaluate
            query: The original research query
            report_style: The style of the report

        Returns:
            CombinedEvaluation with full results
        """
        metrics = compute_metrics(report, report_style)
        metrics_score = self._compute_metrics_score(metrics, report_style)

        llm_eval = None
        if self.use_llm and self.llm_judge:
            try:
                llm_eval = await self.llm_judge.evaluate(report, query, report_style)
            except Exception as e:
                logger.warning(f"LLM evaluation failed, using metrics only: {e}")

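        # Blend the two signals: 40% deterministic metrics, 60% LLM judgement,
        # falling back to metrics alone when the LLM pass is unavailable or failed.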
        if llm_eval and llm_eval.overall_score > 0:
            final_score = (metrics_score * 0.4) + (llm_eval.weighted_score * 0.6)
        else:
            final_score = metrics_score

        final_score = round(final_score, 2)
        grade = score_to_grade(final_score)

        summary = self._generate_summary(metrics, llm_eval, final_score, grade)

        return CombinedEvaluation(
            metrics=metrics,
            llm_evaluation=llm_eval,
            final_score=final_score,
            grade=grade,
            summary=summary,
        )

    def evaluate_sync(
        self,
        report: str,
        query: str,
        report_style: str = "default",
    ) -> CombinedEvaluation:
        """Synchronous version of evaluate."""
        import asyncio

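        # Note: asyncio.run() cannot be called from a running event loop; use the
        # async evaluate() coroutine directly in that case.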
        return asyncio.run(self.evaluate(report, query, report_style))

    def evaluate_metrics_only(
        self,
        report: str,
        report_style: str = "default",
    ) -> Dict[str, Any]:
        """
        Quick evaluation using only automated metrics (no LLM).

        Args:
            report: The report text to evaluate
            report_style: The style of the report

        Returns:
            Dictionary with metrics and score
        """
        metrics = compute_metrics(report, report_style)
        metrics_score = self._compute_metrics_score(metrics, report_style)
        grade = score_to_grade(metrics_score)

        return {
            "metrics": metrics.to_dict(),
            "score": metrics_score,
            "grade": grade,
        }
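

# A minimal usage sketch (illustrative only: the report text below is made up, and
# LLM evaluation is disabled so no model client is required). Because this module
# uses relative imports, run it with `python -m` from its parent package.
if __name__ == "__main__":
    sample_report = "# Key Findings\n\nExample body text with a citation [1].\n"
    evaluator = ReportEvaluator(use_llm=False)
    result = evaluator.evaluate_metrics_only(sample_report, report_style="default")
    print(result["grade"], result["score"])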