# Source code for chatbot_eval.evaluation.evaluator

from __future__ import annotations

"""Evaluation orchestration for per-row scoring and aggregate runs."""

import json
from dataclasses import dataclass
from time import perf_counter
from typing import Any

from chatbot_eval.types import Sample


[docs] @dataclass(slots=True) class Evaluator: """Evaluate bots against samples and collect row-level outputs.""" metrics: list[object]
[docs] def evaluate_sample(self, sample: Sample, bot: Any) -> dict[str, str]: metric_values: dict[str, float] = {} metric_details: dict[str, dict] = {} started = perf_counter() try: bot_result = bot.answer(sample.question) latency_ms = (perf_counter() - started) * 1000.0 metric_values['latency_ms'] = round(latency_ms, 3) for metric in self.metrics: try: result = metric.score(sample, bot_result) metric_values[result.name] = result.score metric_details[result.name] = result.details except Exception as exc: name = getattr(metric, 'name', metric.__class__.__name__) metric_values[name] = 0.0 metric_details[name] = {'error': str(exc)} generated_answer = bot_result.answer bot_metadata = bot_result.metadata except Exception as exc: generated_answer = '' bot_metadata = {'error': str(exc)} metric_values['latency_ms'] = round((perf_counter() - started) * 1000.0, 3) metric_details['bot_error'] = {'error': str(exc)} return { 'bot_name': bot.name, 'question': sample.question, 'expected_answer': sample.expected_answer, 'generated_answer': generated_answer, 'metrics_json': json.dumps(metric_values, ensure_ascii=False), 'metric_details_json': json.dumps(metric_details, ensure_ascii=False), 'bot_metadata_json': json.dumps(bot_metadata, ensure_ascii=False), }
[docs] def evaluate_dataset(self, samples: list[Sample], bots: list[object]) -> list[dict[str, str]]: """Evaluate all ``bots`` against all ``samples``.""" rows: list[dict[str, str]] = [] for bot in bots: for sample in samples: rows.append(self.evaluate_sample(sample, bot)) return rows