Source code for chatbot_eval.evaluation.evaluator

from __future__ import annotations

"""Evaluation orchestration for per-row scoring and aggregate runs."""

import json
from dataclasses import dataclass
from time import perf_counter
from typing import Any

from chatbot_eval.types import Sample



[docs]
@dataclass(slots=True)
class Evaluator:
    """Evaluate bots against samples and collect row-level outputs."""

    metrics: list[object]


[docs]
    def evaluate_sample(self, sample: Sample, bot: Any) -> dict[str, str]:
        metric_values: dict[str, float] = {}
        metric_details: dict[str, dict] = {}
        started = perf_counter()
        try:
            bot_result = bot.answer(sample.question)
            latency_ms = (perf_counter() - started) * 1000.0
            metric_values['latency_ms'] = round(latency_ms, 3)
            for metric in self.metrics:
                try:
                    result = metric.score(sample, bot_result)
                    metric_values[result.name] = result.score
                    metric_details[result.name] = result.details
                except Exception as exc:
                    name = getattr(metric, 'name', metric.__class__.__name__)
                    metric_values[name] = 0.0
                    metric_details[name] = {'error': str(exc)}
            generated_answer = bot_result.answer
            bot_metadata = bot_result.metadata
        except Exception as exc:
            generated_answer = ''
            bot_metadata = {'error': str(exc)}
            metric_values['latency_ms'] = round((perf_counter() - started) * 1000.0, 3)
            metric_details['bot_error'] = {'error': str(exc)}
        return {
            'bot_name': bot.name,
            'question': sample.question,
            'expected_answer': sample.expected_answer,
            'generated_answer': generated_answer,
            'metrics_json': json.dumps(metric_values, ensure_ascii=False),
            'metric_details_json': json.dumps(metric_details, ensure_ascii=False),
            'bot_metadata_json': json.dumps(bot_metadata, ensure_ascii=False),
        }



[docs]
    def evaluate_dataset(self, samples: list[Sample], bots: list[object]) -> list[dict[str, str]]:
        """Evaluate all ``bots`` against all ``samples``."""

        rows: list[dict[str, str]] = []
        for bot in bots:
            for sample in samples:
                rows.append(self.evaluate_sample(sample, bot))
        return rows