# Source code for chatbot_eval.metrics.basic

from __future__ import annotations

"""Deterministic baseline metrics used by the evaluator."""

import re
from dataclasses import dataclass

from chatbot_eval.types import BotResult, MetricResult, Sample


def _normalize(text: str) -> str:
    """Return *text* lower-cased, trimmed, with whitespace runs collapsed.

    Leading and trailing whitespace is removed and every internal run of
    whitespace becomes a single space.
    """
    # str.split() with no separator drops leading/trailing whitespace and
    # splits on runs of whitespace, so re-joining collapses each run.
    words = text.lower().split()
    return ' '.join(words)


def _tokens(text: str) -> set[str]:
    """Return the set of lower-cased, punctuation-trimmed tokens in *text*.

    The text is split on whitespace; each piece has leading and trailing
    punctuation (``.,!?;:()[]{}"'``) removed and is lower-cased. Pieces that
    consist solely of punctuation are dropped instead of contributing an
    empty-string token.
    """
    trimmed = (piece.strip(".,!?;:()[]{}\"'").lower() for piece in text.split())
    # Filter AFTER stripping: str.split() never yields blank pieces, but a
    # piece such as "!" or "..." becomes "" once its punctuation is removed.
    return {token for token in trimmed if token}



@dataclass(slots=True)
class ExactMatchMetric:
    """Binary exact-match score after simple normalization.

    Both the expected and the generated answer are passed through
    ``_normalize`` before they are compared.
    """

    # Identifier reported in the resulting ``MetricResult``.
    name: str = 'exact_match'

    def score(self, sample: Sample, bot_result: BotResult) -> MetricResult:
        """Return 1.0 when the normalized answers are equal, otherwise 0.0.

        Args:
            sample: Supplies the reference text via ``expected_answer``.
            bot_result: Supplies the generated text via ``answer``.

        Returns:
            A ``MetricResult`` carrying this metric's name and the score.
        """
        matched = _normalize(sample.expected_answer) == _normalize(bot_result.answer)
        return MetricResult(name=self.name, score=1.0 if matched else 0.0)





@dataclass(slots=True)
class KeywordRecallMetric:
    """Recall of expected-answer tokens present in the generated answer."""

    # Identifier reported in the resulting ``MetricResult``.
    name: str = 'keyword_recall'

    def score(self, sample: Sample, bot_result: BotResult) -> MetricResult:
        """Return the fraction of expected tokens found in the answer.

        Both texts are tokenized with ``_tokens``. The score is the number of
        expected tokens that also occur in the answer divided by the number
        of expected tokens, rounded to four decimal places. When the
        expected answer yields no tokens the score is 0.0.

        Args:
            sample: Supplies the reference text via ``expected_answer``.
            bot_result: Supplies the generated text via ``answer``.

        Returns:
            A ``MetricResult`` carrying this metric's name and the score.
        """
        expected = _tokens(sample.expected_answer)
        produced = _tokens(bot_result.answer)
        if expected:
            recall = len(expected & produced) / len(expected)
        else:
            # Nothing to recall; avoid dividing by zero.
            recall = 0.0
        return MetricResult(name=self.name, score=round(recall, 4))





@dataclass(slots=True)
class AnswerLengthMetric:
    """Character length of the answer as a communication proxy."""

    # Identifier reported in the resulting ``MetricResult``.
    name: str = 'answer_length_chars'

    def score(self, sample: Sample, bot_result: BotResult) -> MetricResult:
        """Return the length of the generated answer in characters.

        Args:
            sample: Not used by this metric; accepted so the signature
                matches the other metrics in this module.
            bot_result: Supplies the generated text via ``answer``.

        Returns:
            A ``MetricResult`` whose score is ``len(bot_result.answer)`` as a
            float.
        """
        length = len(bot_result.answer)
        return MetricResult(name=self.name, score=float(length))





@dataclass(slots=True)
class PolitenessMetric:
    """Simple heuristic scoring polite or helpful markers in the answer."""

    # Identifier reported in the resulting ``MetricResult``.
    name: str = 'politeness'

    def score(self, sample: Sample, bot_result: BotResult) -> MetricResult:
        """Score the answer by how many politeness markers it contains.

        Each distinct marker phrase found in the lower-cased answer counts
        once. The count is halved and capped at 1.0, so one marker gives
        0.5 and two or more give 1.0.

        Args:
            sample: Not used by this metric; accepted so the signature
                matches the other metrics in this module.
            bot_result: Supplies the generated text via ``answer``.

        Returns:
            A ``MetricResult`` carrying this metric's name and the score,
            rounded to four decimal places.
        """
        answer = bot_result.answer.lower()
        markers = ('please', 'sorry', 'happy to help', 'can help')
        # Plain substring test: a marker embedded in a longer word
        # (e.g. "pleased") also counts.
        hits = sum(marker in answer for marker in markers)
        capped = min(hits / 2.0, 1.0)
        return MetricResult(name=self.name, score=round(capped, 4))