# Source code for chatbot_eval.bots.semantic_match

from __future__ import annotations

"""Deterministic FAQ matcher based on cosine overlap of token sets."""

from dataclasses import dataclass
from math import sqrt
from pathlib import Path

from chatbot_eval.bots.base import BaseBot
from chatbot_eval.io.csv_loader import load_samples_from_csv
from chatbot_eval.types import BotResult, Sample


def _tokenize(text: str) -> set[str]:
    """Tokenize text into a lower-cased set with light punctuation stripping.

    The text is split on whitespace; each piece has leading/trailing
    punctuation removed and is lower-cased. Pieces that consist solely of
    stripped punctuation (for example a standalone ``?`` or ``...``) are
    dropped, so the empty string never appears as a token.

    Args:
        text: Arbitrary input text.

    Returns:
        The set of distinct, non-empty normalized tokens.
    """

    # ``str.split()`` never yields blank pieces, so the emptiness check has to
    # happen *after* stripping punctuation, not before.
    normalized = (token.strip(".,!?;:()[]{}\"'").lower() for token in text.split())
    return {token for token in normalized if token}


def _cosine_overlap(a: str, b: str) -> float:
    """Compute cosine similarity between the token sets of two texts.

    Each text is treated as a binary bag-of-words vector, so the dot product
    is the number of shared tokens and each norm is the square root of the
    number of distinct tokens in that text.

    Args:
        a: First text.
        b: Second text.

    Returns:
        A similarity score; ``0.0`` when either text yields no tokens.
    """

    left, right = _tokenize(a), _tokenize(b)
    if not (left and right):
        return 0.0

    # For 0/1 vectors the dot product equals the size of the intersection.
    shared = len(left & right)
    magnitude = sqrt(len(left)) * sqrt(len(right))
    return shared / magnitude


@dataclass(slots=True)
class StrictSemanticMatchBot(BaseBot):
    """Return the answer from the most similar FAQ question without generation."""

    # Identifier of this bot instance.
    name: str
    # Location of the FAQ CSV passed to ``load_samples_from_csv``.
    faq_csv_path: str | Path

    def answer(self, question: str) -> BotResult:
        """Answer ``question`` with the expected answer of the closest FAQ entry.

        The FAQ CSV is loaded on every call and each sample's question is
        scored against ``question`` with ``_cosine_overlap``. The first
        sample with the strictly highest score wins; later ties do not
        replace it.

        Args:
            question: The user question to match against the FAQ.

        Returns:
            A ``BotResult`` whose ``answer`` is the matched sample's expected
            answer (``''`` when the FAQ has no samples) and whose metadata
            records the bot type, FAQ path, matched question and the
            similarity rounded to four decimals (``None`` for the latter two
            when nothing matched).
        """
        samples = load_samples_from_csv(self.faq_csv_path)

        best_sample: Sample | None = None
        # Start below any score ``_cosine_overlap`` returns so the first
        # sample is always accepted, even with zero overlap.
        best_score = -1.0
        for sample in samples:
            score = _cosine_overlap(question, sample.question)
            if score > best_score:
                best_score = score
                best_sample = sample

        answer = best_sample.expected_answer if best_sample else ''
        return BotResult(
            answer=answer,
            metadata={
                'bot_type': 'strict_semantic_match',
                'faq_csv_path': str(self.faq_csv_path),
                'matched_question': best_sample.question if best_sample else None,
                'similarity': round(best_score, 4) if best_sample else None,
            },
        )