bb5e3f5eb3
Параллель к 8a, но проверяем не код intent от роутера, а содержимое ответа
конкретной ветки на одиночную реплику. Старт — general_info, 46 кейсов.
Логика pass/fail (для одного кейса):
- A — RAG-секция: среди retrieved-чанков есть кусок с
section == expected_doc_section (точное совпадение). Если поле не задано —
пропускаем.
- B — keywords: обязательные expected_keywords встречаются в predicted_answer
(case-insensitive). По умолчанию все; поддерживаются keywords_min: N
и keywords_any: true. Запрещённые expected_must_not — ни одного.
- Pass = A ∧ B. Незаданные поля не проверяются.
- Кэш: (text_hash, branch_config_id) → {answer_text, retrieved_sections}.
Привязан к версии промпта ветки. Смена версии = пустой кэш = свежий прогон.
Правка JSONL без изменения text → pass/fail пересчитывается без LLM.
Backend:
- Таблицы eval_branch_runs / eval_branch_run_cases / eval_branch_predictions.
Миграция m9g1f7e89j56.
- services/eval_branch_run_service.py: загрузка JSONL, фоновый прогон через
asyncio.create_task, кэш, оценка A+B с поддержкой keywords_min/keywords_any.
- chat_service.run_branch_single_turn — изолированный single-turn без
роутера и треда (использует существующий config_service + vectorstore + llm).
- API: POST /eval/branch-runs, GET /eval/branch-runs?intent_code=,
GET /eval/branch-runs/{id}, GET /eval/branch-cases-with-status?intent_code=.
UI (static/regression.html):
- Селектор режима «Роутер / Ветка · general_info». Логика пикера переиспользуется
(фильтры, диапазон, массовый выбор, счётчик «новые / в кэше»).
- Для режима «Ветка»: фильтр по coverage, колонки секция/coverage, keywords,
частота, кэш. Drill-down прогона: ожидание, retrieved-секции, причины fail,
полный ответ ветки.
База кейсов (eval/branch_cases_general_info.jsonl) — от пользователя, 46 кейсов
по схеме {text, intent, coverage, expected_doc_section?, expected_keywords?,
expected_must_not?, keywords_min?, keywords_any?, count?, note?}.
Связанная правка SQLite (нашли при удалении документа в этом спринте):
- db/session.py: connect-listener, выполняющий PRAGMA foreign_keys=ON на каждое подключение.
Без этого SQLite не применяет ondelete=CASCADE, и удаление документа
оставляло в intent_documents висячие подписки (что давало пустой RAG
и fail регрессии).
- Миграция n0h2g8f9a0k67 — одноразовая чистка существующих висячих подписок.
docs/SPRINTS.md: Спринт 8b → ✅ Закрыт. Diff vs предыдущий прогон для веток
и кнопка «Сбросить кэш регрессии» вынесены в docs/BACKLOG.md.
Также включены обновлённые data/datasets/general_info.md и price_question.md
(рабочий материал оператора), и черновик eval/branch_cases_price_question.jsonl
для следующего захода (8b на price_question).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
357 lines
13 KiB
Python
357 lines
13 KiB
Python
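"""Endpoints for eval case sets and regression runs (Sprints 7.5 + 8a + 8b).

- `/eval/router-cases` — list of classifier cases for the Settings UI (ready-made
  patient phrases from the real corpus).
- `/eval/runs` — router regression runs (Sprint 8a). POST starts a background
  run, GET returns history and details.
- `/eval/router-cases-with-status` — router cases together with the cached
  prediction for the active router config.
- `/eval/branch-runs` — branch regression runs (Sprint 8b). POST starts a run,
  GET returns history and details.
- `/eval/branch-cases-with-status` — branch cases together with the cached
  answer and pass/fail verdict for the active branch config.
"""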
|
||
import json
|
||
import logging
|
||
from pathlib import Path
|
||
|
||
from fastapi import APIRouter, Depends, HTTPException
|
||
from pydantic import BaseModel, Field
|
||
from sqlalchemy.ext.asyncio import AsyncSession
|
||
|
||
from db.models import AgentConfig
|
||
from db.session import get_session
|
||
from models.responses import (
|
||
EvalBranchRunCaseInfo,
|
||
EvalBranchRunDetailResponse,
|
||
EvalBranchRunInfo,
|
||
EvalBranchRunListResponse,
|
||
EvalRunCaseInfo,
|
||
EvalRunDetailResponse,
|
||
EvalRunDiffInfo,
|
||
EvalRunInfo,
|
||
EvalRunListResponse,
|
||
)
|
||
from services import eval_branch_run_service, eval_run_service
|
||
|
||
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Every route declared in this module is served under the /eval prefix.
router = APIRouter(tags=["eval"], prefix="/eval")

# The "eval" directory lives next to the directory that contains this module.
EVAL_DIR = Path(__file__).resolve().parents[1] / "eval"
ROUTER_CASE_FILES = [
    EVAL_DIR / file_name
    for file_name in ("router_cases_booking.jsonl", "router_cases_other.jsonl")
]


# Cache of loaded cases: filled once on the first request. The JSONL files do
# not change at runtime (they are part of the repo), so no hot reload is needed.
_router_cases_cache: list[dict] | None = None
|
||
|
||
|
||
def _load_router_cases() -> list[dict]:
    """Load router eval cases from ``ROUTER_CASE_FILES`` and memoize them.

    The first call reads every configured JSONL file; later calls return the
    same cached list object. Problems are logged and skipped rather than
    raised, so one bad line never breaks the endpoint:

    - a configured file that does not exist;
    - a line that is not valid JSON;
    - a line that is valid JSON but not an object (callers use ``dict.get``
      on every case, so a list/string/number would crash them later).

    Returns:
        The list of case dicts, in file order and then line order.
    """
    global _router_cases_cache
    if _router_cases_cache is not None:
        return _router_cases_cache

    all_cases: list[dict] = []
    files_read = 0  # only files that were actually opened, for an honest log line
    for path in ROUTER_CASE_FILES:
        if not path.is_file():
            logger.warning("Router case file not found: %s", path)
            continue
        files_read += 1
        with path.open(encoding="utf-8") as f:
            for line_no, raw in enumerate(f, 1):
                line = raw.strip()
                if not line:
                    continue
                try:
                    data = json.loads(line)
                except json.JSONDecodeError as e:
                    logger.warning("%s:%d JSON decode error: %s", path.name, line_no, e)
                    continue
                if not isinstance(data, dict):
                    logger.warning(
                        "%s:%d expected a JSON object, got %s",
                        path.name, line_no, type(data).__name__,
                    )
                    continue
                all_cases.append(data)

    _router_cases_cache = all_cases
    logger.info("Loaded %d router eval cases from %d file(s)", len(all_cases), files_read)
    return all_cases
|
||
|
||
|
||
@router.get("/router-cases")
def list_router_cases(intent_code: str | None = None, limit: int = 500):
    """List router cases, optionally filtered by ``expected_intent``.

    Cases are sorted by ``count`` descending (most frequent phrases first).
    ``limit`` is clamped to the range 1..5000 so the UI is not flooded with
    options.

    Args:
        intent_code: When truthy, keep only cases whose ``expected_intent``
            equals this value.
        limit: Maximum number of cases to return (clamped to 1..5000).

    Returns:
        A dict with ``intent_code``, ``total`` (number of returned items,
        i.e. after the limit is applied) and ``cases``.
    """

    def _count(case: dict) -> int:
        # One normalization for both sorting and output: a missing, null or
        # empty count is treated as 0. Sorting on the raw value would raise
        # TypeError as soon as one case carries "count": null.
        return int(case.get("count", 0) or 0)

    cases = _load_router_cases()

    if intent_code:
        filtered = [c for c in cases if c.get("expected_intent") == intent_code]
    else:
        # Copy so that sorting never reorders the cached list in place.
        filtered = list(cases)

    filtered.sort(key=_count, reverse=True)
    filtered = filtered[:max(1, min(limit, 5000))]

    items = [
        {
            "text": c.get("text", ""),
            "expected_intent": c.get("expected_intent", ""),
            "count": _count(c),
            "note": c.get("note") or None,
        }
        for c in filtered
    ]

    return {
        "intent_code": intent_code,
        "total": len(items),
        "cases": items,
    }
|
||
|
||
|
||
# ---------- Прогоны регрессии (Спринт 8a) ----------
|
||
|
||
class StartRouterRunRequest(BaseModel):
    """Request body accepted by ``POST /eval/runs``."""

    # Defaults to "router"; the endpoint rejects any other value.
    suite: str = Field(
        default="router",
        description="Сейчас поддерживается только 'router'",
    )
    # Required, at least one element.
    text_hashes: list[str] = Field(
        default=...,
        min_length=1,
        description="sha256(text) выбранных кейсов",
    )
|
||
|
||
|
||
def _run_to_info(run, router_config_version: int | None) -> EvalRunInfo:
    """Build the ``EvalRunInfo`` response model for a router run record.

    Args:
        run: Run record exposing the attributes copied below.
        router_config_version: Version of the router config the run used,
            or ``None`` when it could not be resolved.

    Returns:
        ``EvalRunInfo`` with timestamps rendered as ISO-8601 strings.
    """
    # finished_at stays None while it is unset on the record.
    finished_at = None
    if run.finished_at:
        finished_at = run.finished_at.isoformat()

    # Attributes that are passed through to the response unchanged.
    copied = (
        "id",
        "suite",
        "router_config_id",
        "min_count",
        "status",
        "total",
        "passed",
        "failed",
        "cache_hits",
        "error_text",
    )
    payload = {name: getattr(run, name) for name in copied}
    return EvalRunInfo(
        **payload,
        router_config_version=router_config_version,
        started_at=run.started_at.isoformat(),
        finished_at=finished_at,
    )
|
||
|
||
|
||
def _case_to_info(c) -> EvalRunCaseInfo:
    """Build the ``EvalRunCaseInfo`` response model for one router run case.

    All five fields are copied from the record unchanged.
    """
    copied = ("text", "expected_intent", "predicted_intent", "count_weight", "is_pass")
    return EvalRunCaseInfo(**{name: getattr(c, name) for name in copied})
|
||
|
||
|
||
async def _config_version(session: AsyncSession, config_id: int | None) -> int | None:
    """Return the ``version`` of the ``AgentConfig`` with the given id.

    Returns ``None`` when ``config_id`` is ``None`` or when no such config
    is found.
    """
    if config_id is not None:
        config = await session.get(AgentConfig, config_id)
        if config:
            return config.version
    return None
|
||
|
||
|
||
@router.post("/runs", response_model=EvalRunInfo)
async def start_run(req: StartRouterRunRequest, session: AsyncSession = Depends(get_session)):
    """Start a router regression run for the selected cases.

    Args:
        req: Suite name (only ``"router"`` is accepted) and the text hashes
            of the cases to run.
        session: Database session injected by FastAPI.

    Returns:
        ``EvalRunInfo`` describing the run that was created.

    Raises:
        HTTPException: 400 when the suite is not ``"router"`` or when the
            service rejects the request with ``ValueError``.
    """
    if req.suite != "router":
        raise HTTPException(status_code=400, detail="Only suite='router' is supported in 8a")
    try:
        run = await eval_run_service.start_router_run(session, req.text_hashes)
    except ValueError as e:
        # Chain explicitly so the service error is recorded as the cause.
        raise HTTPException(status_code=400, detail=str(e)) from e
    version = await _config_version(session, run.router_config_id)
    return _run_to_info(run, version)
|
||
|
||
|
||
@router.get("/router-cases-with-status")
async def router_cases_with_status(session: AsyncSession = Depends(get_session)):
    """Return all router cases plus cached predictions for the active router config.

    Each case carries the cached predicted intent for the active config (if
    any) and a pass/fail flag obtained by comparing it with the expected
    intent. The UI builds its selection table from this payload.

    NOTE(review): the order is whatever ``load_all_router_cases()`` returns —
    presumably count-descending; confirm in the service.
    """
    all_cases = eval_run_service.load_all_router_cases()
    active_config_id = await eval_run_service._resolve_active_router_config_id(session)
    active_version = await _config_version(session, active_config_id)
    predictions = await eval_run_service.cached_predictions(session, active_config_id)

    rows = []
    for position, case in enumerate(all_cases, start=1):
        digest = eval_run_service._text_hash(case.text)
        predicted = predictions.get(digest)
        # No cached prediction -> verdict unknown (None), not a failure.
        if predicted is None:
            verdict = None
        else:
            verdict = predicted == case.expected_intent
        rows.append(
            {
                "idx": position,
                "text": case.text,
                "text_hash": digest,
                "expected_intent": case.expected_intent,
                "count": case.count,
                "cached_predicted": predicted,
                "cached_is_pass": verdict,
            }
        )

    return {
        "router_config_id": active_config_id,
        "router_config_version": active_version,
        "total": len(rows),
        "cases": rows,
    }
|
||
|
||
|
||
# ---------- Branch runs (Спринт 8b) ----------
|
||
|
||
class StartBranchRunRequest(BaseModel):
    """Request body accepted by ``POST /eval/branch-runs``."""

    # Intent code of the branch to evaluate.
    intent_code: str
    # Required, at least one element.
    text_hashes: list[str] = Field(default=..., min_length=1)
|
||
|
||
|
||
def _branch_run_to_info(run, version: int | None) -> EvalBranchRunInfo:
    """Build the ``EvalBranchRunInfo`` response model for a branch run record.

    Args:
        run: Branch run record exposing the attributes copied below.
        version: Version of the branch config the run used, or ``None``
            when it could not be resolved.

    Returns:
        ``EvalBranchRunInfo`` with timestamps rendered as ISO-8601 strings.
    """
    # finished_at stays None while it is unset on the record.
    finished_at = None
    if run.finished_at:
        finished_at = run.finished_at.isoformat()

    # Attributes that are passed through to the response unchanged.
    copied = (
        "id",
        "suite",
        "intent_code",
        "branch_config_id",
        "status",
        "total",
        "passed",
        "failed",
        "cache_hits",
        "error_text",
    )
    payload = {name: getattr(run, name) for name in copied}
    return EvalBranchRunInfo(
        **payload,
        branch_config_version=version,
        started_at=run.started_at.isoformat(),
        finished_at=finished_at,
    )
|
||
|
||
|
||
def _branch_case_to_info(c) -> EvalBranchRunCaseInfo:
    """Build the ``EvalBranchRunCaseInfo`` response model for one branch run case.

    The ``*_json`` attributes of the record hold JSON text and are decoded
    here; every other field is copied unchanged.
    """

    def _decode(raw):
        # A falsy stored value (None / empty string) is decoded as "[]".
        return json.loads(raw or "[]")

    keywords = _decode(c.expected_keywords_json)
    must_not = _decode(c.expected_must_not_json)
    sections = _decode(c.predicted_sections_json)
    reasons = _decode(c.fail_reasons_json)

    return EvalBranchRunCaseInfo(
        text=c.text,
        coverage=c.coverage,
        expected_doc_section=c.expected_doc_section,
        expected_keywords=keywords,
        expected_must_not=must_not,
        keywords_min=c.keywords_min,
        predicted_answer=c.predicted_answer,
        predicted_sections=sections,
        is_pass=c.is_pass,
        fail_reasons=reasons,
        count_weight=c.count_weight,
    )
|
||
|
||
|
||
@router.get("/branch-cases-with-status")
async def branch_cases_with_status(
    intent_code: str, session: AsyncSession = Depends(get_session)
):
    """Return all JSONL cases of a branch plus the cache for its active config.

    For every case with a cached prediction, the verdict is recomputed from
    the cached answer and retrieved sections using the current case
    definition, so the cached pass/fail reflects the case file as it is now.

    Args:
        intent_code: Intent code of the branch whose cases are listed.
        session: Database session injected by FastAPI.
    """
    branch_cases = eval_branch_run_service.load_branch_cases(intent_code)
    active_config_id = await eval_branch_run_service._resolve_active_branch_config_id(
        session, intent_code,
    )
    active_version = await _config_version(session, active_config_id)
    predictions = await eval_branch_run_service.cached_predictions(session, active_config_id)

    rows = []
    for position, case in enumerate(branch_cases, start=1):
        digest = eval_branch_run_service._text_hash(case.text)
        hit = predictions.get(digest)
        if hit is None:
            # Nothing cached: verdict and answer unknown, no fail reasons.
            verdict, answer, reasons = None, None, []
        else:
            verdict, reasons = eval_branch_run_service._evaluate_case(
                case, hit["answer_text"], hit["retrieved_sections"],
            )
            answer = hit["answer_text"]
        rows.append(
            {
                "idx": position,
                "text": case.text,
                "text_hash": digest,
                "intent_code": case.intent_code,
                "coverage": case.coverage,
                "expected_doc_section": case.expected_doc_section,
                "expected_keywords": case.expected_keywords,
                "expected_must_not": case.expected_must_not,
                "keywords_min": case.keywords_min,
                "keywords_any": case.keywords_any,
                "count": case.count,
                "note": case.note,
                "cached_is_pass": verdict,
                "cached_answer": answer,
                "cached_fail_reasons": reasons,
            }
        )

    return {
        "intent_code": intent_code,
        "branch_config_id": active_config_id,
        "branch_config_version": active_version,
        "total": len(rows),
        "cases": rows,
    }
|
||
|
||
|
||
@router.post("/branch-runs", response_model=EvalBranchRunInfo)
async def start_branch_run(
    req: StartBranchRunRequest, session: AsyncSession = Depends(get_session)
):
    """Start a branch regression run for the selected cases.

    Args:
        req: Intent code of the branch and the text hashes of the cases to run.
        session: Database session injected by FastAPI.

    Returns:
        ``EvalBranchRunInfo`` describing the run that was created.

    Raises:
        HTTPException: 400 when the service rejects the request with
            ``ValueError``.
    """
    try:
        run = await eval_branch_run_service.start_branch_run(
            session, req.intent_code, req.text_hashes,
        )
    except ValueError as e:
        # Chain explicitly so the service error is recorded as the cause.
        raise HTTPException(status_code=400, detail=str(e)) from e
    version = await _config_version(session, run.branch_config_id)
    return _branch_run_to_info(run, version)
|
||
|
||
|
||
@router.get("/branch-runs", response_model=EvalBranchRunListResponse)
async def list_branch_runs(
    intent_code: str | None = None, session: AsyncSession = Depends(get_session)
):
    """Return up to 50 branch runs, optionally restricted to one intent code.

    ``total`` is the number of runs in this response.
    """
    found = await eval_branch_run_service.list_runs(session, intent_code, limit=50)
    # Config versions are resolved one run at a time, in list order.
    infos = [
        _branch_run_to_info(item, await _config_version(session, item.branch_config_id))
        for item in found
    ]
    return EvalBranchRunListResponse(runs=infos, total=len(infos))
|
||
|
||
|
||
@router.get("/branch-runs/{run_id}", response_model=EvalBranchRunDetailResponse)
async def get_branch_run(run_id: int, session: AsyncSession = Depends(get_session)):
    """Return one branch run together with all of its cases.

    Raises:
        HTTPException: 404 when no run with ``run_id`` exists.
    """
    branch_run = await eval_branch_run_service.get_run(session, run_id)
    if branch_run is None:
        raise HTTPException(status_code=404, detail="Branch run not found")

    config_version = await _config_version(session, branch_run.branch_config_id)
    run_cases = await eval_branch_run_service.list_run_cases(session, run_id)
    case_infos = [_branch_case_to_info(item) for item in run_cases]
    return EvalBranchRunDetailResponse(
        run=_branch_run_to_info(branch_run, config_version),
        cases=case_infos,
    )
|
||
|
||
|
||
@router.get("/runs", response_model=EvalRunListResponse)
async def list_runs(session: AsyncSession = Depends(get_session)):
    """Return up to 50 router runs.

    ``total`` is the number of runs in this response.
    """
    found = await eval_run_service.list_runs(session, limit=50)
    # Config versions are resolved one run at a time, in list order.
    infos = [
        _run_to_info(item, await _config_version(session, item.router_config_id))
        for item in found
    ]
    return EvalRunListResponse(runs=infos, total=len(infos))
|
||
|
||
|
||
@router.get("/runs/{run_id}", response_model=EvalRunDetailResponse)
async def get_run(run_id: int, session: AsyncSession = Depends(get_session)):
    """Return one router run with its cases and the diff against the previous run.

    Raises:
        HTTPException: 404 when no run with ``run_id`` exists.
    """
    router_run = await eval_run_service.get_run(session, run_id)
    if router_run is None:
        raise HTTPException(status_code=404, detail="Run not found")

    config_version = await _config_version(session, router_run.router_config_id)
    run_cases = await eval_run_service.list_run_cases(session, run_id)
    delta = await eval_run_service.compute_diff_vs_previous(session, router_run)

    # Newly failing / newly passing cases as reported by the service.
    diff_info = EvalRunDiffInfo(
        prev_run_id=delta.prev_run_id,
        new_fails=[_case_to_info(item) for item in delta.new_fails],
        new_passes=[_case_to_info(item) for item in delta.new_passes],
    )
    return EvalRunDetailResponse(
        run=_run_to_info(router_run, config_version),
        cases=[_case_to_info(item) for item in run_cases],
        diff=diff_info,
    )
|