feat(sprint8b): регрессия ответов веток · general_info + фикс PRAGMA foreign_keys

Параллель к 8a, но проверяем не код intent от роутера, а содержимое ответа конкретной ветки на одиночную реплику. Старт — general_info, 46 кейсов.

Логика pass/fail (для одного кейса):
- A — RAG-секция: среди retrieved-чанков есть кусок с section == expected_doc_section (точное совпадение). Если поле не задано — пропускаем.
- B — keywords: обязательные expected_keywords встречаются в predicted_answer (case-insensitive). По умолчанию все; поддерживаются keywords_min: N и keywords_any: true. Запрещённые expected_must_not — ни одного.
- Pass = A ∧ B. Незаданные поля не проверяются.
- Кэш: (text_hash, branch_config_id) → {answer_text, retrieved_sections}. Привязан к версии промпта ветки. Смена версии = пустой кэш = свежий прогон. Правка JSONL без изменения text → pass/fail пересчитывается без LLM.

Backend:
- Таблицы eval_branch_runs / eval_branch_run_cases / eval_branch_predictions. Миграция m9g1f7e89j56.
- services/eval_branch_run_service.py: загрузка JSONL, фоновый прогон через asyncio.create_task, кэш, оценка A+B с поддержкой keywords_min/keywords_any.
- chat_service.run_branch_single_turn — изолированный single-turn без роутера и треда (использует существующий config_service + vectorstore + llm).
- API: POST /eval/branch-runs, GET /eval/branch-runs?intent_code=, GET /eval/branch-runs/{id}, GET /eval/branch-cases-with-status?intent_code=.

UI (static/regression.html):
- Селектор режима «Роутер / Ветка · general_info». Логика пикера переиспользуется (фильтры, диапазон, массовый выбор, счётчик «новые / в кэше»).
- Для режима «Ветка»: фильтр по coverage, колонки секция/coverage, keywords, частота, кэш. Drill-down прогона: ожидание, retrieved-секции, причины fail, полный ответ ветки.

База кейсов (eval/branch_cases_general_info.jsonl) — от пользователя, 46 кейсов по схеме {text, intent, coverage, expected_doc_section?, expected_keywords?, expected_must_not?, keywords_min?, keywords_any?, count?, note?}.
Связанная правка SQLite (нашли при удалении документа в этом спринте):
- db/session.py: connect-listener PRAGMA foreign_keys=ON на каждое подключение. Без этого ondelete=CASCADE в SQLite не enforced, и удаление документа оставляло подписки в intent_documents висячими (что давало пустой RAG и fail регрессии).
- Миграция n0h2g8f9a0k67 — одноразовая чистка существующих висячих подписок.

docs/SPRINTS.md: Спринт 8b → ✅ Закрыт. Diff vs предыдущий прогон для веток и кнопка «Сбросить кэш регрессии» вынесены в docs/BACKLOG.md.

Также включены обновлённые data/datasets/general_info.md и price_question.md (рабочий материал оператора), и черновик eval/branch_cases_price_question.jsonl для следующего захода (8b на price_question).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-03 01:20:59 +05:00
parent a8f7e68795
commit bb5e3f5eb3
15 changed files with 1228 additions and 109 deletions
@@ -16,13 +16,17 @@ from sqlalchemy.ext.asyncio import AsyncSession
 from db.models import AgentConfig
 from db.session import get_session
 from models.responses import (
+    EvalBranchRunCaseInfo,
+    EvalBranchRunDetailResponse,
+    EvalBranchRunInfo,
+    EvalBranchRunListResponse,
    EvalRunCaseInfo,
    EvalRunDetailResponse,
    EvalRunDiffInfo,
    EvalRunInfo,
    EvalRunListResponse,
 )
-from services import eval_run_service
+from services import eval_branch_run_service, eval_run_service

 logger = logging.getLogger(__name__)

@@ -191,6 +195,138 @@ async def router_cases_with_status(session: AsyncSession = Depends(get_session))
    }


+# ---------- Branch runs (Спринт 8b) ----------
+
+class StartBranchRunRequest(BaseModel):
+    """Request body accepted by the POST /branch-runs endpoint."""
+
+    # Intent code of the branch to run; forwarded to the run service as-is.
+    intent_code: str
+    # Required list of case text hashes; min_length=1 rejects an empty list.
+    text_hashes: list[str] = Field(..., min_length=1)
+
+
+def _branch_run_to_info(run, version: int | None) -> EvalBranchRunInfo:
+    """Convert a branch-run record into its ``EvalBranchRunInfo`` API shape.
+
+    Args:
+        run: Record exposing the run attributes read below (``id``,
+            ``suite``, ``intent_code``, ``branch_config_id``, ``status``,
+            counters, ``error_text`` and the two timestamps).
+        version: Value reported as ``branch_config_version``; may be ``None``.
+
+    Returns:
+        The response model with timestamps rendered via ``isoformat()``;
+        ``finished_at`` is ``None`` when the record has no finish time.
+    """
+    payload = {
+        "id": run.id,
+        "suite": run.suite,
+        "intent_code": run.intent_code,
+        "branch_config_id": run.branch_config_id,
+        "branch_config_version": version,
+        "status": run.status,
+        "total": run.total,
+        "passed": run.passed,
+        "failed": run.failed,
+        "cache_hits": run.cache_hits,
+        "error_text": run.error_text,
+        "started_at": run.started_at.isoformat(),
+    }
+    finished = run.finished_at
+    # A run that has not finished yet carries no finish timestamp.
+    payload["finished_at"] = finished.isoformat() if finished else None
+    return EvalBranchRunInfo(**payload)
+
+
+def _branch_case_to_info(c) -> EvalBranchRunCaseInfo:
+    """Convert a stored run-case record into ``EvalBranchRunCaseInfo``.
+
+    The ``*_json`` attributes of the record hold JSON text; each one is
+    decoded here, and a missing/empty value is decoded as an empty list.
+
+    Args:
+        c: Record exposing the case attributes read below.
+
+    Returns:
+        The response model for a single case of a branch run.
+    """
+
+    def _as_list(raw):
+        # Empty or None columns fall back to the JSON text "[]".
+        return json.loads(raw or "[]")
+
+    keywords = _as_list(c.expected_keywords_json)
+    forbidden = _as_list(c.expected_must_not_json)
+    sections = _as_list(c.predicted_sections_json)
+    reasons = _as_list(c.fail_reasons_json)
+    return EvalBranchRunCaseInfo(
+        text=c.text,
+        coverage=c.coverage,
+        expected_doc_section=c.expected_doc_section,
+        expected_keywords=keywords,
+        expected_must_not=forbidden,
+        keywords_min=c.keywords_min,
+        predicted_answer=c.predicted_answer,
+        predicted_sections=sections,
+        is_pass=c.is_pass,
+        fail_reasons=reasons,
+        count_weight=c.count_weight,
+    )
+
+
+@router.get("/branch-cases-with-status")
+async def branch_cases_with_status(
+    intent_code: str, session: AsyncSession = Depends(get_session)
+):
+    """Return every JSONL case of a branch plus its cached status.
+
+    The cache is looked up for the branch config id resolved for
+    ``intent_code``. For a cached case the pass/fail verdict is recomputed
+    from the cached answer and retrieved sections against the current case
+    definition; uncached cases report ``None`` / an empty reasons list.
+
+    Args:
+        intent_code: Branch intent code used to load cases and resolve config.
+        session: Database session injected by FastAPI.
+
+    Returns:
+        A dict with the intent code, branch config id and version, the
+        number of cases and the per-case items.
+    """
+    svc = eval_branch_run_service
+    cases = svc.load_branch_cases(intent_code)
+    branch_config_id = await svc._resolve_active_branch_config_id(
+        session, intent_code,
+    )
+    version = await _config_version(session, branch_config_id)
+    cache = await svc.cached_predictions(session, branch_config_id)
+
+    rows = []
+    for position, case in enumerate(cases, start=1):
+        digest = svc._text_hash(case.text)
+        hit = cache.get(digest)
+        if hit is None:
+            # Nothing cached for this text: no verdict, no answer, no reasons.
+            verdict, answer, reasons = None, None, []
+        else:
+            answer = hit["answer_text"]
+            # Re-evaluate against the current case definition.
+            verdict, reasons = svc._evaluate_case(
+                case, answer, hit["retrieved_sections"],
+            )
+        row = {
+            "idx": position,
+            "text": case.text,
+            "text_hash": digest,
+            "intent_code": case.intent_code,
+            "coverage": case.coverage,
+            "expected_doc_section": case.expected_doc_section,
+            "expected_keywords": case.expected_keywords,
+            "expected_must_not": case.expected_must_not,
+            "keywords_min": case.keywords_min,
+            "keywords_any": case.keywords_any,
+            "count": case.count,
+            "note": case.note,
+            "cached_is_pass": verdict,
+            "cached_answer": answer,
+            "cached_fail_reasons": reasons,
+        }
+        rows.append(row)
+
+    response = {
+        "intent_code": intent_code,
+        "branch_config_id": branch_config_id,
+        "branch_config_version": version,
+        "total": len(rows),
+        "cases": rows,
+    }
+    return response
+
+
+@router.post("/branch-runs", response_model=EvalBranchRunInfo)
+async def start_branch_run(
+    req: StartBranchRunRequest, session: AsyncSession = Depends(get_session)
+):
+    """Start a branch regression run for the requested cases.
+
+    Args:
+        req: Intent code and the text hashes of the cases to run.
+        session: Database session injected by FastAPI.
+
+    Returns:
+        ``EvalBranchRunInfo`` describing the run returned by the service.
+
+    Raises:
+        HTTPException: 400 with the error message when the service raises
+            ``ValueError``.
+    """
+    try:
+        started = await eval_branch_run_service.start_branch_run(
+            session, req.intent_code, req.text_hashes,
+        )
+    except ValueError as exc:
+        # Service-side validation problems are reported as a client error.
+        raise HTTPException(status_code=400, detail=str(exc))
+    config_version = await _config_version(session, started.branch_config_id)
+    return _branch_run_to_info(started, config_version)
+
+
+@router.get("/branch-runs", response_model=EvalBranchRunListResponse)
+async def list_branch_runs(
+    intent_code: str | None = None, session: AsyncSession = Depends(get_session)
+):
+    """List branch regression runs, optionally filtered by intent code.
+
+    Args:
+        intent_code: Optional filter forwarded to the run service.
+        session: Database session injected by FastAPI.
+
+    Returns:
+        ``EvalBranchRunListResponse`` with the runs returned by the service
+        (requested with ``limit=50``) and their count.
+    """
+    runs = await eval_branch_run_service.list_runs(session, intent_code, limit=50)
+    # Runs may share a branch config; resolve each config's version once per
+    # request instead of awaiting one lookup per run.
+    versions: dict = {}
+    items = []
+    for r in runs:
+        config_id = r.branch_config_id
+        if config_id not in versions:
+            versions[config_id] = await _config_version(session, config_id)
+        items.append(_branch_run_to_info(r, versions[config_id]))
+    return EvalBranchRunListResponse(runs=items, total=len(items))
+
+
+@router.get("/branch-runs/{run_id}", response_model=EvalBranchRunDetailResponse)
+async def get_branch_run(run_id: int, session: AsyncSession = Depends(get_session)):
+    """Return one branch run together with its per-case results.
+
+    Args:
+        run_id: Identifier of the branch run to fetch.
+        session: Database session injected by FastAPI.
+
+    Returns:
+        ``EvalBranchRunDetailResponse`` with the run summary and its cases.
+
+    Raises:
+        HTTPException: 404 when the service finds no run with ``run_id``.
+    """
+    svc = eval_branch_run_service
+    found = await svc.get_run(session, run_id)
+    if found is None:
+        raise HTTPException(status_code=404, detail="Branch run not found")
+    config_version = await _config_version(session, found.branch_config_id)
+    case_rows = await svc.list_run_cases(session, run_id)
+    run_info = _branch_run_to_info(found, config_version)
+    case_infos = list(map(_branch_case_to_info, case_rows))
+    return EvalBranchRunDetailResponse(run=run_info, cases=case_infos)
+
+
@router.get("/runs", response_model=EvalRunListResponse)
 async def list_runs(session: AsyncSession = Depends(get_session)):
    runs = await eval_run_service.list_runs(session, limit=50)