feat(sprint8.5+8.6): чанкер v2 (иерархия H1/H2/H3) + регрессия 4 веток в UI

Sprint 8.5 — чанкер v2 (services/document_processor.py):
- markdown-it-py для md-входа: каждый H2 открывает свою секцию, H3 идёт в тело
- множественные H1 — штатный кейс (new_booking.md = 8 H1, шаги воронки + группы); H1 без H2 → секция heading=H1; преамбула H1 (тело до первого H2) игнорируется
- YAML frontmatter (--- ... ---) отрезается, в индекс не попадает
- breadcrumb «## {H2}» как первая строка каждого subchunk'а
- merge коротких хвостов и sentence-overlap — только внутри одной H2-секции
- excluded_section_headings в config.py
- 17 unit-тестов на stdlib unittest (tests/test_document_processor_v2.py), включая smoke по реальным general_info.md (тимпанометрия → правильная секция) и new_booking.md (защита от регрессии множественных H1)
- ТЗ: docs/CHUNKER_v2_TZ.md

Sprint 8.6 — регрессия остальных 4 веток (static/regression.html):
- 4 опции в селекторе режима: branch:price_question (40 кейсов), branch:medical_question (29), branch:escalate_human (14), branch:reschedule (16)
- бэкенд из 8b уже параметрический — правок в сервисе не потребовалось
- new_booking вне скоупа — state-machine, под него отдельный 8c (multi-turn)

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-04 09:59:12 +05:00
parent bb5e3f5eb3
commit 4aac59313d
7 changed files with 692 additions and 58 deletions
@@ -2,11 +2,13 @@ import io
 import logging
 import re
 import uuid
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from pathlib import Path

 import fitz  # pymupdf
+import yaml
 from docx import Document as DocxDocument
+from markdown_it import MarkdownIt

 from config import settings
 from services.text_cleanup import clean_markdown_text
@@ -32,6 +34,13 @@ class Chunk:
    chunk_index: int = 0


+@dataclass
+class ParsedMarkdown:
+    """Результат парсинга md: frontmatter (если был) + список секций (H2-секции; H1 без H2 — одной секцией)."""
+    frontmatter: dict = field(default_factory=dict)
+    sections: list[ParsedSection] = field(default_factory=list)
+
+
 # --- Parsers ---


@@ -194,6 +203,123 @@ def parse_text(file_bytes: bytes, is_markdown: bool = False) -> list[ParsedSecti
    return sections


+# --- Markdown parser v2 (иерархия H1/H2/H3, frontmatter, множественные H1) ---
+
+
+def _split_frontmatter(text: str) -> tuple[dict, str]:
+    """Если файл начинается со строки `---`, отрезает YAML-frontmatter и парсит его.
+
+    Возвращает (frontmatter_dict, body_text). Не найден или YAML не парсится → ({}, исходный text);
+    YAML валиден, но не словарь → ({}, body без блока `--- ... ---`).
+    """
+    if not text.startswith("---"):
+        return {}, text
+    lines = text.split("\n")
+    if not lines or lines[0].strip() != "---":
+        return {}, text
+    end = -1
+    for i in range(1, len(lines)):
+        if lines[i].strip() == "---":
+            end = i
+            break
+    if end == -1:
+        return {}, text
+    fm_text = "\n".join(lines[1:end])
+    body = "\n".join(lines[end + 1:]).lstrip("\n")
+    try:
+        fm = yaml.safe_load(fm_text) or {}
+    except yaml.YAMLError as exc:
+        logger.warning("Failed to parse YAML frontmatter: %s", exc)
+        return {}, text
+    if not isinstance(fm, dict):
+        return {}, body
+    return fm, body
+
+
+def parse_markdown(text: str, source_label: str = "") -> ParsedMarkdown:
+    """Парсер markdown с иерархией H1/H2/H3.
+
+    Правила (см. docs/CHUNKER_v2_TZ.md):
+    - YAML frontmatter (`--- ... ---` в самом начале) — отрезается, в текст не идёт.
+    - Каждый H1 — это «корневая секция» документа. Множественные H1 поддерживаются
+      штатно (например, `new_booking.md` имеет 8 H1 — шаги воронки).
+    - Внутри H1 каждый H2 открывает свою секцию (heading H2 → ParsedSection.heading).
+      Преамбула H1 (тело между H1 и его первым H2) игнорируется — обычно это служебка
+      или вступление, дублирующее заголовок.
+    - Если внутри H1 нет ни одного H2 — H1 сам становится одной секцией с heading H1
+      и body = всё его содержимое.
+    - H3 и ниже не открывают секций — идут в тело текущей H2 как есть (`### {текст}`).
+    """
+    frontmatter, body_text = _split_frontmatter(text)
+    _ = source_label  # пока используется только потенциально для будущих логов
+
+    md = MarkdownIt("commonmark")
+    tokens = md.parse(body_text)
+    lines = body_text.split("\n")
+    total_lines = len(lines)
+
+    # (level, start_line, body_start_line, heading_text)
+    headings: list[tuple[int, int, int, str]] = []
+    i = 0
+    while i < len(tokens):
+        t = tokens[i]
+        if t.type == "heading_open" and t.map is not None:
+            level = int(t.tag[1])
+            inline = tokens[i + 1] if i + 1 < len(tokens) else None
+            heading_text = inline.content.strip() if inline is not None else ""
+            headings.append((level, t.map[0], t.map[1], heading_text))
+        i += 1
+
+    h1_positions = [idx for idx, h in enumerate(headings) if h[0] == 1]
+    sections: list[ParsedSection] = []
+
+    # Файл без H1 — старый кейс (только H2). Обрабатываем как «один виртуальный H1»
+    # с диапазоном на весь файл, чтобы не ветвиться.
+    if not h1_positions:
+        h1_groups = [(0, total_lines, "", -1)]  # (h1_body_start, h1_end_line, h1_heading, h1_idx)
+    else:
+        h1_groups = []
+        for pos_idx, h_idx in enumerate(h1_positions):
+            _, _, h1_body_start, h1_text = headings[h_idx]
+            if pos_idx + 1 < len(h1_positions):
+                h1_end = headings[h1_positions[pos_idx + 1]][1]
+            else:
+                h1_end = total_lines
+            h1_groups.append((h1_body_start, h1_end, h1_text, h_idx))
+
+    for h1_body_start, h1_end, h1_heading, h1_idx in h1_groups:
+        # H2-индексы внутри текущего H1-диапазона.
+        h2_indices_in_group = [
+            idx for idx, h in enumerate(headings)
+            if h[0] == 2 and h1_body_start <= h[1] < h1_end
+        ]
+
+        if h2_indices_in_group:
+            for pos, idx in enumerate(h2_indices_in_group):
+                _, _, body_start, heading_text = headings[idx]
+                if pos + 1 < len(h2_indices_in_group):
+                    body_end = headings[h2_indices_in_group[pos + 1]][1]
+                else:
+                    body_end = h1_end
+                body = "\n".join(lines[body_start:body_end]).strip()
+                sections.append(ParsedSection(
+                    heading=heading_text,
+                    heading_level=2,
+                    body=body,
+                ))
+        else:
+            # H1 без H2 — body H1 идёт одной секцией с heading=H1.
+            body = "\n".join(lines[h1_body_start:h1_end]).strip()
+            if body or h1_heading:
+                sections.append(ParsedSection(
+                    heading=h1_heading,
+                    heading_level=1 if h1_heading else 2,
+                    body=body,
+                ))
+
+    return ParsedMarkdown(frontmatter=frontmatter, sections=sections)
+
+
 # --- Chunker ---


@@ -208,70 +334,89 @@ def chunk_sections(
    min_chunk_size: int | None = None,
    overlap_sentences: int | None = None,
 ) -> list[Chunk]:
-    """Чанкинг wiki-секций.
+    """Чанкинг секций с инвариантом «один чанк ⊆ одна H2-секция».

-    - Малые секции (FAQ-ответы) держим целиком — один чанк = одна тема.
-    - Большие секции (регламенты) режем по абзацам, с overlap последних N предложений.
-    - Мелкие соседние секции склеиваем, чтобы не плодить огрызки.
+    Ключевые правила (см. docs/CHUNKER_v2_TZ.md):
+    - Внутри секции разрезаем тело по абзацам (`\\n\\n`).
+    - В каждом subchunk-е первая строка — breadcrumb `## {heading H2}`.
+    - Merge коротких хвостов и sentence-overlap работают только внутри одной секции.
+    - Секции с heading из `excluded_section_headings` пропускаются.
+    - Секции с пустым heading (PDF/DOCX без заголовка) индексируются без breadcrumb,
+      чтобы не терять контент при reindex наследия. Для md-входа таких не бывает.
    """
    max_size = max_chunk_size or settings.max_chunk_size
    min_size = min_chunk_size or settings.min_chunk_size
    overlap = overlap_sentences or settings.overlap_sentences
+    excluded = set(settings.excluded_section_headings or [])

-    raw_chunks: list[Chunk] = []
+    final: list[Chunk] = []

    for section in sections:
-        heading_prefix = f"{section.heading}\n\n" if section.heading else ""
-        full_text = heading_prefix + section.body
+        if section.heading and section.heading in excluded:
+            continue
+        body = section.body.strip()
+        if not body and not section.heading:
+            continue
+
+        breadcrumb = f"## {section.heading}" if section.heading else ""
+        if breadcrumb:
+            full_text = f"{breadcrumb}\n\n{body}" if body else breadcrumb
+        else:
+            full_text = body

        if len(full_text) <= max_size:
-            raw_chunks.append(Chunk(
-                text=full_text.strip(),
+            section_chunks = [full_text]
+        else:
+            paragraphs = [p.strip() for p in re.split(r"\n{2,}", body) if p.strip()]
+            section_chunks = []
+            current = breadcrumb
+            for para in paragraphs:
+                # Стоимость склейки: текущий + "\n\n" + para.
+                projected = (current + "\n\n" + para) if current else para
+                if len(projected) > max_size and current and current != breadcrumb:
+                    section_chunks.append(current)
+                    current = f"{breadcrumb}\n\n{para}" if breadcrumb else para
+                else:
+                    current = projected
+            if current and current != breadcrumb:
+                section_chunks.append(current)
+
+        # Merge коротких хвостов — только внутри одной секции.
+        merged: list[str] = []
+        for ch in section_chunks:
+            if merged and len(merged[-1]) < min_size:
+                merged[-1] = merged[-1] + "\n\n" + ch
+            else:
+                merged.append(ch)
+
+        # Sentence-overlap — только между subchunk'ами одной секции.
+        if overlap > 0 and len(merged) > 1:
+            with_overlap = [merged[0]]
+            for i in range(1, len(merged)):
+                prev_sentences = _split_sentences(merged[i - 1])
+                overlap_text = " ".join(prev_sentences[-overlap:])
+                if not overlap_text or overlap_text in merged[i]:
+                    with_overlap.append(merged[i])
+                    continue
+                cur = merged[i]
+                # Вставляем overlap после breadcrumb-строки, чтобы заголовок остался первой строкой.
+                if breadcrumb and cur.startswith(breadcrumb + "\n\n"):
+                    rest = cur[len(breadcrumb) + 2:]
+                    new_text = f"{breadcrumb}\n\n{overlap_text}\n\n{rest}"
+                else:
+                    new_text = f"{overlap_text}\n\n{cur}"
+                with_overlap.append(new_text)
+            merged = with_overlap
+
+        for ch_text in merged:
+            final.append(Chunk(
+                text=ch_text.strip(),
                section=section.heading,
                page_number=section.page_number,
            ))
-        else:
-            paragraphs = section.body.split("\n")
-            current_text = heading_prefix
-            for para in paragraphs:
-                if len(current_text) + len(para) + 1 > max_size and len(current_text) > len(heading_prefix):
-                    raw_chunks.append(Chunk(
-                        text=current_text.strip(),
-                        section=section.heading,
-                        page_number=section.page_number,
-                    ))
-                    current_text = heading_prefix + para + "\n"
-                else:
-                    current_text += para + "\n"
-            if current_text.strip() and current_text.strip() != heading_prefix.strip():
-                raw_chunks.append(Chunk(
-                    text=current_text.strip(),
-                    section=section.heading,
-                    page_number=section.page_number,
-                ))

-    merged: list[Chunk] = []
-    for chunk in raw_chunks:
-        if merged and len(merged[-1].text) < min_size:
-            merged[-1].text += "\n\n" + chunk.text
-            if not merged[-1].section:
-                merged[-1].section = chunk.section
-        else:
-            merged.append(Chunk(
-                text=chunk.text,
-                section=chunk.section,
-                page_number=chunk.page_number,
-            ))
-
-    final: list[Chunk] = []
-    for i, chunk in enumerate(merged):
-        if i > 0 and overlap > 0:
-            prev_sentences = _split_sentences(merged[i - 1].text)
-            overlap_text = " ".join(prev_sentences[-overlap:])
-            if overlap_text and overlap_text not in chunk.text:
-                chunk.text = overlap_text + "\n\n" + chunk.text
-        chunk.chunk_index = i
-        final.append(chunk)
+    for i, c in enumerate(final):
+        c.chunk_index = i

    return final

@@ -280,12 +425,16 @@ def chunk_sections(


 def _sections_to_markdown(sections: list[ParsedSection]) -> str:
-    """Собрать секции в markdown-подобный текст — используется как raw_text для PDF/DOCX,
-    чтобы при переиндексации можно было снова пропустить через parse_text."""
+    """Собрать секции в markdown-подобный текст для повторной нарезки.
+
+    Все секции пишем как H2 — это нормализует выгрузки PDF/DOCX, где `heading_level`
+    может быть 1 или 2. Иначе reindex через `parse_markdown` потерял бы контент:
+    тело H1, за которым внутри идёт H2, считается преамбулой и игнорируется.
+    """
    parts = []
    for s in sections:
        if s.heading:
-            parts.append(f"{'#' * max(1, s.heading_level)} {s.heading}")
+            parts.append(f"## {s.heading}")
        if s.body:
            parts.append(s.body)
    return "\n\n".join(parts).strip()
@@ -310,8 +459,9 @@ def process_document(
        raw_text = _sections_to_markdown(sections)
    elif ext == ".md":
        raw_text = file_bytes.decode("utf-8", errors="replace")
-        cleaned = clean_markdown_text(raw_text)
-        sections = parse_text(cleaned.encode("utf-8"), is_markdown=True)
+        _, body_text = _split_frontmatter(raw_text)
+        cleaned = clean_markdown_text(body_text)
+        sections = parse_markdown(cleaned, source_label=filename).sections
    elif ext == ".txt":
        raw_text = file_bytes.decode("utf-8", errors="replace")
        sections = parse_text(raw_text.encode("utf-8"), is_markdown=False)
@@ -339,8 +489,9 @@ def process_document(

 def rechunk_raw_text(raw_text: str) -> list[Chunk]:
    """Для переиндексации: режем сохранённый текст с актуальными правилами чистки."""
-    cleaned = clean_markdown_text(raw_text)
-    sections = parse_text(cleaned.encode("utf-8"), is_markdown=True)
+    _, body_text = _split_frontmatter(raw_text)
+    cleaned = clean_markdown_text(body_text)
+    sections = parse_markdown(cleaned).sections
    for s in sections:
        s.heading = clean_markdown_text(s.heading) if s.heading else ""
        s.body = clean_markdown_text(s.body)