# RAG_helper/tests/test_document_processor_v2.py

"""Юнит-тесты чанкера v2 (Спринт 8.5).

Checks the invariants from docs/CHUNKER_v2_TZ.md:
- YAML frontmatter does not leak into chunks.
- Only H2 opens a section; H3 stays inside the section body.
- Multiple H1 headings are allowed: no WARN and no cut-off, the H2 sections under
  every H1 are kept (the old "second H1 → WARN + stop" rule has been withdrawn).
- Breadcrumb `## {H2}` at the start of every subchunk.
- Merge and overlap do not cross H2 boundaries.
- Numbered lists in md are not parsed as headings.
- On the real general_info.md, a chunk containing «Тимпанометрия» has section="Направления приёма".

Запуск из корня репо: `python -m unittest tests.test_document_processor_v2 -v`
"""
import logging
import sys
import unittest
from pathlib import Path

# Корень репозитория в sys.path, чтобы импортировать services.* без установки пакета.
REPO_ROOT = Path(__file__).resolve().parent.parent
if str(REPO_ROOT) not in sys.path:
    sys.path.insert(0, str(REPO_ROOT))

from services.document_processor import (  # noqa: E402
    Chunk,
    ParsedSection,
    _split_frontmatter,
    chunk_sections,
    parse_markdown,
    process_document,
    rechunk_raw_text,
)


class FrontmatterTests(unittest.TestCase):
    """Tests for ``_split_frontmatter``, which returns a ``(frontmatter, body)`` pair."""

    def test_no_frontmatter_returns_text_as_is(self):
        """Without a leading ``---`` block the metadata is empty and the body is the input."""
        text = "# Title\n\n## Sec\n\nbody"
        fm, body = _split_frontmatter(text)
        self.assertEqual(fm, {})
        self.assertEqual(body, text)

    def test_yaml_frontmatter_is_stripped(self):
        """A valid YAML block is parsed into a dict and removed from the body."""
        text = "---\nintent: general_info\nsources: [a, b]\n---\n\n# Title\n\n## Sec\n\nbody"
        fm, body = _split_frontmatter(text)
        self.assertEqual(fm, {"intent": "general_info", "sources": ["a", "b"]})
        self.assertTrue(body.startswith("# Title"))
        self.assertNotIn("intent:", body)
        # The first body line must not be a leftover frontmatter delimiter.
        self.assertNotIn("---", body.split("\n", 1)[0])

    def test_invalid_yaml_falls_back_to_empty_fm(self):
        """Unparseable frontmatter yields empty metadata and must not lose the document."""
        text = "---\n: : invalid yaml :\n---\n\n# Title\n## Sec\nbody"
        fm, body = _split_frontmatter(text)
        # A parse failure must not abort processing: the metadata is simply empty.
        self.assertEqual(fm, {})
        # The markdown content has to survive the fallback. Only content preservation
        # is asserted here, so the check holds whether the implementation returns the
        # original text untouched or strips the malformed block.
        # NOTE(review): the original comment said the whole input is returned as body;
        # tighten this to assertEqual(body, text) if that is the intended contract.
        self.assertIn("# Title", body)
        self.assertIn("## Sec", body)


class MultipleH1Tests(unittest.TestCase):
    """Behaviour of ``parse_markdown`` on documents with one or more H1 headings."""

    def test_multiple_h1_with_h2_inside_each(self):
        """H2 sections under every H1 are returned and bodies stay inside their own H1."""
        source = (
            "# Step intro\n"
            "## Tone\n"
            "intro tone body\n\n"
            "# Step qualify\n"
            "## Template\n"
            "qualify template body\n\n"
            "## Guards\n"
            "qualify guards body\n\n"
            "# Step book\n"
            "## Confirmation\n"
            "book confirmation body\n"
        )
        result = parse_markdown(source)
        self.assertEqual(
            [section.heading for section in result.sections],
            ["Tone", "Template", "Guards", "Confirmation"],
        )

        # Bodies must not spill over from one H1 group into another.
        by_heading = {section.heading: section for section in result.sections}

        tone_body = by_heading["Tone"].body
        self.assertIn("intro tone body", tone_body)
        tone_lowered = tone_body.lower()
        self.assertNotIn("template", tone_lowered)
        self.assertNotIn("confirmation", tone_lowered)

        confirmation_body = by_heading["Confirmation"].body
        self.assertIn("book confirmation body", confirmation_body)
        self.assertNotIn("intro", confirmation_body.lower())

    def test_h1_without_h2_becomes_section_heading_h1(self):
        """An H1 with no H2 below it yields a single section named after the H1."""
        source = (
            "# Step close\n"
            "Closing line one. Closing line two. Closing line three.\n"
        )
        found = parse_markdown(source).sections
        self.assertEqual(len(found), 1)
        only = found[0]
        self.assertEqual(only.heading, "Step close")
        self.assertIn("Closing line", only.body)

    def test_h1_preamble_before_first_h2_is_dropped(self):
        """Text between an H1 and its first H2 does not become a section."""
        # Per the spec the H1 preamble (body before the first H2) is ignored: it is
        # usually an introduction or boilerplate that repeats the title.
        source = (
            "# Doc\n"
            "intro line that must not become a section\n\n"
            "## Real H2\n"
            "real h2 body content here.\n"
        )
        found = parse_markdown(source).sections
        self.assertEqual(len(found), 1)
        only = found[0]
        self.assertEqual(only.heading, "Real H2")
        self.assertNotIn("intro line that must not", only.body)

    def test_multiple_h1_no_warnings(self):
        """Parsing a multi-H1 document logs no "second H1" warning."""
        # The old behaviour (WARN on the second H1) has been withdrawn.
        logger_name = "services.document_processor"
        source = "# H1 one\n## A\nbody a\n\n# H1 two\n## B\nbody b\n"
        with self.assertLogs(logger_name, level="WARNING") as captured:
            parse_markdown(source, source_label="multi.md")
            # assertLogs fails when nothing was logged, so emit one dummy record
            # to keep the context manager happy when the parser stays silent.
            logging.getLogger(logger_name).warning("noop-for-assert-logs")
        offending = list(filter(lambda line: "second H1" in line, captured.output))
        self.assertEqual(offending, [])


class H3InBodyTests(unittest.TestCase):
    """H3 headings belong to the body of the enclosing H2 section."""

    def test_h3_does_not_open_new_section(self):
        """Two H3 blocks under one H2 give one section and one chunk."""
        source = (
            "# Doc\n"
            "## Doctors\n"
            "intro line\n\n"
            "### ENT\n"
            "Petrov, Ivanov\n\n"
            "### Allergists\n"
            "Smirnova\n"
        )
        result = parse_markdown(source)
        self.assertEqual(len(result.sections), 1)
        section = result.sections[0]
        self.assertEqual(section.heading, "Doctors")
        # The H3 markers and their content are kept verbatim in the section body.
        for fragment in ("### ENT", "### Allergists", "Petrov, Ivanov"):
            self.assertIn(fragment, section.body)

        produced = chunk_sections(result.sections)
        self.assertEqual(len(produced), 1)
        chunk = produced[0]
        self.assertEqual(chunk.section, "Doctors")
        # The chunk text opens with the H2 breadcrumb followed by a blank line.
        self.assertTrue(chunk.text.startswith("## Doctors\n\n"))
        self.assertIn("### ENT", chunk.text)


class NumberedListTests(unittest.TestCase):
    """Numbered list items must never be mistaken for headings."""

    def test_numbered_list_items_not_treated_as_headings(self):
        """A three-item numbered list stays inside its H2 section and a single chunk."""
        source = (
            "# Doc\n"
            "## Tax certificate\n"
            "How to receive:\n\n"
            "1. In person at the clinic — the easiest option.\n"
            "2. By email — write to mail@clinic.ru.\n"
            "3. Directly to the tax office.\n"
        )
        result = parse_markdown(source)
        self.assertEqual(len(result.sections), 1)
        self.assertEqual(result.sections[0].heading, "Tax certificate")

        produced = chunk_sections(result.sections)
        self.assertEqual(len(produced), 1)
        # All three items live in the one chunk of the one section.
        chunk_text = produced[0].text
        for item_prefix in ("1. In person", "2. By email", "3. Directly"):
            self.assertIn(item_prefix, chunk_text)
        # No section name may start with a digit.
        for c in produced:
            self.assertFalse(c.section[:1].isdigit(), f"section={c.section!r}")


class LongSectionSplitTests(unittest.TestCase):
    """Splitting of oversized sections and isolation between neighbouring H2 sections."""

    def test_long_section_splits_with_breadcrumb_and_same_section(self):
        """Every subchunk of a long section keeps the section name and the breadcrumb."""
        paragraph = "Sentence one. Sentence two. Sentence three. " * 8  # ~350 chars
        # Six paragraphs give a ~2 KB body, well above the max_chunk_size=600 used below.
        body = "\n\n".join([paragraph] * 6)
        source = (
            "# Doc\n"
            "## Big section\n"
            + body
            + "\n"
        )
        result = parse_markdown(source)
        produced = chunk_sections(
            result.sections,
            max_chunk_size=600,
            min_chunk_size=100,
            overlap_sentences=0,
        )
        self.assertGreater(len(produced), 1)
        for c in produced:
            self.assertEqual(c.section, "Big section")
            self.assertTrue(c.text.startswith("## Big section\n\n"), f"chunk text starts with: {c.text[:30]!r}")

    def test_merge_and_overlap_do_not_cross_h2_boundaries(self):
        """Two short neighbouring sections are neither merged nor overlapped."""
        # Both sections are far below min_chunk_size, yet no merge may happen between them.
        source = (
            "# Doc\n"
            "## Alpha\n"
            "alpha body short.\n\n"
            "## Beta\n"
            "beta body short.\n"
        )
        result = parse_markdown(source)
        produced = chunk_sections(
            result.sections,
            max_chunk_size=1200,
            min_chunk_size=500,
            overlap_sentences=2,
        )
        seen_sections = sorted({c.section for c in produced})
        self.assertEqual(seen_sections, ["Alpha", "Beta"])
        for c in produced:
            # Each chunk must be free of the other section's body and breadcrumb.
            if c.section == "Alpha":
                foreign_body, foreign_crumb = "beta body", "## Beta"
            else:
                foreign_body, foreign_crumb = "alpha body", "## Alpha"
            self.assertNotIn(foreign_body, c.text)
            self.assertNotIn(foreign_crumb, c.text)


class FrontmatterDoesNotLeakTests(unittest.TestCase):
    """Frontmatter values must not appear in any chunk text."""

    def test_frontmatter_not_in_chunks(self):
        """Chunks built by ``rechunk_raw_text`` contain no frontmatter keys or values."""
        source = (
            "---\n"
            "intent: general_info\n"
            "secret: do-not-leak-this-token\n"
            "---\n\n"
            "# Doc\n"
            "## Sec\n"
            "real body line one. real body line two. real body line three.\n"
        )
        # rechunk_raw_text is the entry point used by the re-indexing flow.
        produced = rechunk_raw_text(source)
        self.assertGreater(len(produced), 0)
        for c in produced:
            self.assertNotIn("do-not-leak-this-token", c.text)
            self.assertNotIn("intent: general_info", c.text)
        # The first chunk opens with the breadcrumb of the first H2.
        first_text = produced[0].text
        self.assertTrue(first_text.startswith("## Sec"))


class RealGeneralInfoTests(unittest.TestCase):
    """Checks against the real dataset file ``data/datasets/general_info.md``."""

    @classmethod
    def setUpClass(cls):
        """Read the dataset bytes once for all tests in the class."""
        cls.path = REPO_ROOT.joinpath("data", "datasets", "general_info.md")
        cls.file_bytes = cls.path.read_bytes()

    def _process(self):
        """Run ``process_document`` on the dataset and return its result unchanged."""
        return process_document(self.file_bytes, self.path.name)

    def test_processes_without_errors(self):
        """The file is processed and yields more than five sections and chunks."""
        _, _, sections, chunks = self._process()
        for produced in (sections, chunks):
            self.assertGreater(len(produced), 5)

    def test_no_chunk_with_empty_or_numeric_section(self):
        """Every chunk has a non-empty section name that does not start with a digit."""
        _, _, _, chunks = self._process()
        for c in chunks:
            self.assertTrue(c.section, f"empty section in chunk: {c.text[:60]!r}")
            starts_with_digit = c.section[:1].isdigit()
            self.assertFalse(
                starts_with_digit,
                f"section starts with digit: {c.section!r}",
            )

    def test_tympanometry_chunk_lives_in_napravleniya_priema(self):
        """Chunks mentioning tympanometry are attributed to "Направления приёма"."""
        _, _, _, chunks = self._process()
        hits = list(filter(lambda c: "тимпанометр" in c.text.lower(), chunks))
        self.assertTrue(hits, "no chunk contains 'тимпанометр' — datasets changed?")
        for c in hits:
            self.assertEqual(
                c.section,
                "Направления приёма",
                f"tympanometry chunk has wrong section: {c.section!r}",
            )

    def test_breadcrumb_in_every_chunk(self):
        """Every chunk text starts with the ``## <section>`` breadcrumb."""
        _, _, _, chunks = self._process()
        for c in chunks:
            expected = f"## {c.section}"
            has_breadcrumb = c.text.startswith(expected)
            self.assertTrue(
                has_breadcrumb,
                f"chunk does not start with breadcrumb {expected!r}; starts with {c.text[:60]!r}",
            )


class RealNewBookingTests(unittest.TestCase):
    """new_booking.md has 8 H1 headings (funnel steps + groups), each with its own H2 sections.

    The old "second H1 -> cut off" behaviour would have broken this file. The test
    guards against that regression: H2 sections under every H1 must reach the index.
    """

    @classmethod
    def setUpClass(cls):
        """Read the dataset bytes once for all tests in the class."""
        cls.path = REPO_ROOT.joinpath("data", "datasets", "new_booking.md")
        cls.file_bytes = cls.path.read_bytes()

    def test_sections_from_multiple_h1_groups(self):
        """Sections from different H1 groups are all present in the parse result."""
        _, _, sections, chunks = process_document(self.file_bytes, self.path.name)
        # Every H1 carries its own H2 sections, so the totals must be large.
        for produced in (sections, chunks):
            self.assertGreater(len(produced), 10)
        # Spot-check H2 headings taken from different H1 groups.
        must_have = ("Тон и формулировки", "Шаблон ответа (5 пунктов)", "Текст-завершение")
        section_titles = {s.heading for s in sections}
        for expected in must_have:
            self.assertIn(
                expected,
                section_titles,
                f"section {expected!r} missing — multi-H1 grouping broken?",
            )


if __name__ == "__main__":
    # Direct execution: configure root logging at WARNING level, then hand
    # control to the unittest command-line runner.
    logging.basicConfig(level=logging.WARNING)
    unittest.main()