"""Unit tests for chunker v2 (Sprint 8.5).

Checks the invariants from docs/CHUNKER_v2_TZ.md as exercised by the tests below:

- YAML frontmatter does not leak into chunks.
- Only H2 opens a section; H3 stays in the section body.
- Several H1 headings are allowed: the H2 sections under every H1 are kept,
  an H1 without any H2 becomes a section itself, and no "second H1" warning
  is emitted (the old "second H1 -> WARN + cut-off" rule was revoked).
- Breadcrumb `## {H2}` is present in every subchunk.
- Merge and overlap do not cross H2 boundaries.
- Numbered lists in markdown are not parsed as headings.
- On the real general_info.md every chunk mentioning "Тимпанометрия" has
  section="Направления приёма".

Run from the repository root:
    `python -m unittest tests.test_document_processor_v2 -v`
"""

import logging
import sys
import unittest
from pathlib import Path

# Put the repository root on sys.path so that services.* can be imported
# without installing the package.
REPO_ROOT = Path(__file__).resolve().parent.parent
if str(REPO_ROOT) not in sys.path:
    sys.path.insert(0, str(REPO_ROOT))

from services.document_processor import (  # noqa: E402
    Chunk,
    ParsedSection,
    _split_frontmatter,
    chunk_sections,
    parse_markdown,
    process_document,
    rechunk_raw_text,
)


class FrontmatterTests(unittest.TestCase):
    """`_split_frontmatter`: absent, valid and invalid YAML frontmatter."""

    def test_no_frontmatter_returns_text_as_is(self):
        text = "# Title\n\n## Sec\n\nbody"
        fm, body = _split_frontmatter(text)
        self.assertEqual(fm, {})
        self.assertEqual(body, text)

    def test_yaml_frontmatter_is_stripped(self):
        text = "---\nintent: general_info\nsources: [a, b]\n---\n\n# Title\n\n## Sec\n\nbody"
        fm, body = _split_frontmatter(text)
        self.assertEqual(fm, {"intent": "general_info", "sources": ["a", "b"]})
        self.assertTrue(body.startswith("# Title"))
        self.assertNotIn("intent:", body)
        # The first line of the body must not be a leftover `---` fence.
        self.assertNotIn("---", body.split("\n", 1)[0])

    def test_invalid_yaml_falls_back_to_empty_fm(self):
        text = "---\n: : invalid yaml :\n---\n\n# Title\n## Sec\nbody"
        fm, _body = _split_frontmatter(text)
        # A parse failure must not break processing: frontmatter is empty.
        # Original author's note: since the format cannot be trusted, the
        # source text is returned as the body so that no content is lost.
        # NOTE(review): the body is not asserted here — confirm against
        # _split_frontmatter before pinning it.
        self.assertEqual(fm, {})


class MultipleH1Tests(unittest.TestCase):
    """`parse_markdown` on documents that contain more than one H1."""

    def test_multiple_h1_with_h2_inside_each(self):
        text = (
            "# Step intro\n"
            "## Tone\n"
            "intro tone body\n\n"
            "# Step qualify\n"
            "## Template\n"
            "qualify template body\n\n"
            "## Guards\n"
            "qualify guards body\n\n"
            "# Step book\n"
            "## Confirmation\n"
            "book confirmation body\n"
        )
        parsed = parse_markdown(text)
        headings = [s.heading for s in parsed.sections]
        self.assertEqual(headings, ["Tone", "Template", "Guards", "Confirmation"])
        # Bodies must not bleed across H1 boundaries.
        for s in parsed.sections:
            if s.heading == "Tone":
                self.assertIn("intro tone body", s.body)
                self.assertNotIn("template", s.body.lower())
                self.assertNotIn("confirmation", s.body.lower())
            if s.heading == "Confirmation":
                self.assertIn("book confirmation body", s.body)
                self.assertNotIn("intro", s.body.lower())

    def test_h1_without_h2_becomes_section_heading_h1(self):
        text = (
            "# Step close\n"
            "Closing line one. Closing line two. Closing line three.\n"
        )
        parsed = parse_markdown(text)
        self.assertEqual(len(parsed.sections), 1)
        self.assertEqual(parsed.sections[0].heading, "Step close")
        self.assertIn("Closing line", parsed.sections[0].body)

    def test_h1_preamble_before_first_h2_is_dropped(self):
        # The H1 preamble (text before the first H2) is ignored per the spec:
        # it is usually an intro / boilerplate that duplicates the heading.
        text = (
            "# Doc\n"
            "intro line that must not become a section\n\n"
            "## Real H2\n"
            "real h2 body content here.\n"
        )
        parsed = parse_markdown(text)
        self.assertEqual(len(parsed.sections), 1)
        self.assertEqual(parsed.sections[0].heading, "Real H2")
        self.assertNotIn("intro line that must not", parsed.sections[0].body)

    def test_multiple_h1_no_warnings(self):
        # The old behaviour — a WARN on the second H1 — has been revoked.
        text = "# H1 one\n## A\nbody a\n\n# H1 two\n## B\nbody b\n"
        with self.assertLogs("services.document_processor", level="WARNING") as cap:
            parse_markdown(text, source_label="multi.md")
            # assertLogs fails when nothing is logged, so emit a dummy record
            # inside the context to keep the test alive when there are
            # genuinely no warnings.
            logging.getLogger("services.document_processor").warning("noop-for-assert-logs")
        warnings_about_h1 = [m for m in cap.output if "second H1" in m]
        self.assertEqual(warnings_about_h1, [])


class H3InBodyTests(unittest.TestCase):
    """H3 headings stay inside the body of the enclosing H2 section."""

    def test_h3_does_not_open_new_section(self):
        text = (
            "# Doc\n"
            "## Doctors\n"
            "intro line\n\n"
            "### ENT\n"
            "Petrov, Ivanov\n\n"
            "### Allergists\n"
            "Smirnova\n"
        )
        parsed = parse_markdown(text)
        self.assertEqual(len(parsed.sections), 1)
        self.assertEqual(parsed.sections[0].heading, "Doctors")
        body = parsed.sections[0].body
        self.assertIn("### ENT", body)
        self.assertIn("### Allergists", body)
        self.assertIn("Petrov, Ivanov", body)

        chunks = chunk_sections(parsed.sections)
        self.assertEqual(len(chunks), 1)
        self.assertEqual(chunks[0].section, "Doctors")
        self.assertTrue(chunks[0].text.startswith("## Doctors\n\n"))
        self.assertIn("### ENT", chunks[0].text)


class NumberedListTests(unittest.TestCase):
    """Numbered list items must not be mistaken for headings."""

    def test_numbered_list_items_not_treated_as_headings(self):
        text = (
            "# Doc\n"
            "## Tax certificate\n"
            "How to receive:\n\n"
            "1. In person at the clinic — the easiest option.\n"
            "2. By email — write to mail@clinic.ru.\n"
            "3. Directly to the tax office.\n"
        )
        parsed = parse_markdown(text)
        self.assertEqual(len(parsed.sections), 1)
        self.assertEqual(parsed.sections[0].heading, "Tax certificate")

        chunks = chunk_sections(parsed.sections)
        self.assertEqual(len(chunks), 1)
        # All three items live in a single chunk of a single section.
        for needle in ("1. In person", "2. By email", "3. Directly"):
            self.assertIn(needle, chunks[0].text)
        # No section name may start with a digit.
        for c in chunks:
            self.assertFalse(c.section[:1].isdigit(), f"section={c.section!r}")


class LongSectionSplitTests(unittest.TestCase):
    """Splitting, merging and overlap behaviour of `chunk_sections`."""

    def test_long_section_splits_with_breadcrumb_and_same_section(self):
        para = "Sentence one. Sentence two. Sentence three. " * 8  # ~350 chars
        text = (
            "# Doc\n"
            "## Big section\n"
            # ~2 KB body, well above the max_chunk_size=600 passed below.
            + "\n\n".join([para] * 6)
            + "\n"
        )
        parsed = parse_markdown(text)
        chunks = chunk_sections(
            parsed.sections,
            max_chunk_size=600,
            min_chunk_size=100,
            overlap_sentences=0,
        )
        self.assertGreater(len(chunks), 1)
        for c in chunks:
            self.assertEqual(c.section, "Big section")
            self.assertTrue(
                c.text.startswith("## Big section\n\n"),
                f"chunk text starts with: {c.text[:30]!r}",
            )

    def test_merge_and_overlap_do_not_cross_h2_boundaries(self):
        # Two short sections — they must not be merged with each other.
        text = (
            "# Doc\n"
            "## Alpha\n"
            "alpha body short.\n\n"
            "## Beta\n"
            "beta body short.\n"
        )
        parsed = parse_markdown(text)
        chunks = chunk_sections(
            parsed.sections,
            max_chunk_size=1200,
            min_chunk_size=500,
            overlap_sentences=2,
        )
        sections = sorted({c.section for c in chunks})
        self.assertEqual(sections, ["Alpha", "Beta"])
        for c in chunks:
            if c.section == "Alpha":
                self.assertNotIn("beta body", c.text)
                self.assertNotIn("## Beta", c.text)
            else:
                self.assertNotIn("alpha body", c.text)
                self.assertNotIn("## Alpha", c.text)


class FrontmatterDoesNotLeakTests(unittest.TestCase):
    """Frontmatter values must never appear in chunk text."""

    def test_frontmatter_not_in_chunks(self):
        text = (
            "---\n"
            "intent: general_info\n"
            "secret: do-not-leak-this-token\n"
            "---\n\n"
            "# Doc\n"
            "## Sec\n"
            "real body line one. real body line two. real body line three.\n"
        )
        # Run through rechunk_raw_text — this is the re-indexing flow.
        chunks = rechunk_raw_text(text)
        self.assertGreater(len(chunks), 0)
        for c in chunks:
            self.assertNotIn("do-not-leak-this-token", c.text)
            self.assertNotIn("intent: general_info", c.text)
        # The first chunk starts with the breadcrumb of the first H2.
        self.assertTrue(chunks[0].text.startswith("## Sec"))


class RealGeneralInfoTests(unittest.TestCase):
    """End-to-end checks against the real data/datasets/general_info.md."""

    @classmethod
    def setUpClass(cls):
        # Read the dataset once; each test feeds the raw bytes to
        # process_document on its own.
        path = REPO_ROOT / "data" / "datasets" / "general_info.md"
        cls.path = path
        cls.file_bytes = path.read_bytes()

    def test_processes_without_errors(self):
        _, _, sections, chunks = process_document(self.file_bytes, self.path.name)
        self.assertGreater(len(sections), 5)
        self.assertGreater(len(chunks), 5)

    def test_no_chunk_with_empty_or_numeric_section(self):
        _, _, _, chunks = process_document(self.file_bytes, self.path.name)
        for c in chunks:
            self.assertTrue(c.section, f"empty section in chunk: {c.text[:60]!r}")
            self.assertFalse(
                c.section[:1].isdigit(),
                f"section starts with digit: {c.section!r}",
            )

    def test_tympanometry_chunk_lives_in_napravleniya_priema(self):
        _, _, _, chunks = process_document(self.file_bytes, self.path.name)
        matches = [c for c in chunks if "тимпанометр" in c.text.lower()]
        self.assertTrue(matches, "no chunk contains 'тимпанометр' — datasets changed?")
        for c in matches:
            self.assertEqual(
                c.section,
                "Направления приёма",
                f"tympanometry chunk has wrong section: {c.section!r}",
            )

    def test_breadcrumb_in_every_chunk(self):
        _, _, _, chunks = process_document(self.file_bytes, self.path.name)
        for c in chunks:
            expected = f"## {c.section}"
            self.assertTrue(
                c.text.startswith(expected),
                f"chunk does not start with breadcrumb {expected!r}; starts with {c.text[:60]!r}",
            )


class RealNewBookingTests(unittest.TestCase):
    """new_booking.md — 8 H1 headings (funnel steps + groups).

    Each H1 has its own H2 sections. The old "second H1 -> cut-off" behaviour
    would have broken this file. The test guards against that regression:
    all H2 sections under all H1 headings must make it into the index.
    """

    @classmethod
    def setUpClass(cls):
        path = REPO_ROOT / "data" / "datasets" / "new_booking.md"
        cls.path = path
        cls.file_bytes = path.read_bytes()

    def test_sections_from_multiple_h1_groups(self):
        _, _, sections, chunks = process_document(self.file_bytes, self.path.name)
        # Every H1 has its own H2s, so there must be many sections in total.
        self.assertGreater(len(sections), 10)
        self.assertGreater(len(chunks), 10)

        section_titles = {s.heading for s in sections}
        # Spot-check H2 sections that belong to different H1 groups.
        for expected in ("Тон и формулировки", "Шаблон ответа (5 пунктов)", "Текст-завершение"):
            self.assertIn(
                expected,
                section_titles,
                f"section {expected!r} missing — multi-H1 grouping broken?",
            )


if __name__ == "__main__":
    logging.basicConfig(level=logging.WARNING)
    unittest.main()