# RAG_helper/services/document_processor.py

import io
import logging
import re
import uuid
from dataclasses import dataclass, field
from pathlib import Path

import fitz  # pymupdf
import yaml
from docx import Document as DocxDocument
from markdown_it import MarkdownIt

from config import settings
from services.text_cleanup import clean_markdown_text

MIN_CHUNK_TEXT_LENGTH = 20  # shorter chunks are dropped (usually leftovers after cleanup)

logger = logging.getLogger(__name__)


@dataclass
class ParsedSection:
    """A heading together with the body text collected under it by one of the parsers."""

    heading: str  # heading text; the parsers use "" for content that has no heading
    heading_level: int  # heading depth assigned by the parser (1 = top level)
    body: str  # text that follows the heading, up to the next section
    page_number: int = 0  # set by parse_pdf (1-based); the other parsers leave the default 0


@dataclass
class Chunk:
    """One piece of text produced by chunk_sections."""

    text: str  # chunk text; chunk_sections prefixes it with `## {heading}` when the section has a heading
    section: str = ""  # heading of the section the chunk was cut from
    page_number: int = 0  # copied from the source section's page_number
    chunk_index: int = 0  # position of the chunk in the returned list


@dataclass
class ParsedMarkdown:
    """Result of parsing markdown: the frontmatter (if any) plus the list of parsed sections."""
    frontmatter: dict = field(default_factory=dict)  # parsed YAML frontmatter; empty when absent
    sections: list[ParsedSection] = field(default_factory=list)  # sections in document order


# --- Parsers ---


def parse_pdf(file_bytes: bytes) -> list[ParsedSection]:
    """Split a PDF into sections using font-size / bold heading heuristics.

    A non-empty line is treated as a heading when it is shorter than 200
    characters and its largest span is at least 14pt, or at least 12pt in a
    font whose name contains "bold". Every other non-empty line is appended to
    the body of the current section.

    Args:
        file_bytes: Raw PDF content.

    Returns:
        Sections in document order. ``heading_level`` is 1 for headings of
        16pt and larger, otherwise 2 (also 2 for text without a heading).
        ``page_number`` is the 1-based page of the section's heading; for text
        that precedes the first heading it is the page of the most recent
        such line.
    """
    sections: list[ParsedSection] = []
    current_heading = ""
    current_level = 0
    current_body_lines: list[str] = []
    current_page = 0

    def flush() -> None:
        # Emit the section accumulated so far, using the level of ITS OWN
        # heading (not the level of the heading that terminates it).
        if current_heading or current_body_lines:
            sections.append(ParsedSection(
                heading=current_heading,
                heading_level=current_level or 2,
                body="\n".join(current_body_lines).strip(),
                page_number=current_page,
            ))

    doc = fitz.open(stream=file_bytes, filetype="pdf")
    try:
        for page_number, page in enumerate(doc, start=1):
            for block in page.get_text("dict")["blocks"]:
                if "lines" not in block:
                    continue  # blocks without text lines (e.g. images) are skipped
                for line in block["lines"]:
                    spans = line["spans"]
                    text = "".join(span["text"] for span in spans).strip()
                    if not text:
                        continue

                    max_size = max(span["size"] for span in spans)
                    is_bold = any("bold" in span["font"].lower() for span in spans)
                    is_heading = len(text) < 200 and (
                        max_size >= 14 or (is_bold and max_size >= 12)
                    )

                    if is_heading:
                        flush()
                        current_heading = text
                        current_level = 1 if max_size >= 16 else 2
                        current_body_lines = []
                        current_page = page_number
                    else:
                        current_body_lines.append(text)
                        if not current_heading:
                            current_page = page_number
    finally:
        # Release the document even if a page fails to parse.
        doc.close()

    flush()
    return sections


def parse_docx(file_bytes: bytes) -> list[ParsedSection]:
    """Split a DOCX document into sections based on paragraph styles.

    A paragraph whose style name contains "heading" or "title" (case-insensitive)
    opens a new section; its level is the first number found in the style name,
    or 1 when there is none. All other non-empty paragraphs become body lines of
    the current section. Only ``doc.paragraphs`` is read.

    Args:
        file_bytes: Raw DOCX content.

    Returns:
        Sections in document order; text before the first heading is returned
        as a section with an empty heading and level 1.
    """
    document = DocxDocument(io.BytesIO(file_bytes))
    result: list[ParsedSection] = []
    heading = ""
    level = 0
    body_lines: list[str] = []

    def emit_pending() -> None:
        # Nothing accumulated yet -> nothing to emit.
        if not (heading or body_lines):
            return
        result.append(ParsedSection(
            heading=heading,
            heading_level=level or 1,
            body="\n".join(body_lines).strip(),
        ))

    for paragraph in document.paragraphs:
        content = paragraph.text.strip()
        if not content:
            continue

        style = (paragraph.style.name or "").lower()
        if "heading" not in style and "title" not in style:
            body_lines.append(content)
            continue

        # A heading closes the previous section before starting a new one.
        emit_pending()
        digits = re.search(r"\d+", style)
        level = int(digits.group()) if digits else 1
        heading = content
        body_lines = []

    emit_pending()
    return result


def parse_text(file_bytes: bytes, is_markdown: bool = False) -> list[ParsedSection]:
    """Parse wiki-style TXT/MD into sections using per-line heuristics.

    A line opens a new section when it matches one of the following, checked
    in this order:

    - a markdown heading (``#`` .. ``######``); level = number of ``#``;
    - a numbered item (``1.``, ``1.1``, ``1.1.1.``) followed by a capitalised
      word, shorter than 200 characters; level = numbering depth (``1.`` -> 1,
      ``1.1`` and ``1.1.`` -> 2), and the heading keeps the number;
    - an FAQ question (``В:`` / ``Вопрос:`` / ``Q:`` / ``Question:``), shorter
      than 300 characters; level 3, the heading is the question text;
    - an ALL-CAPS line of 4..199 characters; level 1.

    Everything else is appended to the body of the current section.

    Args:
        file_bytes: File content; decoded as UTF-8 with invalid bytes replaced.
        is_markdown: Accepted for interface compatibility; not used.

    Returns:
        Sections in document order; text before the first heading is returned
        as a section with an empty heading and level 1.
    """
    text = file_bytes.decode("utf-8", errors="replace")
    sections: list[ParsedSection] = []
    current_heading = ""
    current_level = 0
    current_body_lines: list[str] = []

    md_heading_re = re.compile(r"^(#{1,6})\s+(.+)")
    numbered_heading_re = re.compile(r"^(\d+(?:\.\d+)*\.?)\s+([А-ЯЁA-Z].*)")
    faq_question_re = re.compile(r"^(В|Вопрос|Q|Question)\s*[:\.]\s*(.+)", re.IGNORECASE)

    def flush() -> None:
        # Emit the section accumulated so far, if there is anything to emit.
        if current_heading or current_body_lines:
            sections.append(ParsedSection(
                heading=current_heading,
                heading_level=current_level or 1,
                body="\n".join(current_body_lines).strip(),
            ))

    for line in text.split("\n"):
        stripped = line.strip()

        heading_text = None
        heading_level = 0

        md_match = md_heading_re.match(stripped)
        if md_match:
            heading_level = len(md_match.group(1))
            heading_text = md_match.group(2).strip()

        if not heading_text:
            num_match = numbered_heading_re.match(stripped)
            if num_match and len(stripped) < 200:
                # Depth = number of numeric components, so an optional trailing
                # dot ("1.1." vs "1.1") does not change the level.
                heading_level = len(num_match.group(1).rstrip(".").split("."))
                heading_text = stripped

        if not heading_text:
            faq_match = faq_question_re.match(stripped)
            if faq_match and len(stripped) < 300:
                heading_text = faq_match.group(2).strip()
                heading_level = 3

        if not heading_text and stripped.isupper() and 3 < len(stripped) < 200:
            heading_text = stripped
            heading_level = 1

        if heading_text:
            flush()
            current_heading = heading_text
            current_level = heading_level
            current_body_lines = []
        else:
            # Drop the "\r" left over from CRLF line endings; other whitespace
            # (indentation) is kept as is.
            current_body_lines.append(line.rstrip("\r"))

    flush()
    return sections


# --- Markdown parser v2 (H1/H2/H3 hierarchy, frontmatter, multiple H1 roots) ---


def _split_frontmatter(text: str) -> tuple[dict, str]:
    """Если файл начинается со строки `---`, отрезает YAML-frontmatter и парсит его.

    Возвращает (frontmatter_dict, body_text). Если frontmatter не найден или невалиден —
    словарь пустой, body_text == исходный text.
    """
    if not text.startswith("---"):
        return {}, text
    lines = text.split("\n")
    if not lines or lines[0].strip() != "---":
        return {}, text
    end = -1
    for i in range(1, len(lines)):
        if lines[i].strip() == "---":
            end = i
            break
    if end == -1:
        return {}, text
    fm_text = "\n".join(lines[1:end])
    body = "\n".join(lines[end + 1:]).lstrip("\n")
    try:
        fm = yaml.safe_load(fm_text) or {}
    except yaml.YAMLError as exc:
        logger.warning("Failed to parse YAML frontmatter: %s", exc)
        return {}, text
    if not isinstance(fm, dict):
        return {}, body
    return fm, body


def parse_markdown(text: str, source_label: str = "") -> ParsedMarkdown:
    """Parse markdown into sections following the H1/H2/H3 hierarchy.

    Rules (see docs/CHUNKER_v2_TZ.md):
    - YAML frontmatter (``--- ... ---`` at the very start) is cut off and does
      not end up in the section text.
    - Every H1 is a "root section" of the document; multiple H1 are supported.
    - Inside an H1 each H2 opens its own section (the H2 text becomes
      ``ParsedSection.heading``). The H1 preamble (text between the H1 and its
      first H2) is ignored.
    - An H1 without any H2 becomes a single section with the H1 as heading and
      all of its content as body.
    - H3 and deeper do not open sections; they stay in the body of the current
      section as written.
    - A file without any H1 is handled as one virtual H1 spanning the whole file.

    Args:
        text: Markdown source, optionally starting with frontmatter.
        source_label: Currently unused; reserved for logging.

    Returns:
        ParsedMarkdown with the frontmatter dict and the sections in order.
    """
    frontmatter, body_text = _split_frontmatter(text)
    _ = source_label  # not used yet

    tokens = MarkdownIt("commonmark").parse(body_text)
    lines = body_text.split("\n")
    total_lines = len(lines)

    # Each entry: (level, start_line, body_start_line, heading_text)
    headings: list[tuple[int, int, int, str]] = []
    for pos, token in enumerate(tokens):
        if token.type != "heading_open" or token.map is None:
            continue
        # The token right after heading_open carries the heading text.
        follower = tokens[pos + 1] if pos + 1 < len(tokens) else None
        title = follower.content.strip() if follower is not None else ""
        headings.append((int(token.tag[1]), token.map[0], token.map[1], title))

    # Build (body_start, end_line, h1_text) ranges, one per H1.
    h1_entries = [h for h in headings if h[0] == 1]
    if h1_entries:
        # An H1 range ends where the next H1 starts; the last one runs to EOF.
        range_ends = [h[1] for h in h1_entries[1:]] + [total_lines]
        groups = [(h[2], end, h[3]) for h, end in zip(h1_entries, range_ends)]
    else:
        groups = [(0, total_lines, "")]

    sections: list[ParsedSection] = []
    for group_start, group_end, group_title in groups:
        h2_entries = [
            h for h in headings
            if h[0] == 2 and group_start <= h[1] < group_end
        ]

        if not h2_entries:
            # H1 without H2: the whole range is one section headed by the H1.
            body = "\n".join(lines[group_start:group_end]).strip()
            if body or group_title:
                sections.append(ParsedSection(
                    heading=group_title,
                    heading_level=1 if group_title else 2,
                    body=body,
                ))
            continue

        # An H2 body ends at the next H2 of the same group, or at the group end.
        stops = [h[1] for h in h2_entries[1:]] + [group_end]
        for (_, _, body_start, title), stop in zip(h2_entries, stops):
            sections.append(ParsedSection(
                heading=title,
                heading_level=2,
                body="\n".join(lines[body_start:stop]).strip(),
            ))

    return ParsedMarkdown(frontmatter=frontmatter, sections=sections)


# --- Chunker ---


def _split_sentences(text: str) -> list[str]:
    sentences = re.split(r"(?<=[.!?])\s+", text)
    return [s.strip() for s in sentences if s.strip()]


def _strip_breadcrumb(chunk_text: str, breadcrumb: str) -> str:
    """Return chunk_text without its leading breadcrumb line (unchanged if absent)."""
    if not breadcrumb:
        return chunk_text
    if chunk_text == breadcrumb:
        return ""
    return chunk_text.removeprefix(breadcrumb + "\n\n")


def chunk_sections(
    sections: list[ParsedSection],
    max_chunk_size: int | None = None,
    min_chunk_size: int | None = None,
    overlap_sentences: int | None = None,
) -> list[Chunk]:
    """Chunk sections under the invariant "one chunk lies within one section".

    Key rules (see docs/CHUNKER_v2_TZ.md):
    - A section that fits into ``max_chunk_size`` becomes a single chunk;
      otherwise its body is split on paragraph boundaries (blank lines).
    - The first line of every sub-chunk is the breadcrumb ``## {heading}``,
      and the breadcrumb appears only once per chunk.
    - Merging of short sub-chunks and sentence overlap only happen inside one
      section.
    - Sections whose heading is listed in ``settings.excluded_section_headings``
      are skipped.
    - Sections with an empty heading (e.g. PDF/DOCX text without a heading)
      are indexed without a breadcrumb so their content is not lost.

    Args:
        sections: Parsed sections to chunk.
        max_chunk_size: Maximum chunk length in characters; ``None`` (or 0)
            falls back to ``settings.max_chunk_size``.
        min_chunk_size: A sub-chunk shorter than this is merged with the next
            one; ``None`` falls back to ``settings.min_chunk_size``, ``0``
            disables merging.
        overlap_sentences: Number of trailing sentences of the previous
            sub-chunk repeated at the start of the next one; ``None`` falls
            back to ``settings.overlap_sentences``, ``0`` disables overlap.

    Returns:
        Chunks in document order with ``chunk_index`` set to their position.
    """
    # A zero maximum is meaningless, so any falsy value falls back to settings.
    max_size = max_chunk_size or settings.max_chunk_size
    # 0 is a legitimate explicit value for these two, so only None falls back.
    min_size = settings.min_chunk_size if min_chunk_size is None else min_chunk_size
    overlap = settings.overlap_sentences if overlap_sentences is None else overlap_sentences
    excluded = set(settings.excluded_section_headings or [])

    final: list[Chunk] = []

    for section in sections:
        if section.heading and section.heading in excluded:
            continue
        body = section.body.strip()
        if not body and not section.heading:
            continue

        breadcrumb = f"## {section.heading}" if section.heading else ""
        if breadcrumb:
            full_text = f"{breadcrumb}\n\n{body}" if body else breadcrumb
        else:
            full_text = body

        if len(full_text) <= max_size:
            section_chunks = [full_text]
        else:
            paragraphs = [p.strip() for p in re.split(r"\n{2,}", body) if p.strip()]
            section_chunks = []
            current = breadcrumb
            for para in paragraphs:
                # Cost of gluing: current + "\n\n" + para.
                projected = (current + "\n\n" + para) if current else para
                if len(projected) > max_size and current and current != breadcrumb:
                    section_chunks.append(current)
                    current = f"{breadcrumb}\n\n{para}" if breadcrumb else para
                else:
                    current = projected
            if current and current != breadcrumb:
                section_chunks.append(current)

        # Merge short sub-chunks with the following one — within this section only.
        merged: list[str] = []
        for ch in section_chunks:
            if merged and len(merged[-1]) < min_size:
                # `ch` starts with the breadcrumb already present at the top of
                # merged[-1]; drop it so the heading is not repeated mid-chunk.
                tail = _strip_breadcrumb(ch, breadcrumb)
                if tail:
                    merged[-1] = f"{merged[-1]}\n\n{tail}"
            else:
                merged.append(ch)

        # Sentence overlap — only between sub-chunks of the same section.
        if overlap > 0 and len(merged) > 1:
            with_overlap = [merged[0]]
            for prev, cur in zip(merged, merged[1:]):
                # Take sentences from the previous body only: the breadcrumb has
                # no sentence terminator and would otherwise be glued to the
                # first sentence and duplicated in the next chunk.
                prev_sentences = _split_sentences(_strip_breadcrumb(prev, breadcrumb))
                overlap_text = " ".join(prev_sentences[-overlap:])
                if not overlap_text or overlap_text in cur:
                    with_overlap.append(cur)
                    continue
                # Insert the overlap after the breadcrumb so the heading stays first.
                if breadcrumb and cur.startswith(breadcrumb + "\n\n"):
                    rest = cur[len(breadcrumb) + 2:]
                    new_text = f"{breadcrumb}\n\n{overlap_text}\n\n{rest}"
                else:
                    new_text = f"{overlap_text}\n\n{cur}"
                with_overlap.append(new_text)
            merged = with_overlap

        for ch_text in merged:
            final.append(Chunk(
                text=ch_text.strip(),
                section=section.heading,
                page_number=section.page_number,
            ))

    for i, c in enumerate(final):
        c.chunk_index = i

    return final


# --- Main processor ---


def _sections_to_markdown(sections: list[ParsedSection]) -> str:
    """Assemble sections into markdown-like text that can be re-chunked later.

    Every heading is written as H2 regardless of ``heading_level``. This
    normalizes PDF/DOCX output, where the level may be 1 or 2: `parse_markdown`
    treats an H1 as a root and ignores the text between an H1 and its first H2,
    so keeping the original levels could lose content on reindex.
    """
    blocks: list[str] = []
    for section in sections:
        heading_part = [f"## {section.heading}"] if section.heading else []
        body_part = [section.body] if section.body else []
        blocks.extend(heading_part + body_part)
    return "\n\n".join(blocks).strip()


def process_document(
    file_bytes: bytes, filename: str
) -> tuple[str, str, list[ParsedSection], list[Chunk]]:
    """Parse a document, clean markdown noise and cut it into chunks.

    The parser is picked by the file extension: ``.pdf``, ``.docx``/``.doc``,
    ``.md`` or ``.txt``.

    Args:
        file_bytes: Raw file content.
        filename: File name; only its extension is used for dispatch (the name
            itself appears in log messages).

    Returns:
        ``(document_id, raw_text, sections, chunks)``. ``document_id`` is a
        fresh UUID4 string. ``raw_text`` is the text kept for later
        re-chunking: for PDF/DOCX it is markdown reassembled from the parsed
        sections, for MD/TXT it is the decoded file content. ``sections`` and
        ``chunks`` are empty lists when nothing usable was found.

    Raises:
        ValueError: The extension is not supported.
    """
    document_id = str(uuid.uuid4())
    ext = Path(filename).suffix.lower()

    if ext in (".pdf", ".docx", ".doc"):
        binary_parser = parse_pdf if ext == ".pdf" else parse_docx
        sections = binary_parser(file_bytes)
        raw_text = _sections_to_markdown(sections)
    elif ext in (".md", ".txt"):
        raw_text = file_bytes.decode("utf-8", errors="replace")
        if ext == ".md":
            _, body_text = _split_frontmatter(raw_text)
            cleaned = clean_markdown_text(body_text)
            sections = parse_markdown(cleaned, source_label=filename).sections
        else:
            sections = parse_text(raw_text.encode("utf-8"), is_markdown=False)
    else:
        raise ValueError(f"Unsupported file format: {ext}")

    # Safety net: clean every section even if the source was already cleaned,
    # then keep only sections that still have a heading or a non-blank body.
    kept: list[ParsedSection] = []
    for section in sections:
        section.heading = clean_markdown_text(section.heading) if section.heading else ""
        section.body = clean_markdown_text(section.body)
        if section.heading or section.body.strip():
            kept.append(section)
    sections = kept

    if not sections:
        logger.warning("No sections found in %s", filename)
        return document_id, raw_text, [], []

    # Drop empty / tiny leftovers and renumber what remains.
    chunks = [
        chunk for chunk in chunk_sections(sections)
        if len(chunk.text.strip()) >= MIN_CHUNK_TEXT_LENGTH
    ]
    for position, chunk in enumerate(chunks):
        chunk.chunk_index = position
    logger.info("Processed '%s': %d sections → %d chunks (cleaned)", filename, len(sections), len(chunks))
    return document_id, raw_text, sections, chunks


def rechunk_raw_text(raw_text: str) -> list[Chunk]:
    """Re-chunk stored text with the current cleanup rules (used for reindexing).

    The text is stripped of frontmatter, cleaned, parsed as markdown, chunked,
    and chunks shorter than ``MIN_CHUNK_TEXT_LENGTH`` are dropped.

    Returns:
        The remaining chunks with ``chunk_index`` renumbered from 0.
    """
    _, body_text = _split_frontmatter(raw_text)
    parsed = parse_markdown(clean_markdown_text(body_text))

    # Clean each section again and keep only those with a heading or a body.
    kept = []
    for section in parsed.sections:
        section.heading = clean_markdown_text(section.heading) if section.heading else ""
        section.body = clean_markdown_text(section.body)
        if section.heading or section.body.strip():
            kept.append(section)

    chunks = [
        chunk for chunk in chunk_sections(kept)
        if len(chunk.text.strip()) >= MIN_CHUNK_TEXT_LENGTH
    ]
    for position, chunk in enumerate(chunks):
        chunk.chunk_index = position
    return chunks