From e534a74460e1a9007a8824ee1679c0ca243a5712 Mon Sep 17 00:00:00 2001
From: AR 15 M4 <alekseyrazorvinm4@MacBook-Air-AR.local>
Date: Thu, 23 Apr 2026 11:15:08 +0500
Subject: [PATCH] =?UTF-8?q?feat(sprint2.5):=20=D1=87=D0=B8=D1=81=D1=82?=
 =?UTF-8?q?=D0=BA=D0=B0=20=D1=87=D0=B0=D0=BD=D0=BA=D0=BE=D0=B2=20=D0=B8=20?=
 =?UTF-8?q?=D0=BF=D0=B5=D1=80=D0=B5=D0=B8=D0=BD=D0=B4=D0=B5=D0=BA=D1=81?=
 =?UTF-8?q?=D0=B0=D1=86=D0=B8=D1=8F?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Чанкер тащил в базу markdown-мусор: навигационные блоки «Вернуться на:»
со списками ссылок, инлайн-ссылки [текст](url) в теле, служебные
пометки _Источник: .../file.md_, лишние пустые строки. Всё это ело
контекст LLM и засоряло правую панель отладки.

- services/text_cleanup: clean_markdown_text — удаляет навигационные
  строки, строки-только-ссылки (обычно это меню), служебные _Источник:_,
  раскрывает инлайн-ссылки [x](url) → x, сжимает 3+ переносов до 2.
- services/document_processor: process_document теперь возвращает
  (id, raw_text, sections, chunks); чистку применяем к заголовкам и
  телам секций; чанки короче 20 символов выбрасываем с пересчётом
  индексов. Вспомогательная rechunk_raw_text — для переиндексации.

Чтобы переиндексировать без повторной загрузки файла, нужен исходный
текст. Вводим отдельный слой:
- новая таблица SQLite documents (id, name, file_type, raw_text,
  created_at, updated_at) + миграция Alembic 7ee7296ccd6d.
- db/models/Document + регистрация в db.models.__init__.
- services/document_service: save/get/list/delete для raw_text.
- routers/documents.upload: сохраняет raw_text в SQLite перед
  индексацией в Chroma; delete убирает и из SQLite, и из Chroma.
- Новые эндпоинты POST /documents/{id}/reindex и
  POST /documents/reindex-all — берут raw_text из SQLite, пропускают
  через rechunk_raw_text, заменяют чанки в Chroma.

Существующие 4 документа были перезалиты вручную (решение: не делать
одноразовый backfill, проще залить заново). Старая Chroma очищена,
новые чанки прошли через чистку — мусор ушёл.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 db/models/__init__.py                         |   3 +-
 db/models/document.py                         |  24 ++++
 ...6ccd6d_add_documents_table_for_raw_text.py |  38 ++++++
 routers/documents.py                          | 109 +++++++++++++++++-
 services/document_processor.py                |  64 +++++++++-
 services/document_service.py                  |  50 ++++++++
 services/text_cleanup.py                      |  39 +++++++
 7 files changed, 316 insertions(+), 11 deletions(-)
 create mode 100644 db/models/document.py
 create mode 100644 migrations/versions/7ee7296ccd6d_add_documents_table_for_raw_text.py
 create mode 100644 services/document_service.py
 create mode 100644 services/text_cleanup.py

diff --git a/db/models/__init__.py b/db/models/__init__.py
index ee77c53..5697071 100644
--- a/db/models/__init__.py
+++ b/db/models/__init__.py
@@ -1,4 +1,5 @@
+from db.models.document import Document
 from db.models.message import Message
 from db.models.thread import Thread
 
-__all__ = ["Thread", "Message"]
+__all__ = ["Thread", "Message", "Document"]
diff --git a/db/models/document.py b/db/models/document.py
new file mode 100644
index 0000000..17bd416
--- /dev/null
+++ b/db/models/document.py
@@ -0,0 +1,24 @@
+from datetime import datetime, timezone
+
+from sqlalchemy import DateTime, String, Text
+from sqlalchemy.orm import Mapped, mapped_column
+
+from db.base import Base
+
+
+def _utcnow() -> datetime:
+    return datetime.now(timezone.utc)  # timezone-aware current time in UTC; default for created_at/updated_at
+
+
+class Document(Base):
+    """Raw document text, kept so the document can be re-indexed with updated chunker rules."""
+    __tablename__ = "documents"
+
+    id: Mapped[str] = mapped_column(String(36), primary_key=True)  # UUID generated by process_document
+    name: Mapped[str] = mapped_column(String(500), nullable=False)
+    file_type: Mapped[str] = mapped_column(String(20), nullable=False)
+    raw_text: Mapped[str] = mapped_column(Text, nullable=False)
+    created_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), default=_utcnow, nullable=False)
+    updated_at: Mapped[datetime] = mapped_column(
+        DateTime(timezone=True), default=_utcnow, onupdate=_utcnow, nullable=False
+    )
diff --git a/migrations/versions/7ee7296ccd6d_add_documents_table_for_raw_text.py b/migrations/versions/7ee7296ccd6d_add_documents_table_for_raw_text.py
new file mode 100644
index 0000000..10a5098
--- /dev/null
+++ b/migrations/versions/7ee7296ccd6d_add_documents_table_for_raw_text.py
@@ -0,0 +1,38 @@
+"""add documents table for raw_text
+
+Revision ID: 7ee7296ccd6d
+Revises: e7199587be4b
+Create Date: 2026-04-23 10:56:33.955742
+
+"""
+from typing import Sequence, Union
+
+from alembic import op
+import sqlalchemy as sa
+
+
+# revision identifiers, used by Alembic.
+revision: str = '7ee7296ccd6d'
+down_revision: Union[str, None] = 'e7199587be4b'
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+def upgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.create_table('documents',  # one row per document; raw_text holds the text kept for re-indexing
+    sa.Column('id', sa.String(length=36), nullable=False),
+    sa.Column('name', sa.String(length=500), nullable=False),
+    sa.Column('file_type', sa.String(length=20), nullable=False),
+    sa.Column('raw_text', sa.Text(), nullable=False),
+    sa.Column('created_at', sa.DateTime(timezone=True), nullable=False),
+    sa.Column('updated_at', sa.DateTime(timezone=True), nullable=False),
+    sa.PrimaryKeyConstraint('id')
+    )
+    # ### end Alembic commands ###
+
+
+def downgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.drop_table('documents')  # undoes upgrade(); all stored raw_text rows are lost with the table
+    # ### end Alembic commands ###
diff --git a/routers/documents.py b/routers/documents.py
index d635738..a05874c 100644
--- a/routers/documents.py
+++ b/routers/documents.py
@@ -1,8 +1,10 @@
 import logging
 from datetime import datetime, timezone
 
-from fastapi import APIRouter, File, Form, HTTPException, UploadFile
+from fastapi import APIRouter, Depends, File, Form, HTTPException, UploadFile
+from sqlalchemy.ext.asyncio import AsyncSession
 
+from db.session import get_session
 from models.responses import (
     ChunkDetail,
     ChunkPreview,
@@ -12,7 +14,8 @@ from models.responses import (
     DocumentListResponse,
     DocumentUploadResponse,
 )
-from services.document_processor import process_document
+from services import document_service
+from services.document_processor import process_document, rechunk_raw_text
 
 logger = logging.getLogger(__name__)
 
@@ -26,6 +29,7 @@ MAX_FILE_SIZE = 50 * 1024 * 1024  # 50 MB
 async def upload_document(
     file: UploadFile = File(...),
     document_name: str | None = Form(None),
+    session: AsyncSession = Depends(get_session),
 ):
     from main import vectorstore_service
 
@@ -48,7 +52,7 @@ async def upload_document(
 
     display_name = document_name or filename
     try:
-        document_id, sections, chunks = process_document(file_bytes, filename)
+        document_id, raw_text, sections, chunks = process_document(file_bytes, filename)
     except ValueError as e:
         raise HTTPException(status_code=400, detail=str(e))
     except Exception as e:
@@ -59,6 +63,13 @@ async def upload_document(
         raise HTTPException(status_code=400, detail="No content could be extracted from the document")
 
     file_type = ext.lstrip(".")
+    await document_service.save_document_raw(
+        session=session,
+        document_id=document_id,
+        name=display_name,
+        file_type=file_type,
+        raw_text=raw_text,
+    )
     chunks_count = vectorstore_service.add_document(
         document_id=document_id,
         document_name=display_name,
@@ -144,14 +155,104 @@ async def get_document_chunks(document_id: str):
 
 
 @router.delete("/{document_id}", response_model=DocumentDeleteResponse)
-async def delete_document(document_id: str):
+async def delete_document(document_id: str, session: AsyncSession = Depends(get_session)):
     from main import vectorstore_service
 
     if vectorstore_service is None:
         raise HTTPException(status_code=503, detail="Service not ready")
 
     deleted = vectorstore_service.delete_document(document_id)
+    await document_service.delete_document_raw(session, document_id)  # NOTE(review): runs before the 404 check below, so a raw_text row is removed even when the response is 404
+
     if deleted == 0:
         raise HTTPException(status_code=404, detail="Document not found")
 
     return DocumentDeleteResponse(ok=True, deleted_chunks=deleted)
+
+
+@router.post("/{document_id}/reindex", response_model=DocumentUploadResponse)
+async def reindex_document(document_id: str, session: AsyncSession = Depends(get_session)):
+    """Re-chunk one document with the current chunker rules, using its stored raw_text."""
+    from main import vectorstore_service
+
+    if vectorstore_service is None:
+        raise HTTPException(status_code=503, detail="Service not ready")
+
+    doc = await document_service.get_document_raw(session, document_id)
+    if doc is None:
+        raise HTTPException(status_code=404, detail="Document raw_text not found — reindex невозможен")
+
+    chunks = rechunk_raw_text(doc.raw_text)
+    if not chunks:
+        raise HTTPException(status_code=400, detail="После переразметки не осталось чанков")
+
+    vectorstore_service.delete_document(document_id)  # NOTE(review): old chunks are removed before the new ones are written; if add_document raises, the document is left without chunks
+    chunks_count = vectorstore_service.add_document(
+        document_id=document_id,
+        document_name=doc.name,
+        file_type=doc.file_type,
+        chunks=[
+            {
+                "text": c.text,
+                "section": c.section,
+                "page_number": c.page_number,
+                "chunk_index": c.chunk_index,
+            }
+            for c in chunks
+        ],
+    )
+
+    chunks_prev = [
+        ChunkPreview(
+            index=c.chunk_index,
+            section=c.section,
+            page_number=c.page_number,
+            text_preview=c.text[:300],
+            char_length=len(c.text),
+        )
+        for c in chunks[:3]
+    ]
+
+    return DocumentUploadResponse(
+        document_id=document_id,
+        name=doc.name,
+        chunks_count=chunks_count,
+        status="reindexed",
+        created_at=datetime.now(timezone.utc).isoformat(),
+        chunks_preview=chunks_prev,
+    )
+
+
+@router.post("/reindex-all")
+async def reindex_all(session: AsyncSession = Depends(get_session)):
+    """Re-chunk every document that has a raw_text row stored in SQLite."""
+    from main import vectorstore_service
+
+    if vectorstore_service is None:
+        raise HTTPException(status_code=503, detail="Service not ready")
+
+    docs = await document_service.list_documents_raw(session)
+    results = []
+    for doc in docs:
+        chunks = rechunk_raw_text(doc.raw_text)
+        if not chunks:
+            results.append({"document_id": doc.id, "name": doc.name, "status": "empty"})  # nothing left after re-chunking: existing chunks are left untouched
+            continue
+        vectorstore_service.delete_document(doc.id)  # NOTE(review): delete-then-add is not atomic; a failure in add_document leaves this document without chunks
+        n = vectorstore_service.add_document(
+            document_id=doc.id,
+            document_name=doc.name,
+            file_type=doc.file_type,
+            chunks=[
+                {
+                    "text": c.text,
+                    "section": c.section,
+                    "page_number": c.page_number,
+                    "chunk_index": c.chunk_index,
+                }
+                for c in chunks
+            ],
+        )
+        results.append({"document_id": doc.id, "name": doc.name, "status": "reindexed", "chunks_count": n})
+
+    return {"total": len(results), "results": results}
diff --git a/services/document_processor.py b/services/document_processor.py
index 2659a05..89cedc6 100644
--- a/services/document_processor.py
+++ b/services/document_processor.py
@@ -9,6 +9,9 @@ import fitz  # pymupdf
 from docx import Document as DocxDocument
 
 from config import settings
+from services.text_cleanup import clean_markdown_text
+
+MIN_CHUNK_TEXT_LENGTH = 20  # chunks shorter than this are dropped (usually leftovers after cleanup)
 
 logger = logging.getLogger(__name__)
 
@@ -276,25 +279,74 @@ def chunk_sections(
 # --- Main processor ---
 
 
-def process_document(file_bytes: bytes, filename: str) -> tuple[str, list[ParsedSection], list[Chunk]]:
+def _sections_to_markdown(sections: list[ParsedSection]) -> str:
+    """Assemble sections into markdown-like text; used as raw_text for PDF/DOCX
+    so that re-indexing can feed it through parse_text again."""
+    parts = []
+    for s in sections:
+        if s.heading:
+            parts.append(f"{'#' * max(1, s.heading_level)} {s.heading}")
+        if s.body:
+            parts.append(s.body)
+    return "\n\n".join(parts).strip()
+
+
+def process_document(
+    file_bytes: bytes, filename: str
+) -> tuple[str, str, list[ParsedSection], list[Chunk]]:
+    """Parse a document, strip markdown clutter from its sections, and split it into chunks.
+
+    Returns: (document_id, raw_text, sections, chunks)
+    raw_text is captured before cleanup: the decoded file for .md/.txt, or the parsed sections re-assembled as markdown for PDF/DOCX.
+    """
     document_id = str(uuid.uuid4())
     ext = Path(filename).suffix.lower()
 
     if ext == ".pdf":
         sections = parse_pdf(file_bytes)
+        raw_text = _sections_to_markdown(sections)
     elif ext in (".docx", ".doc"):
         sections = parse_docx(file_bytes)
+        raw_text = _sections_to_markdown(sections)
     elif ext == ".md":
-        sections = parse_text(file_bytes, is_markdown=True)
+        raw_text = file_bytes.decode("utf-8", errors="replace")
+        cleaned = clean_markdown_text(raw_text)
+        sections = parse_text(cleaned.encode("utf-8"), is_markdown=True)
     elif ext == ".txt":
-        sections = parse_text(file_bytes, is_markdown=False)
+        raw_text = file_bytes.decode("utf-8", errors="replace")
+        sections = parse_text(raw_text.encode("utf-8"), is_markdown=False)
     else:
         raise ValueError(f"Unsupported file format: {ext}")
 
+    # Safety net: clean every section, even if the source text was already cleaned above.
+    for s in sections:
+        s.heading = clean_markdown_text(s.heading) if s.heading else ""
+        s.body = clean_markdown_text(s.body)
+    sections = [s for s in sections if s.heading or s.body.strip()]
+
     if not sections:
         logger.warning("No sections found in %s", filename)
-        return document_id, [], []
+        return document_id, raw_text, [], []
 
     chunks = chunk_sections(sections)
-    logger.info("Processed '%s': %d sections → %d chunks", filename, len(sections), len(chunks))
-    return document_id, sections, chunks
+    # Drop empty and very short leftover chunks, then renumber chunk_index consecutively.
+    chunks = [c for c in chunks if len(c.text.strip()) >= MIN_CHUNK_TEXT_LENGTH]
+    for i, c in enumerate(chunks):
+        c.chunk_index = i
+    logger.info("Processed '%s': %d sections → %d chunks (cleaned)", filename, len(sections), len(chunks))
+    return document_id, raw_text, sections, chunks
+
+
+def rechunk_raw_text(raw_text: str) -> list[Chunk]:
+    """For re-indexing: clean and chunk stored text using the current cleanup rules."""
+    cleaned = clean_markdown_text(raw_text)
+    sections = parse_text(cleaned.encode("utf-8"), is_markdown=True)  # NOTE(review): always markdown mode, while process_document parses .txt with is_markdown=False — confirm intended
+    for s in sections:
+        s.heading = clean_markdown_text(s.heading) if s.heading else ""
+        s.body = clean_markdown_text(s.body)
+    sections = [s for s in sections if s.heading or s.body.strip()]
+    chunks = chunk_sections(sections)
+    chunks = [c for c in chunks if len(c.text.strip()) >= MIN_CHUNK_TEXT_LENGTH]
+    for i, c in enumerate(chunks):
+        c.chunk_index = i
+    return chunks
diff --git a/services/document_service.py b/services/document_service.py
new file mode 100644
index 0000000..a91d8c3
--- /dev/null
+++ b/services/document_service.py
@@ -0,0 +1,50 @@
+"""SQLite layer for the raw text of documents, kept for re-indexing."""
+import logging
+
+from sqlalchemy import select
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from db.models import Document
+
+logger = logging.getLogger(__name__)
+
+
+async def save_document_raw(
+    session: AsyncSession,
+    document_id: str,
+    name: str,
+    file_type: str,
+    raw_text: str,
+) -> None:
+    """Save (or overwrite) the source text of a document in SQLite and commit."""
+    existing = await session.get(Document, document_id)
+    if existing:
+        existing.name = name
+        existing.file_type = file_type
+        existing.raw_text = raw_text
+    else:
+        session.add(Document(
+            id=document_id,
+            name=name,
+            file_type=file_type,
+            raw_text=raw_text,
+        ))
+    await session.commit()
+
+
+async def get_document_raw(session: AsyncSession, document_id: str) -> Document | None:
+    return await session.get(Document, document_id)  # lookup by primary key; None when no such row
+
+
+async def list_documents_raw(session: AsyncSession) -> list[Document]:
+    stmt = select(Document).order_by(Document.created_at)  # all stored documents, ordered by created_at
+    return list((await session.execute(stmt)).scalars().all())
+
+
+async def delete_document_raw(session: AsyncSession, document_id: str) -> bool:
+    doc = await session.get(Document, document_id)
+    if doc is None:
+        return False  # no row with this id: nothing deleted, no commit issued
+    await session.delete(doc)
+    await session.commit()
+    return True  # row existed and its deletion was committed
diff --git a/services/text_cleanup.py b/services/text_cleanup.py
new file mode 100644
index 0000000..b5261f6
--- /dev/null
+++ b/services/text_cleanup.py
@@ -0,0 +1,39 @@
+"""Cleanup of wiki texts: removes navigational and service markdown clutter."""
+import re
+
+RETURN_TO_RE = re.compile(r"^\*\*\s*Вернуться на\s*:?\s*\*\*\s*:?$")  # whole-line "**Вернуться на:**" ("Back to:") header
+LINK_ONLY_RE = re.compile(r"^\[[^\]\n]+\]\([^\)\n]+\)$")  # line that is exactly one markdown link [x](url)
+SOURCE_NOTE_RE = re.compile(r"^_Источник\s*:.*_$")  # whole-line "_Источник: ..._" ("Source: ...") note
+INLINE_LINK_RE = re.compile(r"\[([^\]\n]+)\]\([^\)\n]+\)")  # any [text](url); group 1 is the text. NOTE(review): for images ![alt](url) the leading "!" is left behind
+MULTI_BLANK_RE = re.compile(r"\n{3,}")  # run of 3+ consecutive newlines
+
+
+def clean_markdown_text(text: str) -> str:
+    """Remove navigation clutter and unwrap inline links.
+
+    Rules:
+    - A `**Вернуться на:**` ("Back to:") line is dropped.
+    - A line consisting solely of a markdown link `[x](url)` is dropped (it is navigation).
+    - A `_Источник: .../file.md_` ("Source: ...") line is dropped.
+    - Inline links `[text](url)` in the body are replaced with `text`.
+    - Runs of 3+ newlines are collapsed to 2.
+    """
+    if not text:
+        return ""
+
+    lines = text.split("\n")
+    cleaned_lines = []
+    for line in lines:
+        stripped = line.strip()  # the line patterns are anchored with ^...$, so match without surrounding whitespace
+        if RETURN_TO_RE.match(stripped):
+            continue
+        if LINK_ONLY_RE.match(stripped):
+            continue
+        if SOURCE_NOTE_RE.match(stripped):
+            continue
+        cleaned_lines.append(line)
+
+    text = "\n".join(cleaned_lines)
+    text = INLINE_LINK_RE.sub(r"\1", text)
+    text = MULTI_BLANK_RE.sub("\n\n", text)
+    return text.strip()