feat: Спринт 1 — RAG-ядро, загрузка wiki и Debug UI

FastAPI + ChromaDB + E5-large + DeepSeek по паттерну work-pcs-dr-cdss, адаптированному под пациентский контекст: - services: embeddings (E5-large с префиксами), vectorstore (коллекция operators_wiki), document_processor (PDF/DOCX/TXT/MD + чанкер с FAQ- паттерном под wiki), llm_client (системный промпт ассистента клиники), rag_pipeline (одиночный вопрос → retrieval → ответ). - routers: /health, /documents (upload, list, chunks, delete), /query. - static/index.html: шапка со статусом, блок базы знаний с раскрытием чанков по клику, блок тест-вопроса с 3-колоночным ответом (чанки со score / собранный промпт / ответ LLM). - Порт 8003 (8001 занят CDSS, 8002 — voicenote). E2E проверен: загрузка wiki_test.md → 2 чанка, вопрос «как записать ребёнка к лору?» → top score 84.8%, корректный ответ DeepSeek. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-22 14:57:34 +05:00
parent d1e7749605
commit a7f78d71b2
21 changed files with 1641 additions and 0 deletions
@@ -0,0 +1,300 @@
+import io
+import logging
+import re
+import uuid
+from dataclasses import dataclass
+from pathlib import Path
+
+import fitz  # pymupdf
+from docx import Document as DocxDocument
+
+from config import settings
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class ParsedSection:
+    heading: str
+    heading_level: int
+    body: str
+    page_number: int = 0
+
+
+@dataclass
+class Chunk:
+    text: str
+    section: str = ""
+    page_number: int = 0
+    chunk_index: int = 0
+
+
+# --- Parsers ---
+
+
+def parse_pdf(file_bytes: bytes) -> list[ParsedSection]:
+    doc = fitz.open(stream=file_bytes, filetype="pdf")
+    sections: list[ParsedSection] = []
+    current_heading = ""
+    current_body_lines: list[str] = []
+    current_page = 0
+
+    for page_num in range(len(doc)):
+        page = doc[page_num]
+        blocks = page.get_text("dict")["blocks"]
+
+        for block in blocks:
+            if "lines" not in block:
+                continue
+            for line in block["lines"]:
+                text = "".join(span["text"] for span in line["spans"]).strip()
+                if not text:
+                    continue
+
+                max_size = max(span["size"] for span in line["spans"])
+                is_bold = any("bold" in span["font"].lower() for span in line["spans"])
+
+                if (max_size >= 14 or (is_bold and max_size >= 12)) and len(text) < 200:
+                    if current_heading or current_body_lines:
+                        sections.append(ParsedSection(
+                            heading=current_heading,
+                            heading_level=1 if max_size >= 16 else 2,
+                            body="\n".join(current_body_lines).strip(),
+                            page_number=current_page,
+                        ))
+                    current_heading = text
+                    current_body_lines = []
+                    current_page = page_num + 1
+                else:
+                    current_body_lines.append(text)
+                    if not current_heading:
+                        current_page = page_num + 1
+
+    if current_heading or current_body_lines:
+        sections.append(ParsedSection(
+            heading=current_heading,
+            heading_level=2,
+            body="\n".join(current_body_lines).strip(),
+            page_number=current_page,
+        ))
+
+    doc.close()
+    return sections
+
+
+def parse_docx(file_bytes: bytes) -> list[ParsedSection]:
+    doc = DocxDocument(io.BytesIO(file_bytes))
+    sections: list[ParsedSection] = []
+    current_heading = ""
+    current_level = 0
+    current_body_lines: list[str] = []
+
+    for para in doc.paragraphs:
+        text = para.text.strip()
+        if not text:
+            continue
+
+        style_name = (para.style.name or "").lower()
+
+        if "heading" in style_name or "title" in style_name:
+            if current_heading or current_body_lines:
+                sections.append(ParsedSection(
+                    heading=current_heading,
+                    heading_level=current_level or 1,
+                    body="\n".join(current_body_lines).strip(),
+                ))
+            level_match = re.search(r"\d+", style_name)
+            current_level = int(level_match.group()) if level_match else 1
+            current_heading = text
+            current_body_lines = []
+        else:
+            current_body_lines.append(text)
+
+    if current_heading or current_body_lines:
+        sections.append(ParsedSection(
+            heading=current_heading,
+            heading_level=current_level or 1,
+            body="\n".join(current_body_lines).strip(),
+        ))
+
+    return sections
+
+
+def parse_text(file_bytes: bytes, is_markdown: bool = False) -> list[ParsedSection]:
+    """Parse wiki-style TXT/MD.
+
+    Эвристики под wiki операторов:
+    - markdown-заголовки (#, ##, ...)
+    - нумерованные пункты «1.», «1.1.», «1.1.1.»
+    - FAQ-паттерн «В:» / «Вопрос:» — воспринимаем как начало новой секции
+    - ALL-CAPS строки (короткие) — заголовок
+    """
+    text = file_bytes.decode("utf-8", errors="replace")
+    lines = text.split("\n")
+    sections: list[ParsedSection] = []
+    current_heading = ""
+    current_level = 0
+    current_body_lines: list[str] = []
+
+    md_heading_re = re.compile(r"^(#{1,6})\s+(.+)")
+    numbered_heading_re = re.compile(r"^(\d+(?:\.\d+)*\.?)\s+([А-ЯЁA-Z].*)")
+    faq_question_re = re.compile(r"^(В|Вопрос|Q|Question)\s*[:\.]\s*(.+)", re.IGNORECASE)
+
+    for line in lines:
+        stripped = line.strip()
+
+        heading_text = None
+        heading_level = 0
+
+        md_match = md_heading_re.match(stripped)
+        if md_match:
+            heading_level = len(md_match.group(1))
+            heading_text = md_match.group(2).strip()
+
+        if not heading_text:
+            num_match = numbered_heading_re.match(stripped)
+            if num_match and len(stripped) < 200:
+                dots = num_match.group(1).count(".")
+                heading_level = max(1, dots + 1)
+                heading_text = stripped
+
+        if not heading_text:
+            faq_match = faq_question_re.match(stripped)
+            if faq_match and len(stripped) < 300:
+                heading_text = faq_match.group(2).strip()
+                heading_level = 3
+
+        if not heading_text and stripped.isupper() and 3 < len(stripped) < 200:
+            heading_text = stripped
+            heading_level = 1
+
+        if heading_text:
+            if current_heading or current_body_lines:
+                sections.append(ParsedSection(
+                    heading=current_heading,
+                    heading_level=current_level or 1,
+                    body="\n".join(current_body_lines).strip(),
+                ))
+            current_heading = heading_text
+            current_level = heading_level
+            current_body_lines = []
+        else:
+            current_body_lines.append(line)
+
+    if current_heading or current_body_lines:
+        sections.append(ParsedSection(
+            heading=current_heading,
+            heading_level=current_level or 1,
+            body="\n".join(current_body_lines).strip(),
+        ))
+
+    return sections
+
+
+# --- Chunker ---
+
+
+def _split_sentences(text: str) -> list[str]:
+    sentences = re.split(r"(?<=[.!?])\s+", text)
+    return [s.strip() for s in sentences if s.strip()]
+
+
+def chunk_sections(
+    sections: list[ParsedSection],
+    max_chunk_size: int | None = None,
+    min_chunk_size: int | None = None,
+    overlap_sentences: int | None = None,
+) -> list[Chunk]:
+    """Чанкинг wiki-секций.
+
+    - Малые секции (FAQ-ответы) держим целиком — один чанк = одна тема.
+    - Большие секции (регламенты) режем по абзацам, с overlap последних N предложений.
+    - Мелкие соседние секции склеиваем, чтобы не плодить огрызки.
+    """
+    max_size = max_chunk_size or settings.max_chunk_size
+    min_size = min_chunk_size or settings.min_chunk_size
+    overlap = overlap_sentences or settings.overlap_sentences
+
+    raw_chunks: list[Chunk] = []
+
+    for section in sections:
+        heading_prefix = f"{section.heading}\n\n" if section.heading else ""
+        full_text = heading_prefix + section.body
+
+        if len(full_text) <= max_size:
+            raw_chunks.append(Chunk(
+                text=full_text.strip(),
+                section=section.heading,
+                page_number=section.page_number,
+            ))
+        else:
+            paragraphs = section.body.split("\n")
+            current_text = heading_prefix
+            for para in paragraphs:
+                if len(current_text) + len(para) + 1 > max_size and len(current_text) > len(heading_prefix):
+                    raw_chunks.append(Chunk(
+                        text=current_text.strip(),
+                        section=section.heading,
+                        page_number=section.page_number,
+                    ))
+                    current_text = heading_prefix + para + "\n"
+                else:
+                    current_text += para + "\n"
+            if current_text.strip() and current_text.strip() != heading_prefix.strip():
+                raw_chunks.append(Chunk(
+                    text=current_text.strip(),
+                    section=section.heading,
+                    page_number=section.page_number,
+                ))
+
+    merged: list[Chunk] = []
+    for chunk in raw_chunks:
+        if merged and len(merged[-1].text) < min_size:
+            merged[-1].text += "\n\n" + chunk.text
+            if not merged[-1].section:
+                merged[-1].section = chunk.section
+        else:
+            merged.append(Chunk(
+                text=chunk.text,
+                section=chunk.section,
+                page_number=chunk.page_number,
+            ))
+
+    final: list[Chunk] = []
+    for i, chunk in enumerate(merged):
+        if i > 0 and overlap > 0:
+            prev_sentences = _split_sentences(merged[i - 1].text)
+            overlap_text = " ".join(prev_sentences[-overlap:])
+            if overlap_text and overlap_text not in chunk.text:
+                chunk.text = overlap_text + "\n\n" + chunk.text
+        chunk.chunk_index = i
+        final.append(chunk)
+
+    return final
+
+
+# --- Main processor ---
+
+
+def process_document(file_bytes: bytes, filename: str) -> tuple[str, list[ParsedSection], list[Chunk]]:
+    document_id = str(uuid.uuid4())
+    ext = Path(filename).suffix.lower()
+
+    if ext == ".pdf":
+        sections = parse_pdf(file_bytes)
+    elif ext in (".docx", ".doc"):
+        sections = parse_docx(file_bytes)
+    elif ext == ".md":
+        sections = parse_text(file_bytes, is_markdown=True)
+    elif ext == ".txt":
+        sections = parse_text(file_bytes, is_markdown=False)
+    else:
+        raise ValueError(f"Unsupported file format: {ext}")
+
+    if not sections:
+        logger.warning("No sections found in %s", filename)
+        return document_id, [], []
+
+    chunks = chunk_sections(sections)
+    logger.info("Processed '%s': %d sections → %d chunks", filename, len(sections), len(chunks))
+    return document_id, sections, chunks
@@ -0,0 +1,22 @@
+import logging
+
+from sentence_transformers import SentenceTransformer
+
+logger = logging.getLogger(__name__)
+
+
+class EmbeddingService:
+    def __init__(self, model_name: str = "intfloat/multilingual-e5-large"):
+        logger.info("Loading embedding model: %s", model_name)
+        self.model = SentenceTransformer(model_name)
+        self.model_name = model_name
+        logger.info("Embedding model loaded (dim=%d)", self.model.get_sentence_embedding_dimension())
+
+    def embed_documents(self, texts: list[str]) -> list[list[float]]:
+        prefixed = [f"passage: {t}" for t in texts]
+        embeddings = self.model.encode(prefixed, normalize_embeddings=True, show_progress_bar=False)
+        return embeddings.tolist()
+
+    def embed_query(self, query: str) -> list[float]:
+        embedding = self.model.encode(f"query: {query}", normalize_embeddings=True)
+        return embedding.tolist()
@@ -0,0 +1,104 @@
+import logging
+
+import httpx
+
+from config import settings
+
+logger = logging.getLogger(__name__)
+
+DEFAULT_SYSTEM_PROMPT = """Ты — виртуальный ассистент клиники, который первым отвечает пациентам в чате.
+
+Твоя задача — помочь пациенту по бытовым и организационным вопросам: запись, расписание врачей, подготовка к приёму, как проехать, документы, оплата, ДМС, детский приём и т. п.
+
+Правила:
+- Отвечай коротко, дружелюбно, на «вы», простым русским языком без медицинской латыни.
+- Опирайся ТОЛЬКО на предоставленные выдержки из базы знаний. Если ответа в них нет — честно скажи, что уточнишь у оператора, и предложи подключить оператора.
+- Не ставь диагнозы и не назначай лечение. Если вопрос про симптомы, лекарства, дозировки или «что со мной» — мягко предложи записаться к врачу и подключить оператора, если нужно.
+- Не выдумывай телефоны, адреса, цены, имена врачей, расписание. Только из источников.
+- Если пациент просит оператора — коротко подтверди, что сейчас его подключишь.
+- Источники указывать не нужно: пациент их не видит. Ответ — обычный текст, как в чате."""
+
+DEFAULT_USER_TEMPLATE = """Вопрос пациента:
+{question}
+
+Выдержки из базы знаний операторов:
+{sources}
+
+Ответь пациенту в чате по правилам из системного сообщения."""
+
+
+class LLMClient:
+    def __init__(
+        self,
+        api_key: str | None = None,
+        model: str | None = None,
+        base_url: str | None = None,
+    ):
+        self.api_key = api_key or settings.deepseek_api_key
+        self.model = model or settings.deepseek_model
+        self.base_url = (base_url or settings.deepseek_base_url).rstrip("/")
+
+    def _format_sources(self, sources: list[dict]) -> str:
+        if not sources:
+            return "(источники не найдены)"
+        lines = []
+        for i, src in enumerate(sources, 1):
+            meta = src.get("metadata", {})
+            doc_name = meta.get("document_name", "Документ")
+            section = meta.get("section", "")
+            lines.append(
+                f"[{i}] {src['text']}\n"
+                f"    (Источник: {doc_name}, раздел: {section})"
+            )
+        return "\n".join(lines)
+
+    async def answer(
+        self,
+        question: str,
+        sources: list[dict],
+        system_prompt: str | None = None,
+        temperature: float | None = None,
+        max_tokens: int | None = None,
+    ) -> dict:
+        """Generate a patient-facing answer using RAG context.
+
+        Returns dict with 'text' and 'assembled_prompt'.
+        """
+        effective_system = system_prompt or DEFAULT_SYSTEM_PROMPT
+        effective_temp = temperature if temperature is not None else 0.2
+        effective_max_tokens = max_tokens or 1200
+
+        formatted_sources = self._format_sources(sources)
+        user_message = DEFAULT_USER_TEMPLATE.format(
+            question=question,
+            sources=formatted_sources,
+        )
+
+        assembled_prompt = f"[SYSTEM]\n{effective_system}\n\n[USER]\n{user_message}"
+
+        url = f"{self.base_url}/chat/completions"
+        payload = {
+            "model": self.model,
+            "messages": [
+                {"role": "system", "content": effective_system},
+                {"role": "user", "content": user_message},
+            ],
+            "temperature": effective_temp,
+            "max_tokens": effective_max_tokens,
+        }
+
+        async with httpx.AsyncClient(timeout=60.0) as client:
+            response = await client.post(
+                url,
+                json=payload,
+                headers={
+                    "Authorization": f"Bearer {self.api_key}",
+                    "Content-Type": "application/json",
+                },
+            )
+            response.raise_for_status()
+            data = response.json()
+
+        content = data["choices"][0]["message"]["content"]
+        logger.info("LLM response: %d chars, model=%s, temp=%.2f", len(content), self.model, effective_temp)
+        return {"text": content.strip(), "assembled_prompt": assembled_prompt}
@@ -0,0 +1,52 @@
+import logging
+
+from services.llm_client import LLMClient
+from services.vectorstore import VectorStoreService
+
+logger = logging.getLogger(__name__)
+
+
+async def rag_query(
+    vectorstore: VectorStoreService,
+    llm_client: LLMClient,
+    question: str,
+    top_k: int = 5,
+    document_ids: list[str] | None = None,
+    temperature: float | None = None,
+    max_tokens: int | None = None,
+) -> dict:
+    """Pipeline: retrieve → augment → generate для одиночного вопроса пациента."""
+    logger.info("RAG query: %s", question[:200])
+
+    retrieved = vectorstore.query(
+        query_text=question,
+        top_k=top_k,
+        document_ids=document_ids,
+    )
+    logger.info("Retrieved %d chunks", len(retrieved))
+
+    llm_result = await llm_client.answer(
+        question=question,
+        sources=retrieved,
+        temperature=temperature,
+        max_tokens=max_tokens,
+    )
+
+    sources = []
+    for item in retrieved:
+        meta = item.get("metadata", {})
+        sources.append({
+            "document_id": meta.get("document_id", ""),
+            "document_name": meta.get("document_name", ""),
+            "chunk_text": item["text"][:500],
+            "section": meta.get("section", ""),
+            "page": meta.get("page_number", 0),
+            "relevance_score": round(item.get("relevance_score", 0), 3),
+        })
+
+    return {
+        "answer": llm_result["text"],
+        "sources": sources,
+        "model_used": llm_client.model,
+        "assembled_prompt": llm_result["assembled_prompt"],
+    }
@@ -0,0 +1,145 @@
+import logging
+from datetime import datetime, timezone
+
+import chromadb
+
+from services.embeddings import EmbeddingService
+
+logger = logging.getLogger(__name__)
+
+COLLECTION_NAME = "operators_wiki"
+
+
+class VectorStoreService:
+    def __init__(self, persist_dir: str, embedding_service: EmbeddingService):
+        self.client = chromadb.PersistentClient(path=persist_dir)
+        self.embedding_service = embedding_service
+        self.collection = self.client.get_or_create_collection(
+            name=COLLECTION_NAME,
+            metadata={"hnsw:space": "cosine"},
+        )
+        logger.info("ChromaDB collection '%s': %d items", COLLECTION_NAME, self.collection.count())
+
+    def add_document(
+        self,
+        document_id: str,
+        document_name: str,
+        file_type: str,
+        chunks: list[dict],
+    ) -> int:
+        if not chunks:
+            return 0
+
+        texts = [c["text"] for c in chunks]
+        embeddings = self.embedding_service.embed_documents(texts)
+
+        ids = []
+        metadatas = []
+        now = datetime.now(timezone.utc).isoformat()
+
+        for i, chunk in enumerate(chunks):
+            ids.append(f"{document_id}_chunk_{i}")
+            metadatas.append({
+                "document_id": document_id,
+                "document_name": document_name,
+                "file_type": file_type,
+                "section": chunk.get("section", ""),
+                "page_number": chunk.get("page_number", 0),
+                "chunk_index": i,
+                "created_at": now,
+            })
+
+        self.collection.add(
+            ids=ids,
+            embeddings=embeddings,
+            documents=texts,
+            metadatas=metadatas,
+        )
+        logger.info("Added %d chunks for document '%s'", len(chunks), document_name)
+        return len(chunks)
+
+    def query(
+        self,
+        query_text: str,
+        top_k: int = 5,
+        document_ids: list[str] | None = None,
+    ) -> list[dict]:
+        query_embedding = self.embedding_service.embed_query(query_text)
+
+        where_filter = None
+        if document_ids:
+            if len(document_ids) == 1:
+                where_filter = {"document_id": document_ids[0]}
+            else:
+                where_filter = {"document_id": {"$in": document_ids}}
+
+        results = self.collection.query(
+            query_embeddings=[query_embedding],
+            n_results=top_k,
+            where=where_filter,
+            include=["documents", "metadatas", "distances"],
+        )
+
+        items = []
+        if results["ids"] and results["ids"][0]:
+            for i, chunk_id in enumerate(results["ids"][0]):
+                items.append({
+                    "chunk_id": chunk_id,
+                    "text": results["documents"][0][i],
+                    "metadata": results["metadatas"][0][i],
+                    "distance": results["distances"][0][i],
+                    "relevance_score": 1 - results["distances"][0][i],
+                })
+        return items
+
+    def delete_document(self, document_id: str) -> int:
+        existing = self.collection.get(where={"document_id": document_id}, include=[])
+        count = len(existing["ids"])
+        if count > 0:
+            self.collection.delete(ids=existing["ids"])
+        logger.info("Deleted %d chunks for document_id=%s", count, document_id)
+        return count
+
+    def list_documents(self) -> list[dict]:
+        all_items = self.collection.get(include=["metadatas"])
+        docs: dict[str, dict] = {}
+        for meta in all_items["metadatas"]:
+            doc_id = meta["document_id"]
+            if doc_id not in docs:
+                docs[doc_id] = {
+                    "document_id": doc_id,
+                    "name": meta.get("document_name", ""),
+                    "file_type": meta.get("file_type", ""),
+                    "created_at": meta.get("created_at", ""),
+                    "chunks_count": 0,
+                    "metadata": {},
+                }
+            docs[doc_id]["chunks_count"] += 1
+        return list(docs.values())
+
+    def get_document_chunks(self, document_id: str) -> list[dict]:
+        """Return all chunks for a document, sorted by chunk_index."""
+        results = self.collection.get(
+            where={"document_id": document_id},
+            include=["documents", "metadatas"],
+        )
+        items = []
+        if results["ids"]:
+            for i, chunk_id in enumerate(results["ids"]):
+                items.append({
+                    "chunk_id": chunk_id,
+                    "text": results["documents"][i],
+                    "metadata": results["metadatas"][i],
+                })
+        items.sort(key=lambda x: x["metadata"].get("chunk_index", 0))
+        return items
+
+    def get_stats(self) -> dict:
+        all_items = self.collection.get(include=["metadatas"])
+        doc_ids = set()
+        for meta in all_items["metadatas"]:
+            doc_ids.add(meta.get("document_id", ""))
+        return {
+            "documents_count": len(doc_ids),
+            "chunks_count": self.collection.count(),
+        }