feat: Спринт 1 — RAG-ядро, загрузка wiki и Debug UI
FastAPI + ChromaDB + E5-large + DeepSeek по паттерну work-pcs-dr-cdss, адаптированному под пациентский контекст:
- services: embeddings (E5-large с префиксами), vectorstore (коллекция operators_wiki), document_processor (PDF/DOCX/TXT/MD + чанкер с FAQ-паттерном под wiki), llm_client (системный промпт ассистента клиники), rag_pipeline (одиночный вопрос → retrieval → ответ).
- routers: /health, /documents (upload, list, chunks, delete), /query.
- static/index.html: шапка со статусом, блок базы знаний с раскрытием чанков по клику, блок тест-вопроса с 3-колоночным ответом (чанки со score / собранный промпт / ответ LLM).
- Порт 8003 (8001 занят CDSS, 8002 — voicenote).

E2E проверен: загрузка wiki_test.md → 2 чанка, вопрос «как записать ребёнка к лору?» → top score 84.8%, корректный ответ DeepSeek.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,300 @@
|
||||
import io
|
||||
import logging
|
||||
import re
|
||||
import uuid
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
|
||||
import fitz # pymupdf
|
||||
from docx import Document as DocxDocument
|
||||
|
||||
from config import settings
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
class ParsedSection:
    """One logical section extracted from a source document by a parser."""

    heading: str  # heading text; "" for text that precedes the first heading
    heading_level: int  # nesting depth assigned by the parser (1 = top level)
    body: str  # section text, lines joined with "\n"
    page_number: int = 0  # set by parse_pdf (1-based); other parsers leave 0
|
||||
|
||||
|
||||
@dataclass
class Chunk:
    """A piece of text produced by chunk_sections."""

    text: str  # chunk content (may include the section heading and overlap text)
    section: str = ""  # heading of the section the chunk came from
    page_number: int = 0  # copied from the source section's page_number
    chunk_index: int = 0  # position in the list returned by chunk_sections
|
||||
|
||||
|
||||
# --- Parsers ---
|
||||
|
||||
|
||||
def parse_pdf(file_bytes: bytes) -> list[ParsedSection]:
    """Split a PDF into sections using font size and weight as heading cues.

    A line is treated as a heading when it is shorter than 200 characters and
    its largest span is at least 14pt, or at least 12pt with a bold font.
    Every other non-empty line is accumulated as body text of the current
    section.

    Args:
        file_bytes: Raw content of the PDF file.

    Returns:
        Sections in reading order. ``heading_level`` is 1 for headings of 16pt
        and above and 2 otherwise (also 2 for text that precedes the first
        heading). ``page_number`` is the 1-based page on which the section
        starts.
    """
    sections: list[ParsedSection] = []
    current_heading = ""
    current_level = 0  # level of the heading that opened the current section
    current_body_lines: list[str] = []
    current_page = 0

    def flush() -> None:
        # Emit the section collected so far, if there is anything to emit.
        if current_heading or current_body_lines:
            sections.append(ParsedSection(
                heading=current_heading,
                heading_level=current_level or 2,
                body="\n".join(current_body_lines).strip(),
                page_number=current_page,
            ))

    # The context manager closes the document even if text extraction raises.
    with fitz.open(stream=file_bytes, filetype="pdf") as doc:
        for page_number, page in enumerate(doc, start=1):
            for block in page.get_text("dict")["blocks"]:
                if "lines" not in block:  # block without text lines
                    continue
                for line in block["lines"]:
                    spans = line["spans"]
                    text = "".join(span["text"] for span in spans).strip()
                    if not text:
                        continue

                    max_size = max(span["size"] for span in spans)
                    is_bold = any("bold" in span["font"].lower() for span in spans)
                    looks_like_heading = max_size >= 14 or (is_bold and max_size >= 12)

                    if looks_like_heading and len(text) < 200:
                        flush()
                        current_heading = text
                        # The level belongs to THIS heading; it is stored so the
                        # section is emitted with its own level, not with the
                        # level of whichever heading happens to follow it.
                        current_level = 1 if max_size >= 16 else 2
                        current_body_lines = []
                        current_page = page_number
                    else:
                        current_body_lines.append(text)
                        # Text before the first heading: remember the page where
                        # it starts, consistent with heading-led sections.
                        if not current_heading and not current_page:
                            current_page = page_number

    flush()
    return sections
|
||||
|
||||
|
||||
def parse_docx(file_bytes: bytes) -> list[ParsedSection]:
    """Split a DOCX file into sections at heading/title-styled paragraphs.

    A paragraph whose style name contains "heading" or "title"
    (case-insensitive) opens a new section; its level is the first number
    found in the style name, or 1 when there is none. Empty paragraphs are
    skipped. ``page_number`` is left at its default.

    Args:
        file_bytes: Raw content of the DOCX file.

    Returns:
        Sections in document order; text before the first heading gets an
        empty heading and level 1.
    """
    document = DocxDocument(io.BytesIO(file_bytes))
    result: list[ParsedSection] = []
    heading = ""
    level = 0
    body_lines: list[str] = []

    def emit() -> None:
        # Nothing collected yet -> nothing to emit.
        if not (heading or body_lines):
            return
        result.append(ParsedSection(
            heading=heading,
            heading_level=level or 1,
            body="\n".join(body_lines).strip(),
        ))

    for paragraph in document.paragraphs:
        content = paragraph.text.strip()
        if not content:
            continue

        style = (paragraph.style.name or "").lower()
        if "heading" not in style and "title" not in style:
            body_lines.append(content)
            continue

        # Heading paragraph: close the previous section and start a new one.
        emit()
        digits = re.search(r"\d+", style)
        level = int(digits.group()) if digits else 1
        heading = content
        body_lines = []

    emit()
    return result
|
||||
|
||||
|
||||
def parse_text(file_bytes: bytes, is_markdown: bool = False) -> list[ParsedSection]:
    """Parse wiki-style TXT/MD content into sections.

    Heuristics tuned for the operators' wiki. A line opens a new section when
    it is, in order of precedence:

    - a markdown heading (``#``, ``##``, ...) -- level = number of ``#``;
    - a numbered item such as "1.", "1.1.", "1.1.1." followed by a word that
      starts with an uppercase Cyrillic/Latin letter (line shorter than 200
      characters) -- level = number of numeric components;
    - an FAQ question prefixed with "В", "Вопрос", "Q" or "Question" and a
      colon or dot (line shorter than 300 characters) -- level 3, the prefix
      is dropped from the heading;
    - an ALL-CAPS line of 4..199 characters -- level 1.

    Args:
        file_bytes: Raw file content. Decoded as UTF-8; a leading BOM is
            dropped and undecodable bytes are replaced.
        is_markdown: Not used by the implementation -- the same heuristics are
            applied to TXT and MD. Kept so existing callers keep working.

    Returns:
        Sections in document order; text before the first heading gets an
        empty heading and level 1. ``page_number`` is left at its default.
    """
    # "utf-8-sig" strips a leading BOM; with plain "utf-8" the BOM stays glued
    # to the first line and prevents it from matching any heading pattern.
    text = file_bytes.decode("utf-8-sig", errors="replace")
    # Normalise CRLF / CR line endings so body lines carry no stray "\r".
    lines = text.replace("\r\n", "\n").replace("\r", "\n").split("\n")

    sections: list[ParsedSection] = []
    current_heading = ""
    current_level = 0
    current_body_lines: list[str] = []

    md_heading_re = re.compile(r"^(#{1,6})\s+(.+)")
    numbered_heading_re = re.compile(r"^(\d+(?:\.\d+)*\.?)\s+([А-ЯЁA-Z].*)")
    faq_question_re = re.compile(r"^(В|Вопрос|Q|Question)\s*[:\.]\s*(.+)", re.IGNORECASE)

    def flush() -> None:
        # Emit the section collected so far, if there is anything to emit.
        if current_heading or current_body_lines:
            sections.append(ParsedSection(
                heading=current_heading,
                heading_level=current_level or 1,
                body="\n".join(current_body_lines).strip(),
            ))

    for line in lines:
        stripped = line.strip()

        heading_text = None
        heading_level = 0

        md_match = md_heading_re.match(stripped)
        if md_match:
            heading_level = len(md_match.group(1))
            heading_text = md_match.group(2).strip()

        if not heading_text:
            num_match = numbered_heading_re.match(stripped)
            if num_match and len(stripped) < 200:
                # Depth = number of numeric components. The optional trailing
                # dot is removed first so that "1." is level 1 and both "1.1"
                # and "1.1." are level 2.
                heading_level = num_match.group(1).rstrip(".").count(".") + 1
                heading_text = stripped

        if not heading_text:
            faq_match = faq_question_re.match(stripped)
            if faq_match and len(stripped) < 300:
                heading_text = faq_match.group(2).strip()
                heading_level = 3

        if not heading_text and stripped.isupper() and 3 < len(stripped) < 200:
            heading_text = stripped
            heading_level = 1

        if heading_text:
            flush()
            current_heading = heading_text
            current_level = heading_level
            current_body_lines = []
        else:
            # Keep the unstripped line so indentation inside the body survives.
            current_body_lines.append(line)

    flush()
    return sections
|
||||
|
||||
|
||||
# --- Chunker ---
|
||||
|
||||
|
||||
def _split_sentences(text: str) -> list[str]:
|
||||
sentences = re.split(r"(?<=[.!?])\s+", text)
|
||||
return [s.strip() for s in sentences if s.strip()]
|
||||
|
||||
|
||||
def chunk_sections(
    sections: list[ParsedSection],
    max_chunk_size: int | None = None,
    min_chunk_size: int | None = None,
    overlap_sentences: int | None = None,
) -> list[Chunk]:
    """Turn parsed wiki sections into chunks.

    - Small sections (FAQ answers) are kept whole: one chunk = one topic.
    - Large sections (procedures) are cut at line breaks of the body; every
      piece is prefixed with the section heading.
    - A chunk shorter than ``min_chunk_size`` absorbs the chunk that follows
      it, so tiny fragments do not pile up.
    - Every chunk except the first is prefixed with the last
      ``overlap_sentences`` sentences of the chunk before it.

    Args:
        sections: Parsed sections, in document order.
        max_chunk_size: Size limit (characters) above which a section is
            split. ``None`` or 0 falls back to ``settings.max_chunk_size``.
        min_chunk_size: Chunks shorter than this are merged with the next
            one. ``None`` falls back to ``settings.min_chunk_size``; 0
            disables merging.
        overlap_sentences: Number of trailing sentences carried over to the
            next chunk. ``None`` falls back to ``settings.overlap_sentences``;
            0 disables the overlap.

    Returns:
        Chunks in document order with ``chunk_index`` set to their position.
        Sections without any text produce no chunk.
    """
    # A limit of 0 cannot be honoured, so it falls back to the setting.
    max_size = max_chunk_size or settings.max_chunk_size
    # 0 is a meaningful value for these two ("off"), so only None falls back.
    min_size = settings.min_chunk_size if min_chunk_size is None else min_chunk_size
    overlap = settings.overlap_sentences if overlap_sentences is None else overlap_sentences

    raw_chunks: list[Chunk] = []

    def add_raw(text: str, section: ParsedSection) -> None:
        # Empty chunks would be embedded as empty strings and, after merging,
        # leave a stray leading "\n\n" on the next chunk -- drop them here.
        text = text.strip()
        if text:
            raw_chunks.append(Chunk(
                text=text,
                section=section.heading,
                page_number=section.page_number,
            ))

    for section in sections:
        heading_prefix = f"{section.heading}\n\n" if section.heading else ""
        full_text = heading_prefix + section.body

        if len(full_text) <= max_size:
            add_raw(full_text, section)
            continue

        current_text = heading_prefix
        for para in section.body.split("\n"):
            overflows = len(current_text) + len(para) + 1 > max_size
            # Only cut when the current piece already holds something besides
            # the heading; a single over-long line is kept as one chunk.
            if overflows and len(current_text) > len(heading_prefix):
                add_raw(current_text, section)
                current_text = heading_prefix + para + "\n"
            else:
                current_text += para + "\n"
        if current_text.strip() != heading_prefix.strip():
            add_raw(current_text, section)

    merged: list[Chunk] = []
    for chunk in raw_chunks:
        if merged and len(merged[-1].text) < min_size:
            merged[-1].text += "\n\n" + chunk.text
            if not merged[-1].section:
                merged[-1].section = chunk.section
        else:
            merged.append(Chunk(
                text=chunk.text,
                section=chunk.section,
                page_number=chunk.page_number,
            ))

    # Snapshot the texts before adding overlaps: the overlap must come from the
    # previous chunk's own content, not from the overlap it has just received.
    own_texts = [chunk.text for chunk in merged]
    for i, chunk in enumerate(merged):
        if i > 0 and overlap > 0:
            overlap_text = " ".join(_split_sentences(own_texts[i - 1])[-overlap:])
            if overlap_text and overlap_text not in chunk.text:
                chunk.text = overlap_text + "\n\n" + chunk.text
        chunk.chunk_index = i

    return merged
|
||||
|
||||
|
||||
# --- Main processor ---
|
||||
|
||||
|
||||
def process_document(file_bytes: bytes, filename: str) -> tuple[str, list[ParsedSection], list[Chunk]]:
    """Parse an uploaded file and chunk it.

    The parser is chosen by the (lower-cased) file extension: ``.pdf``,
    ``.docx``/``.doc``, ``.md`` or ``.txt``.

    Args:
        file_bytes: Raw file content.
        filename: Original file name; only its extension is inspected.

    Returns:
        ``(document_id, sections, chunks)`` where ``document_id`` is a freshly
        generated UUID4 string. Both lists are empty when the parser finds no
        sections.

    Raises:
        ValueError: If the extension is not one of the supported ones.
    """
    document_id = str(uuid.uuid4())
    suffix = Path(filename).suffix.lower()

    # NOTE(review): ".doc" is routed to the python-docx based parser; whether
    # legacy binary .doc files open correctly there is not visible from here.
    parsers = {
        ".pdf": parse_pdf,
        ".docx": parse_docx,
        ".doc": parse_docx,
        ".md": lambda data: parse_text(data, is_markdown=True),
        ".txt": lambda data: parse_text(data, is_markdown=False),
    }
    parser = parsers.get(suffix)
    if parser is None:
        raise ValueError(f"Unsupported file format: {suffix}")

    sections = parser(file_bytes)
    if not sections:
        logger.warning("No sections found in %s", filename)
        return document_id, [], []

    chunks = chunk_sections(sections)
    logger.info("Processed '%s': %d sections → %d chunks", filename, len(sections), len(chunks))
    return document_id, sections, chunks
|
||||
@@ -0,0 +1,22 @@
|
||||
import logging
|
||||
|
||||
from sentence_transformers import SentenceTransformer
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class EmbeddingService:
    """Wraps a SentenceTransformer model and adds the "passage: " / "query: " input prefixes."""

    def __init__(self, model_name: str = "intfloat/multilingual-e5-large"):
        """Load the model named ``model_name`` and log its embedding dimension."""
        logger.info("Loading embedding model: %s", model_name)
        self.model = SentenceTransformer(model_name)
        self.model_name = model_name
        dimension = self.model.get_sentence_embedding_dimension()
        logger.info("Embedding model loaded (dim=%d)", dimension)

    def embed_documents(self, texts: list[str]) -> list[list[float]]:
        """Embed texts for indexing: each one is prefixed with "passage: ".

        Returns one normalized vector per input text, as plain Python lists.
        """
        vectors = self.model.encode(
            [f"passage: {text}" for text in texts],
            normalize_embeddings=True,
            show_progress_bar=False,
        )
        return vectors.tolist()

    def embed_query(self, query: str) -> list[float]:
        """Embed a search query: the text is prefixed with "query: ".

        Returns a single normalized vector as a plain Python list.
        """
        prefixed_query = f"query: {query}"
        return self.model.encode(prefixed_query, normalize_embeddings=True).tolist()
|
||||
@@ -0,0 +1,104 @@
|
||||
import logging
|
||||
|
||||
import httpx
|
||||
|
||||
from config import settings
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
DEFAULT_SYSTEM_PROMPT = """Ты — виртуальный ассистент клиники, который первым отвечает пациентам в чате.
|
||||
|
||||
Твоя задача — помочь пациенту по бытовым и организационным вопросам: запись, расписание врачей, подготовка к приёму, как проехать, документы, оплата, ДМС, детский приём и т. п.
|
||||
|
||||
Правила:
|
||||
- Отвечай коротко, дружелюбно, на «вы», простым русским языком без медицинской латыни.
|
||||
- Опирайся ТОЛЬКО на предоставленные выдержки из базы знаний. Если ответа в них нет — честно скажи, что уточнишь у оператора, и предложи подключить оператора.
|
||||
- Не ставь диагнозы и не назначай лечение. Если вопрос про симптомы, лекарства, дозировки или «что со мной» — мягко предложи записаться к врачу и подключить оператора, если нужно.
|
||||
- Не выдумывай телефоны, адреса, цены, имена врачей, расписание. Только из источников.
|
||||
- Если пациент просит оператора — коротко подтверди, что сейчас его подключишь.
|
||||
- Источники указывать не нужно: пациент их не видит. Ответ — обычный текст, как в чате."""
|
||||
|
||||
DEFAULT_USER_TEMPLATE = """Вопрос пациента:
|
||||
{question}
|
||||
|
||||
Выдержки из базы знаний операторов:
|
||||
{sources}
|
||||
|
||||
Ответь пациенту в чате по правилам из системного сообщения."""
|
||||
|
||||
|
||||
class LLMClient:
|
||||
def __init__(
|
||||
self,
|
||||
api_key: str | None = None,
|
||||
model: str | None = None,
|
||||
base_url: str | None = None,
|
||||
):
|
||||
self.api_key = api_key or settings.deepseek_api_key
|
||||
self.model = model or settings.deepseek_model
|
||||
self.base_url = (base_url or settings.deepseek_base_url).rstrip("/")
|
||||
|
||||
def _format_sources(self, sources: list[dict]) -> str:
|
||||
if not sources:
|
||||
return "(источники не найдены)"
|
||||
lines = []
|
||||
for i, src in enumerate(sources, 1):
|
||||
meta = src.get("metadata", {})
|
||||
doc_name = meta.get("document_name", "Документ")
|
||||
section = meta.get("section", "")
|
||||
lines.append(
|
||||
f"[{i}] {src['text']}\n"
|
||||
f" (Источник: {doc_name}, раздел: {section})"
|
||||
)
|
||||
return "\n".join(lines)
|
||||
|
||||
async def answer(
|
||||
self,
|
||||
question: str,
|
||||
sources: list[dict],
|
||||
system_prompt: str | None = None,
|
||||
temperature: float | None = None,
|
||||
max_tokens: int | None = None,
|
||||
) -> dict:
|
||||
"""Generate a patient-facing answer using RAG context.
|
||||
|
||||
Returns dict with 'text' and 'assembled_prompt'.
|
||||
"""
|
||||
effective_system = system_prompt or DEFAULT_SYSTEM_PROMPT
|
||||
effective_temp = temperature if temperature is not None else 0.2
|
||||
effective_max_tokens = max_tokens or 1200
|
||||
|
||||
formatted_sources = self._format_sources(sources)
|
||||
user_message = DEFAULT_USER_TEMPLATE.format(
|
||||
question=question,
|
||||
sources=formatted_sources,
|
||||
)
|
||||
|
||||
assembled_prompt = f"[SYSTEM]\n{effective_system}\n\n[USER]\n{user_message}"
|
||||
|
||||
url = f"{self.base_url}/chat/completions"
|
||||
payload = {
|
||||
"model": self.model,
|
||||
"messages": [
|
||||
{"role": "system", "content": effective_system},
|
||||
{"role": "user", "content": user_message},
|
||||
],
|
||||
"temperature": effective_temp,
|
||||
"max_tokens": effective_max_tokens,
|
||||
}
|
||||
|
||||
async with httpx.AsyncClient(timeout=60.0) as client:
|
||||
response = await client.post(
|
||||
url,
|
||||
json=payload,
|
||||
headers={
|
||||
"Authorization": f"Bearer {self.api_key}",
|
||||
"Content-Type": "application/json",
|
||||
},
|
||||
)
|
||||
response.raise_for_status()
|
||||
data = response.json()
|
||||
|
||||
content = data["choices"][0]["message"]["content"]
|
||||
logger.info("LLM response: %d chars, model=%s, temp=%.2f", len(content), self.model, effective_temp)
|
||||
return {"text": content.strip(), "assembled_prompt": assembled_prompt}
|
||||
@@ -0,0 +1,52 @@
|
||||
import logging
|
||||
|
||||
from services.llm_client import LLMClient
|
||||
from services.vectorstore import VectorStoreService
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
async def rag_query(
    vectorstore: VectorStoreService,
    llm_client: LLMClient,
    question: str,
    top_k: int = 5,
    document_ids: list[str] | None = None,
    temperature: float | None = None,
    max_tokens: int | None = None,
) -> dict:
    """Pipeline: retrieve -> augment -> generate for a single patient question.

    Args:
        vectorstore: Store queried for the ``top_k`` most similar chunks.
        llm_client: Client that turns the question plus chunks into an answer.
        question: The patient's question.
        top_k: Number of chunks to retrieve.
        document_ids: Optional restriction of the search to these documents.
        temperature: Passed through to ``llm_client.answer``.
        max_tokens: Passed through to ``llm_client.answer``.

    Returns:
        Dict with ``answer``, ``sources`` (one summary per retrieved chunk,
        text cut to 500 characters, score rounded to 3 decimals),
        ``model_used`` and ``assembled_prompt``.
    """
    logger.info("RAG query: %s", question[:200])

    hits = vectorstore.query(query_text=question, top_k=top_k, document_ids=document_ids)
    logger.info("Retrieved %d chunks", len(hits))

    generation = await llm_client.answer(
        question=question,
        sources=hits,
        temperature=temperature,
        max_tokens=max_tokens,
    )

    def summarize(hit: dict) -> dict:
        # Compact, display-oriented view of one retrieved chunk.
        meta = hit.get("metadata", {})
        return {
            "document_id": meta.get("document_id", ""),
            "document_name": meta.get("document_name", ""),
            "chunk_text": hit["text"][:500],
            "section": meta.get("section", ""),
            "page": meta.get("page_number", 0),
            "relevance_score": round(hit.get("relevance_score", 0), 3),
        }

    sources = [summarize(hit) for hit in hits]

    return {
        "answer": generation["text"],
        "sources": sources,
        "model_used": llm_client.model,
        "assembled_prompt": generation["assembled_prompt"],
    }
|
||||
@@ -0,0 +1,145 @@
|
||||
import logging
|
||||
from datetime import datetime, timezone
|
||||
|
||||
import chromadb
|
||||
|
||||
from services.embeddings import EmbeddingService
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Name of the ChromaDB collection that VectorStoreService opens (or creates) in its constructor.
COLLECTION_NAME = "operators_wiki"
|
||||
|
||||
|
||||
class VectorStoreService:
    """Stores document chunks and their embeddings in a persistent ChromaDB collection."""

    def __init__(self, persist_dir: str, embedding_service: EmbeddingService):
        """Open (or create) the collection under ``persist_dir``, configured for cosine distance."""
        self.client = chromadb.PersistentClient(path=persist_dir)
        self.embedding_service = embedding_service
        self.collection = self.client.get_or_create_collection(
            name=COLLECTION_NAME,
            metadata={"hnsw:space": "cosine"},
        )
        logger.info("ChromaDB collection '%s': %d items", COLLECTION_NAME, self.collection.count())

    def add_document(
        self,
        document_id: str,
        document_name: str,
        file_type: str,
        chunks: list[dict],
    ) -> int:
        """Embed and store the chunks of one document.

        Each chunk dict must have ``"text"`` and may have ``"section"`` and
        ``"page_number"``. Chunk ids are ``"<document_id>_chunk_<i>"``.

        Returns:
            Number of chunks stored (0 when ``chunks`` is empty).
        """
        if not chunks:
            return 0

        texts = [chunk["text"] for chunk in chunks]
        vectors = self.embedding_service.embed_documents(texts)

        # One timestamp shared by all chunks of the document.
        created_at = datetime.now(timezone.utc).isoformat()
        ids = [f"{document_id}_chunk_{index}" for index in range(len(chunks))]
        metadatas = [
            {
                "document_id": document_id,
                "document_name": document_name,
                "file_type": file_type,
                "section": chunk.get("section", ""),
                "page_number": chunk.get("page_number", 0),
                "chunk_index": index,
                "created_at": created_at,
            }
            for index, chunk in enumerate(chunks)
        ]

        self.collection.add(
            ids=ids,
            embeddings=vectors,
            documents=texts,
            metadatas=metadatas,
        )
        logger.info("Added %d chunks for document '%s'", len(chunks), document_name)
        return len(chunks)

    def query(
        self,
        query_text: str,
        top_k: int = 5,
        document_ids: list[str] | None = None,
    ) -> list[dict]:
        """Return up to ``top_k`` chunks closest to ``query_text``.

        Args:
            query_text: Text to embed and search for.
            top_k: Number of results requested from the collection.
            document_ids: When given, only chunks of these documents are searched.

        Returns:
            Dicts with ``chunk_id``, ``text``, ``metadata``, ``distance`` and
            ``relevance_score`` (computed as ``1 - distance``).
        """
        query_vector = self.embedding_service.embed_query(query_text)

        if not document_ids:
            where_filter = None
        elif len(document_ids) == 1:
            where_filter = {"document_id": document_ids[0]}
        else:
            where_filter = {"document_id": {"$in": document_ids}}

        results = self.collection.query(
            query_embeddings=[query_vector],
            n_results=top_k,
            where=where_filter,
            include=["documents", "metadatas", "distances"],
        )

        if not (results["ids"] and results["ids"][0]):
            return []

        # Results are nested per query embedding; only one was sent.
        documents = results["documents"][0]
        metadatas = results["metadatas"][0]
        distances = results["distances"][0]
        return [
            {
                "chunk_id": chunk_id,
                "text": documents[pos],
                "metadata": metadatas[pos],
                "distance": distances[pos],
                "relevance_score": 1 - distances[pos],
            }
            for pos, chunk_id in enumerate(results["ids"][0])
        ]

    def delete_document(self, document_id: str) -> int:
        """Delete all chunks of ``document_id`` and return how many were removed."""
        matching_ids = self.collection.get(where={"document_id": document_id}, include=[])["ids"]
        removed = len(matching_ids)
        if removed > 0:
            self.collection.delete(ids=matching_ids)
            logger.info("Deleted %d chunks for document_id=%s", removed, document_id)
        return removed

    def list_documents(self) -> list[dict]:
        """Return one summary per stored document, built from the chunk metadata.

        Name, file type and creation time are taken from the first chunk seen
        for a document; ``chunks_count`` counts all of its chunks.
        """
        summaries: dict[str, dict] = {}
        for meta in self.collection.get(include=["metadatas"])["metadatas"]:
            doc_id = meta["document_id"]
            entry = summaries.get(doc_id)
            if entry is None:
                entry = summaries[doc_id] = {
                    "document_id": doc_id,
                    "name": meta.get("document_name", ""),
                    "file_type": meta.get("file_type", ""),
                    "created_at": meta.get("created_at", ""),
                    "chunks_count": 0,
                    "metadata": {},
                }
            entry["chunks_count"] += 1
        return list(summaries.values())

    def get_document_chunks(self, document_id: str) -> list[dict]:
        """Return all chunks for a document, sorted by chunk_index."""
        results = self.collection.get(
            where={"document_id": document_id},
            include=["documents", "metadatas"],
        )
        chunk_ids = results["ids"]
        if not chunk_ids:
            return []
        items = [
            {
                "chunk_id": chunk_id,
                "text": results["documents"][pos],
                "metadata": results["metadatas"][pos],
            }
            for pos, chunk_id in enumerate(chunk_ids)
        ]
        return sorted(items, key=lambda item: item["metadata"].get("chunk_index", 0))

    def get_stats(self) -> dict:
        """Return the number of distinct documents and the total number of chunks."""
        metadatas = self.collection.get(include=["metadatas"])["metadatas"]
        unique_doc_ids = {meta.get("document_id", "") for meta in metadatas}
        return {
            "documents_count": len(unique_doc_ids),
            "chunks_count": self.collection.count(),
        }
|
||||
Reference in New Issue
Block a user