"""Text extraction from PDF/DOCX/TXT/MD (port of `services/documentExtractService.js`)."""

from __future__ import annotations

from io import BytesIO
from typing import Optional


class HttpError(Exception):
    """Error that carries an HTTP status code together with its message.

    Attributes are set in ``__init__``: ``status`` (the numeric HTTP status)
    and ``message`` (the same text passed to ``Exception``).
    """

    def __init__(self, status: int, message: str):
        super().__init__(message)
        self.status = status
        self.message = message


# Media type -> internal document kind.
SUPPORTED_MIME = {
    'application/pdf': 'pdf',
    'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'docx',
    'text/plain': 'text',
    'text/markdown': 'text',
}

# File-name suffix -> internal document kind (fallback when the media type
# is missing or not recognised).
SUPPORTED_EXT = {
    '.pdf': 'pdf',
    '.docx': 'docx',
    '.txt': 'text',
    '.md': 'text',
}


def resolve_document_kind(mimetype: str | None, original_name: str | None = '') -> Optional[str]:
    """Map an upload's media type / file name to ``'pdf'``, ``'docx'`` or ``'text'``.

    The media type is checked first; the file-name extension is only a
    fallback. Both comparisons are case-insensitive.

    Args:
        mimetype: Media type reported for the upload; may be ``None`` or
            carry parameters such as ``'text/plain; charset=utf-8'``.
        original_name: Original file name; may be ``None`` or empty.

    Returns:
        The document kind, or ``None`` when the format is not supported.
    """
    # Drop media-type parameters ("; charset=...") and surrounding whitespace
    # so that a full Content-Type header value still matches the table.
    media_type = (mimetype or '').split(';', 1)[0].strip().lower()
    if media_type in SUPPORTED_MIME:
        return SUPPORTED_MIME[media_type]

    name = (original_name or '').lower()
    for ext, kind in SUPPORTED_EXT.items():
        if name.endswith(ext):
            return kind
    return None


def _join_parts(parts: list[str]) -> str:
    """Join extracted fragments with newlines, normalise CRLF and trim."""
    return '\n'.join(parts).replace('\r\n', '\n').strip()


def _decode_text(buf: bytes) -> str:
    """Decode plain text as UTF-8, tolerating invalid bytes and a leading BOM."""
    # 'utf-8-sig' strips a leading byte-order mark so it does not end up as
    # U+FEFF in the extracted text; errors='replace' keeps the function from
    # raising on malformed input.
    return buf.decode('utf-8-sig', errors='replace')


def _extract_docx(buf: bytes) -> str:
    """Collect paragraph text, then table-cell text, from a DOCX buffer."""
    try:
        from docx import Document
    except ImportError as exc:
        raise HttpError(500, 'python-docx не установлен (см. requirements.txt).') from exc

    doc = Document(BytesIO(buf))
    # Body paragraphs first, table cells afterwards (same order as before).
    parts = [p.text for p in doc.paragraphs if p.text]
    parts.extend(
        cell.text
        for table in doc.tables
        for row in table.rows
        for cell in row.cells
        if cell.text
    )
    return _join_parts(parts)


def _extract_pdf(buf: bytes) -> str:
    """Collect the text of every page of a PDF buffer."""
    try:
        from pypdf import PdfReader
    except ImportError as exc:
        raise HttpError(500, 'pypdf не установлен (см. requirements.txt).') from exc

    reader = PdfReader(BytesIO(buf))
    parts = []
    for page in reader.pages:
        try:
            text = page.extract_text() or ''
        except Exception:
            # Deliberate best-effort: a page that fails to parse is skipped
            # so the remaining pages can still be returned.
            text = ''
        if text:
            parts.append(text)
    return _join_parts(parts)


def extract_text_from_buffer(kind: str, buf: bytes) -> str:
    """Extract plain text from an in-memory document.

    Args:
        kind: ``'text'``, ``'docx'`` or ``'pdf'`` (see ``resolve_document_kind``).
        buf: Raw file contents.

    Returns:
        The extracted text. For ``'docx'`` and ``'pdf'`` line endings are
        normalised to ``\\n`` and the result is stripped; for ``'text'`` the
        decoded content is returned as is (minus a leading BOM). Any other
        ``kind`` yields an empty string.

    Raises:
        HttpError: status 500 when the parser library needed for ``kind``
            is not installed.
    """
    if kind == 'text':
        return _decode_text(buf)
    if kind == 'docx':
        return _extract_docx(buf)
    if kind == 'pdf':
        return _extract_pdf(buf)
    return ''


def extract_text_from_file(mimetype: str | None, file_storage, original_name: str | None) -> str:
    """Extract text from an uploaded file object.

    The whole upload is read into memory with a single ``read()`` call; the
    original note states a 16 MB ceiling, which is not enforced here
    (presumably the caller or framework limits the upload size — verify).

    Args:
        mimetype: Media type reported for the upload.
        file_storage: Object exposing ``read()`` that returns bytes
            (a werkzeug ``FileStorage`` according to the original note).
        original_name: Original file name, used as a fallback for format
            detection.

    Returns:
        The extracted text.

    Raises:
        HttpError: status 400 when the format is not supported.
    """
    kind = resolve_document_kind(mimetype, original_name)
    if not kind:
        raise HttpError(400, 'Неподдерживаемый формат. Допустимы: PDF, DOCX, TXT, MD.')
    buf = file_storage.read()
    return extract_text_from_buffer(kind, buf)