# TestingWebApp/flask_app/app/services/document_extract.py

"""Извлечение текста из PDF/DOCX/TXT/MD (порт `services/documentExtractService.js`)."""
from __future__ import annotations

from io import BytesIO
from typing import Optional


class HttpError(Exception):
    """Exception that pairs an HTTP status code with a human-readable message.

    The message is also handed to ``Exception`` itself, so ``str(err)`` and
    ``err.args[0]`` both yield it.

    Attributes:
        status: HTTP status code given to the constructor.
        message: Error text given to the constructor.
    """

    def __init__(self, status: int, message: str):
        self.status, self.message = status, message
        super().__init__(message)


# Media type (lower-case, without parameters) -> internal document kind.
SUPPORTED_MIME = dict((
    ('application/pdf', 'pdf'),
    ('application/vnd.openxmlformats-officedocument.wordprocessingml.document', 'docx'),
    ('text/plain', 'text'),
    ('text/markdown', 'text'),
))

# Lower-case file-name suffix -> internal document kind; consulted when the
# media type is not one of the keys above.
SUPPORTED_EXT = dict((
    ('.pdf', 'pdf'),
    ('.docx', 'docx'),
    ('.txt', 'text'),
    ('.md', 'text'),
))


def resolve_document_kind(mimetype: str | None, original_name: str | None = '') -> Optional[str]:
    """Map an upload's media type and file name to a document kind.

    The media type is tried first; the file-name suffix is the fallback.

    Args:
        mimetype: Content type reported for the upload; may be ``None`` or
            carry parameters such as ``; charset=utf-8``.
        original_name: File name of the upload; may be ``None`` or empty.

    Returns:
        The matching value from ``SUPPORTED_MIME`` / ``SUPPORTED_EXT``, or
        ``None`` when neither the media type nor the suffix is recognised.
    """
    # Browsers often send "text/plain; charset=utf-8"; without cutting the
    # parameters off, the lookup key would never match.
    media_type = (mimetype or '').strip().split(';', 1)[0].strip().lower()
    by_mime = SUPPORTED_MIME.get(media_type)
    if by_mime is not None:
        return by_mime

    filename = (original_name or '').lower()
    return next(
        (kind for suffix, kind in SUPPORTED_EXT.items() if filename.endswith(suffix)),
        None,
    )


def _join_text_parts(parts: list) -> str:
    """Join extracted fragments with newlines, turn CRLF into LF and trim both ends."""
    return '\n'.join(parts).replace('\r\n', '\n').strip()


def _extract_docx_text(buf: bytes) -> str:
    """Return the text of a DOCX file: non-empty paragraphs first, then non-empty table cells."""
    try:
        from docx import Document
    except ImportError as exc:
        raise HttpError(500, 'python-docx не установлен (см. requirements.txt).') from exc

    try:
        doc = Document(BytesIO(buf))
        parts = [p.text for p in doc.paragraphs if p.text]
        for table in doc.tables:
            for row in table.rows:
                parts.extend(cell.text for cell in row.cells if cell.text)
    except Exception as exc:
        # The bytes come from an upload: a file that cannot be parsed is a
        # client error, so report it through HttpError instead of letting a
        # library-specific exception escape.
        raise HttpError(
            400, 'Не удалось прочитать DOCX: файл повреждён или имеет неверный формат.'
        ) from exc
    return _join_text_parts(parts)


def _extract_pdf_text(buf: bytes) -> str:
    """Return the text of a PDF file, page by page, skipping pages that yield no text."""
    try:
        from pypdf import PdfReader
    except ImportError as exc:
        raise HttpError(500, 'pypdf не установлен (см. requirements.txt).') from exc

    parts = []
    try:
        reader = PdfReader(BytesIO(buf))
        for page in reader.pages:
            try:
                page_text = page.extract_text() or ''
            except Exception:
                # Best effort: one unreadable page must not discard the
                # text already collected from the other pages.
                page_text = ''
            if page_text:
                parts.append(page_text)
    except Exception as exc:
        # Failure to open the document or to enumerate its pages means the
        # upload as a whole is unusable; surface it as a client error.
        raise HttpError(
            400, 'Не удалось прочитать PDF: файл повреждён или защищён паролем.'
        ) from exc
    return _join_text_parts(parts)


def extract_text_from_buffer(kind: str, buf: bytes) -> str:
    """Extract plain text from the raw bytes of a document.

    Args:
        kind: Document kind: ``'text'``, ``'docx'`` or ``'pdf'``.
        buf: Complete file contents.

    Returns:
        The extracted text. For ``'text'`` the bytes are decoded as UTF-8
        with a leading byte-order mark removed and undecodable bytes replaced
        by U+FFFD; the content is otherwise left untouched. For ``'docx'``
        and ``'pdf'`` the fragments are joined with newlines, CRLF is turned
        into LF and the result is stripped. Any other ``kind`` yields ``''``.

    Raises:
        HttpError: 500 when the library needed for ``kind`` is not installed;
            400 when the DOCX/PDF content cannot be parsed.
    """
    if kind == 'text':
        # 'utf-8-sig' drops a leading BOM so it does not end up in the text
        # as '\ufeff'; errors='replace' alone gives the same result as a
        # strict decode whenever the input is valid.
        return buf.decode('utf-8-sig', errors='replace')
    if kind == 'docx':
        return _extract_docx_text(buf)
    if kind == 'pdf':
        return _extract_pdf_text(buf)
    return ''


def extract_text_from_file(mimetype: str | None, file_storage, original_name: str | None) -> str:
    """Extract text from an uploaded file object.

    The document kind is resolved from the media type and file name before
    anything is read; the whole stream is then loaded into memory in one
    ``read()`` call. The original note mentions a 16 MB upper bound; it is
    not enforced here, so presumably it is applied upstream (verify).

    Args:
        mimetype: Content type reported for the upload.
        file_storage: Upload object exposing ``read()``; the original
            docstring describes it as a werkzeug ``FileStorage``.
        original_name: File name of the upload.

    Returns:
        The text produced by ``extract_text_from_buffer``.

    Raises:
        HttpError: 400 when the format is not recognised.
    """
    document_kind = resolve_document_kind(mimetype, original_name)
    if document_kind:
        return extract_text_from_buffer(document_kind, file_storage.read())
    raise HttpError(400, 'Неподдерживаемый формат. Допустимы: PDF, DOCX, TXT, MD.')