You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
89 lines
2.8 KiB
89 lines
2.8 KiB
"""Извлечение текста из PDF/DOCX/TXT/MD (порт `services/documentExtractService.js`).""" |
|
from __future__ import annotations |
|
|
|
from io import BytesIO |
|
from typing import Optional |
|
|
|
|
|
class HttpError(Exception):
    """Exception that carries an HTTP status code next to its message.

    Attributes:
        status: numeric status code supplied by the raiser.
        message: human-readable description; also passed to ``Exception``
            so that ``str(err)`` yields the same text.
    """

    def __init__(self, status: int, message: str) -> None:
        # Store both fields on the instance, then let Exception record the
        # message as its only positional argument.
        self.status, self.message = status, message
        super().__init__(message)
|
|
|
|
|
# MIME type -> internal document kind ("pdf", "docx" or "text").
SUPPORTED_MIME: dict[str, str] = {
    "application/pdf": "pdf",
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document": "docx",
    "text/plain": "text",
    "text/markdown": "text",
}

# Lower-case filename suffix -> internal document kind. Consulted when the
# MIME type alone does not identify a supported format.
SUPPORTED_EXT: dict[str, str] = {
    ".pdf": "pdf",
    ".docx": "docx",
    ".txt": "text",
    ".md": "text",
}
|
|
|
|
|
def resolve_document_kind(mimetype: str | None, original_name: str | None = '') -> str | None:
    """Map an upload's MIME type / filename to an internal document kind.

    The MIME type is checked first against ``SUPPORTED_MIME``; if it is not
    listed there, the filename suffix is checked against ``SUPPORTED_EXT``.
    Both comparisons are case-insensitive.

    Args:
        mimetype: declared content type; may be ``None`` or empty. Parameters
            after ``;`` (for example ``; charset=utf-8``) are ignored.
        original_name: client-side filename; may be ``None`` or empty.

    Returns:
        The kind string stored in the lookup tables, or ``None`` when neither
        the MIME type nor the filename suffix is supported.
    """
    # A full Content-Type value may carry parameters ("text/plain;
    # charset=utf-8"); only the bare type/subtype is a key in the table.
    mime = (mimetype or '').split(';', 1)[0].strip().lower()
    kind = SUPPORTED_MIME.get(mime)
    if kind is not None:
        return kind

    # Fall back to the filename suffix (e.g. for generic or missing MIME types).
    name = (original_name or '').lower()
    return next(
        (ext_kind for ext, ext_kind in SUPPORTED_EXT.items() if name.endswith(ext)),
        None,
    )
|
|
|
|
|
def _join_extracted_parts(parts: list[str]) -> str:
    """Join text fragments with newlines, normalise CRLF to LF and trim the ends."""
    return '\n'.join(parts).replace('\r\n', '\n').strip()


def _extract_docx_text(buf: bytes) -> str:
    """Return paragraph text followed by table-cell text from a DOCX payload."""
    try:
        # Imported lazily so the module still loads when python-docx is absent.
        from docx import Document
    except ImportError as exc:
        raise HttpError(500, 'python-docx не установлен (см. requirements.txt).') from exc

    doc = Document(BytesIO(buf))
    # Body paragraphs first, then every non-empty cell of every table.
    parts = [p.text for p in doc.paragraphs if p.text]
    parts.extend(
        cell.text
        for table in doc.tables
        for row in table.rows
        for cell in row.cells
        if cell.text
    )
    return _join_extracted_parts(parts)


def _extract_pdf_text(buf: bytes) -> str:
    """Return the text of every page of a PDF payload, one fragment per page."""
    try:
        # Imported lazily so the module still loads when pypdf is absent.
        from pypdf import PdfReader
    except ImportError as exc:
        raise HttpError(500, 'pypdf не установлен (см. requirements.txt).') from exc

    reader = PdfReader(BytesIO(buf))
    parts = []
    for page in reader.pages:
        try:
            page_text = page.extract_text() or ''
        except Exception:
            # Deliberate best-effort: a page that cannot be extracted is
            # skipped instead of failing the whole document.
            page_text = ''
        if page_text:
            parts.append(page_text)
    return _join_extracted_parts(parts)


def extract_text_from_buffer(kind: str, buf: bytes) -> str:
    """Extract plain text from an in-memory document.

    Args:
        kind: document kind: ``'text'``, ``'docx'`` or ``'pdf'``.
        buf: raw file contents.

    Returns:
        The extracted text. For ``'text'`` the bytes are decoded as UTF-8
        (a leading byte-order mark is dropped, undecodable bytes become
        U+FFFD) and returned without further normalisation. For ``'docx'``
        and ``'pdf'`` the fragments are joined with newlines, CRLF is
        converted to LF and surrounding whitespace is stripped. Any other
        ``kind`` yields an empty string.

    Raises:
        HttpError: with status 500 when the library needed for ``'docx'`` or
            ``'pdf'`` is not installed.
    """
    if kind == 'text':
        # One lenient decode replaces the former strict-then-lenient pair;
        # "utf-8-sig" also removes a BOM so it does not leak into the text.
        return buf.decode('utf-8-sig', errors='replace')
    if kind == 'docx':
        return _extract_docx_text(buf)
    if kind == 'pdf':
        return _extract_pdf_text(buf)
    return ''
|
|
|
|
|
def extract_text_from_file(mimetype: str | None, file_storage, original_name: str | None) -> str:
    """Detect the document kind of an upload and return its extracted text.

    Args:
        mimetype: declared MIME type of the upload, if any.
        file_storage: object exposing ``read()``; the original note describes
            it as a werkzeug ``FileStorage``. Its whole content is read into
            memory. The original note mentions a 16 MB ceiling, which is not
            enforced here.
        original_name: client-side filename, used when the MIME type is not
            recognised.

    Returns:
        Text produced by ``extract_text_from_buffer`` for the detected kind.

    Raises:
        HttpError: with status 400 when the format is not supported.
    """
    doc_kind = resolve_document_kind(mimetype, original_name)
    if not doc_kind:
        # Reject before touching the stream so unsupported uploads are never read.
        raise HttpError(400, 'Неподдерживаемый формат. Допустимы: PDF, DOCX, TXT, MD.')

    payload = file_storage.read()
    return extract_text_from_buffer(doc_kind, payload)
|
|
|