You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

91 lines
3.0 KiB

"""Извлечение текста из PDF/DOCX/TXT/MD (порт `services/documentExtractService.js`)."""
from __future__ import annotations
from io import BytesIO
from typing import Optional
class HttpError(Exception):
    """Exception that carries an HTTP status code together with its message.

    Attributes are set in ``__init__``: ``status`` holds the integer status
    code and ``message`` holds the same text that is passed to ``Exception``.
    """

    def __init__(self, status: int, message: str):
        # Hand the message to Exception so str(err) yields it.
        super().__init__(message)
        self.message = message
        self.status = status
# Bare MIME type (no parameters, lower-case) -> internal document kind.
SUPPORTED_MIME = {
    "application/pdf": "pdf",
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document": "docx",
    "text/plain": "text",
    "text/markdown": "text",
}

# Lower-case file-name suffix -> internal document kind.
SUPPORTED_EXT = {
    ".pdf": "pdf",
    ".docx": "docx",
    ".txt": "text",
    ".md": "text",
}
def resolve_document_kind(mimetype: str | None, original_name: str | None = '') -> Optional[str]:
    """Determine the document kind from a MIME type, falling back to the file name.

    Args:
        mimetype: Content type as received; may carry parameters
            (e.g. ``text/plain; charset=utf-8``) or be ``None``.
        original_name: Uploaded file name, used only when the MIME type is
            not in ``SUPPORTED_MIME``.

    Returns:
        The kind mapped in ``SUPPORTED_MIME`` / ``SUPPORTED_EXT``, or ``None``
        when neither the MIME type nor the file-name suffix is recognised.
    """
    # Browsers often send "text/plain; charset=utf-8"; without cutting off the
    # parameters the value would not match any table key.
    bare_type = (mimetype or '').strip().partition(';')[0].strip().lower()
    lowered_name = (original_name or '').lower()

    try:
        return SUPPORTED_MIME[bare_type]
    except KeyError:
        pass

    # MIME type unknown: fall back to the first matching file-name suffix.
    return next(
        (kind for suffix, kind in SUPPORTED_EXT.items() if lowered_name.endswith(suffix)),
        None,
    )
def extract_text_from_buffer(kind: str, buf: bytes) -> str:
    """Extract plain text from an in-memory document.

    Args:
        kind: Document kind: ``'text'``, ``'docx'`` or ``'pdf'``.
        buf: Raw file contents.

    Returns:
        The extracted text. For ``'docx'`` and ``'pdf'`` the pieces are joined
        with newlines, CRLF is normalised to LF and surrounding whitespace is
        stripped. For ``'text'`` the decoded content is returned as is, minus
        a leading UTF-8 byte-order mark. Any other ``kind`` yields ``''``.

    Raises:
        HttpError: Status 500 when the library needed for ``kind``
            (python-docx or pypdf) cannot be imported.
    """
    if kind == 'text':
        # 'utf-8-sig' strips a leading UTF-8 BOM so it does not leak into the
        # extracted text as U+FEFF; otherwise it decodes exactly like 'utf-8'.
        # errors='replace' turns invalid bytes into U+FFFD instead of raising,
        # and gives the same result as a strict decode for valid input.
        return buf.decode('utf-8-sig', errors='replace')

    if kind == 'docx':
        # Imported lazily: a missing optional dependency is reported as an
        # HttpError at call time rather than breaking module import.
        try:
            from docx import Document
        except ImportError as exc:
            raise HttpError(500, 'python-docx не установлен (см. requirements.txt).') from exc
        doc = Document(BytesIO(buf))
        # Paragraph text is collected first, then table cell text.
        parts = [para.text for para in doc.paragraphs if para.text]
        parts.extend(
            cell.text
            for table in doc.tables
            for row in table.rows
            for cell in row.cells
            if cell.text
        )
        return '\n'.join(parts).replace('\r\n', '\n').strip()

    if kind == 'pdf':
        try:
            from pypdf import PdfReader
        except ImportError as exc:
            raise HttpError(500, 'pypdf не установлен (см. requirements.txt).') from exc
        reader = PdfReader(BytesIO(buf))
        parts = []
        for page in reader.pages:
            # Deliberately best-effort: a page that fails to extract is
            # skipped so the remaining pages still contribute text.
            try:
                page_text = page.extract_text() or ''
            except Exception:
                page_text = ''
            if page_text:
                parts.append(page_text)
        return '\n'.join(parts).replace('\r\n', '\n').strip()

    # Unknown kind: nothing to extract.
    return ''
def extract_text_from_file(mimetype: str | None, file_storage, original_name: str | None) -> str:
    """Extract text from an uploaded file object.

    ``file_storage`` is a werkzeug ``FileStorage``; its content is read fully
    into memory. The original note cites a 16 MB ceiling, which is not
    enforced here.

    Args:
        mimetype: Content type reported for the upload, if any.
        file_storage: Object whose ``read()`` returns the file bytes.
        original_name: Uploaded file name, used as a fallback for detection.

    Returns:
        The text produced by ``extract_text_from_buffer`` for the detected kind.

    Raises:
        HttpError: Status 400 when neither the MIME type nor the file name
            maps to a supported kind.
    """
    detected = resolve_document_kind(mimetype, original_name)
    if detected:
        # Read only once the format is known to be supported.
        payload = file_storage.read()
        return extract_text_from_buffer(detected, payload)
    raise HttpError(400, 'Неподдерживаемый формат. Допустимы: PDF, DOCX, TXT, MD.')