Files
2025-11-05 15:19:55 +01:00

166 lines
4.9 KiB
Python

"""Document processor using Tesseract OCR (local)."""
import logging
import shutil
from collections.abc import Awaitable, Callable
from typing import Any, Optional
from .base import DocumentProcessor, ProcessingResult, ProcessorError
logger = logging.getLogger(__name__)
try:
import io
import pytesseract # type: ignore
from PIL import Image
TESSERACT_AVAILABLE = True
except ImportError:
TESSERACT_AVAILABLE = False
class TesseractProcessor(DocumentProcessor):
"""Document processor using Tesseract OCR (local).
This processor runs OCR locally using the Tesseract engine, which is
faster and more lightweight than cloud-based solutions but requires
Tesseract to be installed on the system.
Requirements:
- tesseract binary installed (e.g., apt install tesseract-ocr)
- Python packages: pip install pytesseract pillow
Example:
processor = TesseractProcessor(default_lang="eng+deu")
result = await processor.process(image_bytes, "image/jpeg")
"""
SUPPORTED_TYPES = {
"image/jpeg",
"image/png",
"image/tiff",
"image/bmp",
"image/gif",
}
def __init__(
self,
tesseract_cmd: Optional[str] = None,
default_lang: str = "eng",
):
"""Initialize Tesseract processor.
Args:
tesseract_cmd: Path to tesseract executable (None = auto-detect)
default_lang: Default OCR language (e.g., "eng", "deu", "eng+deu")
Raises:
ProcessorError: If Tesseract or required packages not available
"""
if not TESSERACT_AVAILABLE:
raise ProcessorError(
"Tesseract processor requires: pip install pytesseract pillow"
)
if tesseract_cmd:
pytesseract.pytesseract.tesseract_cmd = tesseract_cmd
elif not shutil.which("tesseract"):
raise ProcessorError(
"Tesseract not found in PATH. Install with: apt install tesseract-ocr"
)
self.default_lang = default_lang
logger.info(f"Initialized TesseractProcessor: lang={default_lang}")
@property
def name(self) -> str:
return "tesseract"
@property
def supported_mime_types(self) -> set[str]:
return self.SUPPORTED_TYPES
async def process(
self,
content: bytes,
content_type: str,
filename: Optional[str] = None,
options: Optional[dict[str, Any]] = None,
progress_callback: Optional[
Callable[[float, Optional[float], Optional[str]], Awaitable[None]]
] = None,
) -> ProcessingResult:
"""Process image via Tesseract OCR.
Args:
content: Image bytes
content_type: Image MIME type
filename: Optional filename
options: Processing options:
- lang: OCR language(s) (default: from init)
- config: Tesseract config string
Returns:
ProcessingResult with extracted text and metadata
Raises:
ProcessorError: If OCR fails
"""
options = options or {}
lang = options.get("lang", self.default_lang)
config = options.get("config", "")
try:
# Load image
image = Image.open(io.BytesIO(content))
# Run OCR
text = pytesseract.image_to_string(image, lang=lang, config=config)
# Get additional data for confidence scores
data = pytesseract.image_to_data(
image, lang=lang, output_type=pytesseract.Output.DICT
)
# Calculate average confidence
confidences = [c for c in data["conf"] if c != -1]
avg_confidence = sum(confidences) / len(confidences) if confidences else 0
metadata = {
"text_length": len(text),
"language": lang,
"image_size": image.size,
"image_mode": image.mode,
"confidence": round(avg_confidence, 2),
"words_detected": len([c for c in data["conf"] if c != -1]),
}
logger.debug(
f"Tesseract OCR completed: {len(text)} chars, "
f"confidence={avg_confidence:.1f}%"
)
return ProcessingResult(
text=text.strip(),
metadata=metadata,
processor=self.name,
success=True,
)
except Exception as e:
logger.error(f"Tesseract processing failed: {e}")
raise ProcessorError(f"OCR failed: {str(e)}") from e
async def health_check(self) -> bool:
"""Check if Tesseract is available.
Returns:
True if Tesseract is installed and working
"""
try:
pytesseract.get_tesseract_version()
return True
except Exception:
return False