feat: Add text processing background worker for telling client about progress
This commit is contained in:
@@ -2,8 +2,11 @@
|
||||
|
||||
import io
|
||||
import logging
|
||||
import time
|
||||
from collections.abc import Awaitable, Callable
|
||||
from typing import Any, Optional
|
||||
|
||||
import anyio
|
||||
import httpx
|
||||
|
||||
from .base import DocumentProcessor, ProcessingResult, ProcessorError
|
||||
@@ -47,6 +50,7 @@ class UnstructuredProcessor(DocumentProcessor):
|
||||
timeout: int = 120,
|
||||
default_strategy: str = "auto",
|
||||
default_languages: Optional[list[str]] = None,
|
||||
progress_interval: int = 10,
|
||||
):
|
||||
"""Initialize Unstructured processor.
|
||||
|
||||
@@ -55,15 +59,18 @@ class UnstructuredProcessor(DocumentProcessor):
|
||||
timeout: Request timeout in seconds (default: 120)
|
||||
default_strategy: Default parsing strategy - "auto", "fast", or "hi_res"
|
||||
default_languages: Default OCR language codes (e.g., ["eng", "deu"])
|
||||
progress_interval: Seconds between progress updates (default: 10)
|
||||
"""
|
||||
self.api_url = api_url
|
||||
self.timeout = timeout
|
||||
self.default_strategy = default_strategy
|
||||
self.default_languages = default_languages or ["eng"]
|
||||
self.progress_interval = progress_interval
|
||||
|
||||
logger.info(
|
||||
f"Initialized UnstructuredProcessor: {api_url}, "
|
||||
f"strategy={default_strategy}, languages={self.default_languages}"
|
||||
f"strategy={default_strategy}, languages={self.default_languages}, "
|
||||
f"progress_interval={progress_interval}s"
|
||||
)
|
||||
|
||||
@property
|
||||
@@ -74,23 +81,65 @@ class UnstructuredProcessor(DocumentProcessor):
|
||||
def supported_mime_types(self) -> set[str]:
|
||||
return self.SUPPORTED_TYPES
|
||||
|
||||
async def process(
|
||||
async def _run_progress_poller(
|
||||
self,
|
||||
stop_event: anyio.Event,
|
||||
progress_callback: Callable[
|
||||
[float, Optional[float], Optional[str]], Awaitable[None]
|
||||
],
|
||||
start_time: float,
|
||||
):
|
||||
"""Run progress poller that reports status every N seconds.
|
||||
|
||||
Args:
|
||||
stop_event: Event to signal when processing is complete
|
||||
progress_callback: Async callback to report progress
|
||||
start_time: Time when processing started (from time.time())
|
||||
"""
|
||||
logger.debug("Starting progress poller")
|
||||
while not stop_event.is_set():
|
||||
try:
|
||||
# Wait for the event to be set, with a timeout equal to progress_interval
|
||||
with anyio.fail_after(self.progress_interval):
|
||||
await stop_event.wait()
|
||||
# If wait() finished, the event was set (processing complete)
|
||||
break
|
||||
except TimeoutError:
|
||||
# Timeout occurred - time to send a progress update
|
||||
if not stop_event.is_set(): # Double-check in case of race condition
|
||||
elapsed = int(time.time() - start_time)
|
||||
message = (
|
||||
f"Processing document with unstructured... ({elapsed}s elapsed)"
|
||||
)
|
||||
try:
|
||||
await progress_callback(
|
||||
progress=float(elapsed),
|
||||
total=None, # Unknown total duration
|
||||
message=message,
|
||||
)
|
||||
logger.debug(f"Progress update sent: {elapsed}s elapsed")
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to send progress update: {e}")
|
||||
logger.debug("Progress poller stopped")
|
||||
|
||||
async def _make_api_request(
|
||||
self,
|
||||
content: bytes,
|
||||
content_type: str,
|
||||
filename: Optional[str] = None,
|
||||
options: Optional[dict[str, Any]] = None,
|
||||
filename: Optional[str],
|
||||
strategy: str,
|
||||
languages: list[str],
|
||||
extract_image_block_types: Optional[list[str]],
|
||||
) -> ProcessingResult:
|
||||
"""Process document via Unstructured API.
|
||||
"""Make the actual API request to Unstructured.
|
||||
|
||||
Args:
|
||||
content: Document bytes
|
||||
content_type: MIME type
|
||||
filename: Optional filename for format detection
|
||||
options: Processing options:
|
||||
- strategy: "auto", "fast", or "hi_res" (default: from init)
|
||||
- languages: List of language codes (default: from init)
|
||||
- extract_image_block_types: Types of image elements to extract
|
||||
filename: Optional filename
|
||||
strategy: Processing strategy
|
||||
languages: OCR languages
|
||||
extract_image_block_types: Image element types to extract
|
||||
|
||||
Returns:
|
||||
ProcessingResult with extracted text and metadata
|
||||
@@ -98,13 +147,6 @@ class UnstructuredProcessor(DocumentProcessor):
|
||||
Raises:
|
||||
ProcessorError: If processing fails
|
||||
"""
|
||||
options = options or {}
|
||||
|
||||
# Extract options with defaults
|
||||
strategy = options.get("strategy", self.default_strategy)
|
||||
languages = options.get("languages", self.default_languages)
|
||||
extract_image_block_types = options.get("extract_image_block_types")
|
||||
|
||||
# Prepare multipart request
|
||||
files = {
|
||||
"files": (
|
||||
@@ -178,6 +220,81 @@ class UnstructuredProcessor(DocumentProcessor):
|
||||
logger.error(f"Unstructured API processing failed: {e}")
|
||||
raise ProcessorError(f"Processing failed: {str(e)}") from e
|
||||
|
||||
async def process(
|
||||
self,
|
||||
content: bytes,
|
||||
content_type: str,
|
||||
filename: Optional[str] = None,
|
||||
options: Optional[dict[str, Any]] = None,
|
||||
progress_callback: Optional[
|
||||
Callable[[float, Optional[float], Optional[str]], Awaitable[None]]
|
||||
] = None,
|
||||
) -> ProcessingResult:
|
||||
"""Process document via Unstructured API.
|
||||
|
||||
Args:
|
||||
content: Document bytes
|
||||
content_type: MIME type
|
||||
filename: Optional filename for format detection
|
||||
options: Processing options:
|
||||
- strategy: "auto", "fast", or "hi_res" (default: from init)
|
||||
- languages: List of language codes (default: from init)
|
||||
- extract_image_block_types: Types of image elements to extract
|
||||
progress_callback: Optional async callback for progress updates
|
||||
|
||||
Returns:
|
||||
ProcessingResult with extracted text and metadata
|
||||
|
||||
Raises:
|
||||
ProcessorError: If processing fails
|
||||
"""
|
||||
options = options or {}
|
||||
|
||||
# Extract options with defaults
|
||||
strategy = options.get("strategy", self.default_strategy)
|
||||
languages = options.get("languages", self.default_languages)
|
||||
extract_image_block_types = options.get("extract_image_block_types")
|
||||
|
||||
# If no progress callback, just make the request directly
|
||||
if progress_callback is None:
|
||||
return await self._make_api_request(
|
||||
content=content,
|
||||
content_type=content_type,
|
||||
filename=filename,
|
||||
strategy=strategy,
|
||||
languages=languages,
|
||||
extract_image_block_types=extract_image_block_types,
|
||||
)
|
||||
|
||||
# With progress callback: run API request + progress poller concurrently
|
||||
stop_event = anyio.Event()
|
||||
start_time = time.time()
|
||||
result = None
|
||||
|
||||
async def capture_result():
|
||||
nonlocal result
|
||||
try:
|
||||
result = await self._make_api_request(
|
||||
content=content,
|
||||
content_type=content_type,
|
||||
filename=filename,
|
||||
strategy=strategy,
|
||||
languages=languages,
|
||||
extract_image_block_types=extract_image_block_types,
|
||||
)
|
||||
finally:
|
||||
# Signal poller to stop after API request completes
|
||||
stop_event.set()
|
||||
|
||||
# Run both tasks concurrently using anyio task groups
|
||||
async with anyio.create_task_group() as tg:
|
||||
tg.start_soon(capture_result)
|
||||
tg.start_soon(
|
||||
self._run_progress_poller, stop_event, progress_callback, start_time
|
||||
)
|
||||
|
||||
return result
|
||||
|
||||
async def health_check(self) -> bool:
|
||||
"""Check if Unstructured API is available.
|
||||
|
||||
|
||||
Reference in New Issue
Block a user