refactor: Transform document parsing into pluggable processor architecture
Refactors PR #190's hardcoded Unstructured.io integration into a flexible, extensible plugin system supporting multiple text extraction engines.

- **`DocumentProcessor` ABC**: Abstract interface for all processors
- **`ProcessorRegistry`**: Central registry for discovery and routing
- **`ProcessingResult`**: Standardized output format across processors
- **`UnstructuredProcessor`**: Refactored from `UnstructuredClient`
- **`TesseractProcessor`**: Local OCR for images (lightweight alternative)
- **`CustomHTTPProcessor`**: Generic wrapper for custom HTTP APIs
- New `get_document_processor_config()` returns structured config
- Supports enabling/disabling individual processors
- Per-processor configuration via environment variables
- **Breaking Change**: `ENABLE_UNSTRUCTURED_PARSING` replaced with:
  - `ENABLE_DOCUMENT_PROCESSING=true/false` (master switch)
  - `ENABLE_UNSTRUCTURED=true/false` (per-processor)
  - `ENABLE_TESSERACT=true/false`
  - `ENABLE_CUSTOM_PROCESSOR=true/false`
- `parse_document()` now uses `ProcessorRegistry`
- Auto-selects appropriate processor based on MIME type
- Processor priority system (Unstructured=10, Tesseract=5, Custom=1)
- `initialize_document_processors()` registers processors at startup
- Integrated into both BasicAuth and OAuth lifespans
- Graceful degradation if processors fail to initialize

```env
ENABLE_DOCUMENT_PROCESSING=false
ENABLE_UNSTRUCTURED=false
UNSTRUCTURED_API_URL=http://unstructured:8000
UNSTRUCTURED_STRATEGY=auto # auto|fast|hi_res
UNSTRUCTURED_LANGUAGES=eng,deu
ENABLE_TESSERACT=false
TESSERACT_LANG=eng
ENABLE_CUSTOM_PROCESSOR=false
CUSTOM_PROCESSOR_URL=http://localhost:9000/process
CUSTOM_PROCESSOR_TYPES=application/pdf,image/jpeg
```

- **Removed**: `tests/test_unstructured_config.py` (legacy tests)
- **Added**: `tests/unit/test_document_processor_config.py`
  - 7 unit tests for new config system
  - Tests individual and multi-processor configurations
- **Added**:
  - `nextcloud_mcp_server/document_processors/__init__.py`
  - `nextcloud_mcp_server/document_processors/base.py`
  - `nextcloud_mcp_server/document_processors/registry.py`
  - `nextcloud_mcp_server/document_processors/unstructured.py`
  - `nextcloud_mcp_server/document_processors/tesseract.py`
  - `nextcloud_mcp_server/document_processors/custom_http.py`
  - `tests/unit/test_document_processor_config.py`
- **Modified**:
  - `nextcloud_mcp_server/config.py` - New plugin config system
  - `nextcloud_mcp_server/app.py` - Processor initialization
  - `nextcloud_mcp_server/utils/document_parser.py` - Uses registry
  - `nextcloud_mcp_server/server/webdav.py` - Import updates
  - `env.sample` - New configuration format
  - `docker-compose.yml` - (profile changes from previous work)
- **Removed**:
  - `nextcloud_mcp_server/client/unstructured_client.py` - Replaced by UnstructuredProcessor
  - `tests/test_unstructured_config.py` - Replaced with new tests

✅ **Extensible**: Add processors without modifying core code
✅ **Testable**: Mock processors for unit tests
✅ **Configurable**: Enable only needed processors
✅ **Flexible**: Choose fast (Tesseract) vs accurate (Unstructured)
✅ **Opt-in**: Disabled by default, no mandatory dependencies

Users upgrading from PR #190 need to update environment variables:

```bash
# Before (PR #190)
ENABLE_UNSTRUCTURED_PARSING=true

# After (this change)
ENABLE_DOCUMENT_PROCESSING=true
ENABLE_UNSTRUCTURED=true
```

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,164 @@
|
||||
"""Central registry for document processors."""
|
||||
|
||||
import logging
|
||||
from typing import Any, Optional
|
||||
|
||||
from .base import DocumentProcessor, ProcessingResult, ProcessorError
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class ProcessorRegistry:
    """Central registry for document processors.

    Stores each registered processor together with its priority and routes
    processing requests either to a processor requested by name or to the
    first processor, in priority order, that reports support for the
    document's MIME type.

    Example:
        registry = ProcessorRegistry()
        registry.register(UnstructuredProcessor(...), priority=10)
        registry.register(TesseractProcessor(...), priority=5)

        # Auto-select processor based on MIME type
        result = await registry.process(pdf_bytes, "application/pdf")

        # Force specific processor
        result = await registry.process(img_bytes, "image/png", processor_name="tesseract")
    """

    def __init__(self) -> None:
        """Create an empty registry with no processors registered."""
        # Maps processor name -> (processor instance, priority).
        self._processors: dict[str, tuple[DocumentProcessor, int]] = {}
        # Processor names sorted highest priority first; processors with equal
        # priority stay in the order they were registered.
        self._priority_order: list[str] = []

    def register(self, processor: DocumentProcessor, priority: int = 0) -> None:
        """Register a document processor.

        Registering a processor whose ``name`` is already present replaces
        the previous entry (a warning is logged) and re-positions the name
        according to the new priority.

        Args:
            processor: Processor instance to register
            priority: Higher priority processors are tried first (default: 0).
                Among equal priorities, earlier registrations come first.
        """
        name = processor.name

        if name in self._processors:
            logger.warning("Processor '%s' already registered, replacing", name)

        self._processors[name] = (processor, priority)

        # Drop any stale position before computing the new one so the
        # processor is never compared against itself.
        if name in self._priority_order:
            self._priority_order.remove(name)

        # Insert before the first entry with a strictly lower priority; the
        # strict comparison is what keeps equal priorities in arrival order.
        position = next(
            (
                index
                for index, existing_name in enumerate(self._priority_order)
                if priority > self._processors[existing_name][1]
            ),
            len(self._priority_order),
        )
        self._priority_order.insert(position, name)

        logger.info(
            "Registered processor: %s (priority=%s, supports=%s types)",
            name,
            priority,
            len(processor.supported_mime_types),
        )

    def get_processor(self, name: str) -> Optional[DocumentProcessor]:
        """Get a processor by name.

        Args:
            name: Processor name

        Returns:
            DocumentProcessor instance or None if not found
        """
        entry = self._processors.get(name)
        return None if entry is None else entry[0]

    def find_processor(self, content_type: str) -> Optional[DocumentProcessor]:
        """Find the first processor that supports the given MIME type.

        Processors are checked in priority order (highest priority first) by
        calling each processor's ``supports(content_type)``.

        Args:
            content_type: MIME type to match

        Returns:
            First matching processor or None
        """
        for name in self._priority_order:
            processor = self._processors[name][0]
            if processor.supports(content_type):
                logger.debug("Found processor '%s' for type '%s'", name, content_type)
                return processor

        logger.debug("No processor found for type '%s'", content_type)
        return None

    def list_processors(self) -> list[str]:
        """List all registered processor names in priority order.

        Returns:
            A new list of processor names (highest priority first); mutating
            it does not affect the registry.
        """
        return list(self._priority_order)

    async def process(
        self,
        content: bytes,
        content_type: str,
        filename: Optional[str] = None,
        processor_name: Optional[str] = None,
        options: Optional[dict[str, Any]] = None,
    ) -> ProcessingResult:
        """Process a document using a registered processor.

        When ``processor_name`` is given (and non-empty) that processor is
        used as-is; it is not checked against ``content_type``. Otherwise the
        highest-priority processor that supports ``content_type`` is used.
        Only the selected processor is invoked; there is no fallback to
        lower-priority processors.

        Args:
            content: Document bytes
            content_type: MIME type
            filename: Optional filename for format detection
            processor_name: Force specific processor (or None for auto-select)
            options: Processing options passed to processor

        Returns:
            ProcessingResult with extracted text and metadata

        Raises:
            ProcessorError: If the named processor is not registered or no
                registered processor supports ``content_type``. Exceptions
                raised by the selected processor propagate unchanged.
        """
        # Compare against None explicitly: a processor object is "found" even
        # if its class happens to define __len__/__bool__ and evaluates falsy.
        if processor_name:
            processor = self.get_processor(processor_name)
            if processor is None:
                raise ProcessorError(
                    f"Processor '{processor_name}' not found. "
                    f"Available: {', '.join(self.list_processors())}"
                )
        else:
            processor = self.find_processor(content_type)
            if processor is None:
                raise ProcessorError(
                    f"No processor found for type: {content_type}. "
                    f"Registered processors: {', '.join(self.list_processors())}"
                )

        logger.info("Processing with '%s' processor", processor.name)

        return await processor.process(content, content_type, filename, options)
|
||||
|
||||
|
||||
# Process-wide registry instance, created once at import time and handed out
# by get_registry().
_registry: ProcessorRegistry = ProcessorRegistry()
|
||||
|
||||
|
||||
def get_registry() -> ProcessorRegistry:
    """Return the module-wide processor registry.

    Returns:
        The ProcessorRegistry stored in this module's ``_registry`` global;
        every call hands back that same object.
    """
    return _registry
|
||||
Reference in New Issue
Block a user