Added environment variables for Unstructured to control OCR quality and OCR languages
This commit is contained in:
@@ -65,6 +65,8 @@ services:
|
||||
- NEXTCLOUD_PASSWORD=admin
|
||||
- ENABLE_UNSTRUCTURED_PARSING=true
|
||||
- UNSTRUCTURED_API_URL=http://unstructured:8000
|
||||
- UNSTRUCTURED_STRATEGY=hi_res
|
||||
- UNSTRUCTURED_LANGUAGES=deu,eng
|
||||
#volumes:
|
||||
#- ./nextcloud_mcp_server:/app/nextcloud_mcp_server:ro
|
||||
|
||||
|
||||
+17
@@ -8,3 +8,20 @@ ENABLE_UNSTRUCTURED_PARSING=true
|
||||
|
||||
# Unstructured API endpoint (default for docker-compose setup)
|
||||
UNSTRUCTURED_API_URL=http://unstructured:8000
|
||||
|
||||
# Parsing strategy for the Unstructured service
|
||||
# Valid values: auto, fast, hi_res
|
||||
# - auto: Automatically choose the best strategy based on document type
|
||||
# - fast: Fast parsing without OCR - best for simple text documents
|
||||
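# - hi_res: High-resolution parsing with OCR - best for scanned documents, images, and complex layouts (used in this sample; if the variable is unset the server uses auto, and an invalid value falls back to hi_res)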
|
||||
UNSTRUCTURED_STRATEGY=hi_res
|
||||
|
||||
# Languages for OCR and document parsing (comma-separated ISO 639-3 language codes)
|
||||
# Default: eng,deu (English and German)
|
||||
# Common language codes:
|
||||
# eng = English deu = German fra = French
|
||||
# spa = Spanish ita = Italian por = Portuguese
|
||||
# rus = Russian ara = Arabic zho = Chinese
|
||||
# jpn = Japanese kor = Korean
|
||||
# Example for English, German, and French: UNSTRUCTURED_LANGUAGES=eng,deu,fra
|
||||
UNSTRUCTURED_LANGUAGES=eng,deu
|
||||
|
||||
@@ -6,7 +6,11 @@ from typing import Optional, Tuple
|
||||
|
||||
import httpx
|
||||
|
||||
from nextcloud_mcp_server.config import get_unstructured_api_url
|
||||
from nextcloud_mcp_server.config import (
|
||||
get_unstructured_api_url,
|
||||
get_unstructured_languages,
|
||||
get_unstructured_strategy,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -43,7 +47,7 @@ class UnstructuredClient:
|
||||
content: bytes,
|
||||
filename: str,
|
||||
content_type: Optional[str] = None,
|
||||
strategy: str = "auto",
|
||||
strategy: Optional[str] = None,
|
||||
languages: Optional[list[str]] = None,
|
||||
extract_image_block_types: Optional[list[str]] = None,
|
||||
) -> Tuple[str, dict]:
|
||||
@@ -53,8 +57,10 @@ class UnstructuredClient:
|
||||
content: The document content as bytes
|
||||
filename: The filename (used for format detection)
|
||||
content_type: Optional MIME type
|
||||
strategy: Parsing strategy - "auto", "fast", or "hi_res" (OCR-based)
|
||||
languages: List of language codes for OCR (e.g., ["eng", "deu"])
|
||||
strategy: Parsing strategy - "auto", "fast", or "hi_res" (OCR-based).
|
||||
If None, uses the value from UNSTRUCTURED_STRATEGY env var.
|
||||
languages: List of language codes for OCR (e.g., ["eng", "deu"]).
|
||||
If None, uses the value from UNSTRUCTURED_LANGUAGES env var.
|
||||
extract_image_block_types: Types of elements to extract from images
|
||||
|
||||
Returns:
|
||||
@@ -66,8 +72,12 @@ class UnstructuredClient:
|
||||
httpx.HTTPError: If the API request fails
|
||||
Exception: If parsing fails
|
||||
"""
|
||||
# Use environment configuration as defaults
|
||||
if strategy is None:
|
||||
strategy = get_unstructured_strategy()
|
||||
|
||||
if languages is None:
|
||||
languages = ["eng"] # Default to English
|
||||
languages = get_unstructured_languages()
|
||||
|
||||
# Prepare the multipart form data
|
||||
files = {
|
||||
|
||||
@@ -60,3 +60,63 @@ def is_unstructured_parsing_enabled() -> bool:
|
||||
True if enabled, False otherwise.
|
||||
"""
|
||||
return os.getenv("ENABLE_UNSTRUCTURED_PARSING", "true").lower() == "true"
|
||||
|
||||
|
||||
def get_unstructured_strategy() -> str:
    """Get the parsing strategy for the Unstructured API.

    The value is read from the ``UNSTRUCTURED_STRATEGY`` environment variable.
    It is matched case-insensitively and surrounding whitespace is ignored.

    Valid values are:
    - 'auto': Automatically choose the best strategy (used when the
      variable is not set)
    - 'fast': Fast parsing without OCR
    - 'hi_res': High-resolution parsing with OCR for better accuracy

    Returns:
        The parsing strategy to use. A value outside the list above is
        reported with a warning and replaced by 'hi_res'.
    """
    valid_strategies = ["auto", "fast", "hi_res"]

    # Strip before validating: values coming from .env files or compose
    # definitions often carry stray spaces or a trailing newline, and those
    # would otherwise be rejected and silently replaced by the fallback.
    strategy = os.getenv("UNSTRUCTURED_STRATEGY", "auto").strip().lower()

    if strategy not in valid_strategies:
        logging.warning(
            f"Invalid UNSTRUCTURED_STRATEGY '{strategy}'. Using 'hi_res'. "
            f"Valid options: {', '.join(valid_strategies)}"
        )
        return "hi_res"

    return strategy
|
||||
|
||||
|
||||
def get_unstructured_languages() -> list[str]:
    """Get the OCR languages for the Unstructured API.

    Languages should be specified as ISO 639-3 codes (e.g., 'eng', 'deu', 'fra')
    in the ``UNSTRUCTURED_LANGUAGES`` environment variable. Multiple languages
    can be specified separated by commas; whitespace around each code and
    empty entries are ignored, and repeated codes are collapsed to their
    first occurrence.

    Default languages: English (eng) and German (deu)

    Common language codes:
    - eng: English
    - deu: German
    - fra: French
    - spa: Spanish
    - ita: Italian
    - por: Portuguese
    - rus: Russian
    - ara: Arabic
    - zho: Chinese
    - jpn: Japanese
    - kor: Korean

    Returns:
        List of language codes for OCR processing, in the order given.
        Falls back to ``["eng", "deu"]`` (with a warning) when the variable
        contains no usable code.
    """
    languages_str = os.getenv("UNSTRUCTURED_LANGUAGES", "eng,deu")

    # Split by comma and clean up whitespace
    cleaned = (lang.strip() for lang in languages_str.split(","))

    # dict.fromkeys keeps insertion order, so this drops duplicates such as
    # "eng,deu,eng" without reordering the caller's preference.
    languages = list(dict.fromkeys(lang for lang in cleaned if lang))

    if not languages:
        logging.warning("No languages specified in UNSTRUCTURED_LANGUAGES. Using default: eng,deu")
        return ["eng", "deu"]

    return languages
|
||||
|
||||
@@ -86,11 +86,12 @@ async def parse_document(
|
||||
try:
|
||||
from nextcloud_mcp_server.client.unstructured_client import UnstructuredClient
|
||||
client = UnstructuredClient()
|
||||
# The client will automatically use environment configuration
|
||||
# (UNSTRUCTURED_STRATEGY and UNSTRUCTURED_LANGUAGES)
|
||||
return await client.partition_document(
|
||||
content=content,
|
||||
filename=filename or f"document.{doc_type}",
|
||||
content_type=content_type,
|
||||
strategy="auto"
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error(f"Unstructured API parsing failed: {e}")
|
||||
|
||||
Reference in New Issue
Block a user