diff --git a/docker-compose.yml b/docker-compose.yml
index b95179e..4439783 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -65,6 +65,8 @@ services:
       - NEXTCLOUD_PASSWORD=admin
       - ENABLE_UNSTRUCTURED_PARSING=true
       - UNSTRUCTURED_API_URL=http://unstructured:8000
+      - UNSTRUCTURED_STRATEGY=hi_res
+      - UNSTRUCTURED_LANGUAGES=deu,eng
     #volumes:
       #- ./nextcloud_mcp_server:/app/nextcloud_mcp_server:ro
 
diff --git a/env.sample b/env.sample
index e0402b6..e3af475 100644
--- a/env.sample
+++ b/env.sample
@@ -8,3 +8,20 @@ ENABLE_UNSTRUCTURED_PARSING=true
 
 # Unstructured API endpoint (default for docker-compose setup)
 UNSTRUCTURED_API_URL=http://unstructured:8000
+
+# Parsing strategy for the Unstructured service
+# Valid values: auto, fast, hi_res
+# - auto: Automatically choose the best strategy based on document type (default when unset or invalid)
+# - fast: Fast parsing without OCR - best for simple text documents
+# - hi_res: High-resolution parsing with OCR - best for scanned documents, images, and complex layouts
+UNSTRUCTURED_STRATEGY=hi_res
+
+# Languages for OCR and document parsing (comma-separated ISO 639-3 language codes)
+# Default: eng,deu (English and German)
+# Common language codes:
+#   eng = English    deu = German     fra = French
+#   spa = Spanish    ita = Italian    por = Portuguese
+#   rus = Russian    ara = Arabic     zho = Chinese
+#   jpn = Japanese   kor = Korean
+# Example for English, German, and French: UNSTRUCTURED_LANGUAGES=eng,deu,fra
+UNSTRUCTURED_LANGUAGES=eng,deu
diff --git a/nextcloud_mcp_server/client/unstructured_client.py b/nextcloud_mcp_server/client/unstructured_client.py
index eac13c5..dd6a289 100644
--- a/nextcloud_mcp_server/client/unstructured_client.py
+++ b/nextcloud_mcp_server/client/unstructured_client.py
@@ -6,7 +6,11 @@ from typing import Optional, Tuple
 
 import httpx
 
-from nextcloud_mcp_server.config import get_unstructured_api_url
+from nextcloud_mcp_server.config import (
+    get_unstructured_api_url,
+    get_unstructured_languages,
+    get_unstructured_strategy,
+)
 
 logger = logging.getLogger(__name__)
 
@@ -43,7 +47,7 @@ class UnstructuredClient:
         content: bytes,
         filename: str,
         content_type: Optional[str] = None,
-        strategy: str = "auto",
+        strategy: Optional[str] = None,
         languages: Optional[list[str]] = None,
         extract_image_block_types: Optional[list[str]] = None,
     ) -> Tuple[str, dict]:
@@ -53,8 +57,10 @@ class UnstructuredClient:
             content: The document content as bytes
             filename: The filename (used for format detection)
             content_type: Optional MIME type
-            strategy: Parsing strategy - "auto", "fast", or "hi_res" (OCR-based)
-            languages: List of language codes for OCR (e.g., ["eng", "deu"])
+            strategy: Parsing strategy - "auto", "fast", or "hi_res" (OCR-based).
+                If None, uses the value from UNSTRUCTURED_STRATEGY env var.
+            languages: List of language codes for OCR (e.g., ["eng", "deu"]).
+                If None, uses the value from UNSTRUCTURED_LANGUAGES env var.
             extract_image_block_types: Types of elements to extract from images
 
         Returns:
@@ -66,8 +72,12 @@ class UnstructuredClient:
             httpx.HTTPError: If the API request fails
             Exception: If parsing fails
         """
+        # Use environment configuration as defaults
+        if strategy is None:
+            strategy = get_unstructured_strategy()
+
         if languages is None:
-            languages = ["eng"]  # Default to English
+            languages = get_unstructured_languages()
 
         # Prepare the multipart form data
         files = {
diff --git a/nextcloud_mcp_server/config.py b/nextcloud_mcp_server/config.py
index 6996259..0108990 100644
--- a/nextcloud_mcp_server/config.py
+++ b/nextcloud_mcp_server/config.py
@@ -60,3 +60,79 @@ def is_unstructured_parsing_enabled() -> bool:
         True if enabled, False otherwise.
     """
     return os.getenv("ENABLE_UNSTRUCTURED_PARSING", "true").lower() == "true"
+
+
+def get_unstructured_strategy() -> str:
+    """Get the parsing strategy for the Unstructured API.
+
+    The value is read from UNSTRUCTURED_STRATEGY; case and surrounding
+    whitespace are ignored.
+
+    Valid values are:
+    - 'auto': Automatically choose the best strategy (default)
+    - 'fast': Fast parsing without OCR
+    - 'hi_res': High-resolution parsing with OCR for better accuracy
+
+    Returns:
+        The parsing strategy to use. An unset, empty, or unrecognized value
+        falls back to 'auto', so a typo never silently enables slow OCR.
+    """
+    default_strategy = "auto"
+    valid_strategies = ("auto", "fast", "hi_res")
+
+    raw_strategy = os.getenv("UNSTRUCTURED_STRATEGY", "")
+    strategy = raw_strategy.strip().lower()
+    if not strategy:
+        return default_strategy
+
+    if strategy not in valid_strategies:
+        logging.warning(
+            "Invalid UNSTRUCTURED_STRATEGY '%s'. Using '%s'. Valid options: %s",
+            raw_strategy,
+            default_strategy,
+            ", ".join(valid_strategies),
+        )
+        return default_strategy
+
+    return strategy
+
+
+def get_unstructured_languages() -> list[str]:
+    """Get the OCR languages for the Unstructured API.
+
+    Languages should be specified as ISO 639-3 codes (e.g., 'eng', 'deu', 'fra').
+    Multiple languages can be specified separated by commas.
+
+    Default languages: English (eng) and German (deu)
+
+    Common language codes:
+    - eng: English
+    - deu: German
+    - fra: French
+    - spa: Spanish
+    - ita: Italian
+    - por: Portuguese
+    - rus: Russian
+    - ara: Arabic
+    - zho: Chinese
+    - jpn: Japanese
+    - kor: Korean
+
+    Returns:
+        List of language codes for OCR processing. Falls back to the
+        defaults when the variable is unset or contains no codes.
+    """
+    default_languages = ["eng", "deu"]
+    languages_str = os.getenv("UNSTRUCTURED_LANGUAGES", ",".join(default_languages))
+
+    # Split by comma and clean up whitespace, dropping empty entries
+    languages = [lang.strip() for lang in languages_str.split(",") if lang.strip()]
+
+    if not languages:
+        logging.warning(
+            "No languages specified in UNSTRUCTURED_LANGUAGES. Using default: %s",
+            ",".join(default_languages),
+        )
+        return default_languages
+
+    return languages
diff --git a/nextcloud_mcp_server/utils/document_parser.py b/nextcloud_mcp_server/utils/document_parser.py
index 69674a5..145c61d 100644
--- a/nextcloud_mcp_server/utils/document_parser.py
+++ b/nextcloud_mcp_server/utils/document_parser.py
@@ -86,11 +86,12 @@ async def parse_document(
         try:
             from nextcloud_mcp_server.client.unstructured_client import UnstructuredClient
             client = UnstructuredClient()
+            # The client will automatically use environment configuration
+            # (UNSTRUCTURED_STRATEGY and UNSTRUCTURED_LANGUAGES)
             return await client.partition_document(
                 content=content,
                 filename=filename or f"document.{doc_type}",
                 content_type=content_type,
-                strategy="auto"
             )
         except Exception as e:
             logger.error(f"Unstructured API parsing failed: {e}")