added envs for unstructured to control OCR quality and OCR languages

2025-10-04 05:21:02 +02:00
parent df5f85e0c6
commit c9a687171a
5 changed files with 96 additions and 6 deletions
@@ -65,6 +65,8 @@ services:
      - NEXTCLOUD_PASSWORD=admin
      - ENABLE_UNSTRUCTURED_PARSING=true
      - UNSTRUCTURED_API_URL=http://unstructured:8000
+      - UNSTRUCTURED_STRATEGY=hi_res
+      - UNSTRUCTURED_LANGUAGES=deu,eng
    #volumes:
      #- ./nextcloud_mcp_server:/app/nextcloud_mcp_server:ro

@@ -8,3 +8,20 @@ ENABLE_UNSTRUCTURED_PARSING=true

 # Unstructured API endpoint (default for docker-compose setup)
 UNSTRUCTURED_API_URL=http://unstructured:8000
+
+# Parsing strategy for the Unstructured service
+# Valid values: auto, fast, hi_res
+# - auto: Automatically choose the best strategy based on document type (used when this variable is unset)
+# - fast: Fast parsing without OCR - best for simple text documents
+# - hi_res: High-resolution parsing with OCR - best for scanned documents, images, and complex layouts
+UNSTRUCTURED_STRATEGY=hi_res
+
+# Languages for OCR and document parsing (comma-separated ISO 639-3 language codes)
+# Default: eng,deu (English and German)
+# Common language codes:
+#   eng = English       deu = German        fra = French
+#   spa = Spanish       ita = Italian       por = Portuguese
+#   rus = Russian       ara = Arabic        zho = Chinese
+#   jpn = Japanese      kor = Korean
+# Example for English, German, and French: UNSTRUCTURED_LANGUAGES=eng,deu,fra
+UNSTRUCTURED_LANGUAGES=eng,deu
@@ -6,7 +6,11 @@ from typing import Optional, Tuple

 import httpx

-from nextcloud_mcp_server.config import get_unstructured_api_url
+from nextcloud_mcp_server.config import (
+    get_unstructured_api_url,
+    get_unstructured_languages,
+    get_unstructured_strategy,
+)

 logger = logging.getLogger(__name__)

@@ -43,7 +47,7 @@ class UnstructuredClient:
        content: bytes,
        filename: str,
        content_type: Optional[str] = None,
-        strategy: str = "auto",
+        strategy: Optional[str] = None,
        languages: Optional[list[str]] = None,
        extract_image_block_types: Optional[list[str]] = None,
    ) -> Tuple[str, dict]:
@@ -53,8 +57,10 @@ class UnstructuredClient:
            content: The document content as bytes
            filename: The filename (used for format detection)
            content_type: Optional MIME type
-            strategy: Parsing strategy - "auto", "fast", or "hi_res" (OCR-based)
-            languages: List of language codes for OCR (e.g., ["eng", "deu"])
+            strategy: Parsing strategy - "auto", "fast", or "hi_res" (OCR-based).
+                     If None, uses the value from UNSTRUCTURED_STRATEGY env var.
+            languages: List of language codes for OCR (e.g., ["eng", "deu"]).
+                      If None, uses the value from UNSTRUCTURED_LANGUAGES env var.
            extract_image_block_types: Types of elements to extract from images
            
        Returns:
@@ -66,8 +72,12 @@ class UnstructuredClient:
            httpx.HTTPError: If the API request fails
            Exception: If parsing fails
        """
+        # Use environment configuration as defaults
+        if strategy is None:
+            strategy = get_unstructured_strategy()
+        
        if languages is None:
-            languages = ["eng"]  # Default to English
+            languages = get_unstructured_languages()
        
        # Prepare the multipart form data
        files = {
@@ -60,3 +60,63 @@ def is_unstructured_parsing_enabled() -> bool:
        True if enabled, False otherwise.
    """
    return os.getenv("ENABLE_UNSTRUCTURED_PARSING", "true").lower() == "true"
+
+
+def get_unstructured_strategy() -> str:
+    """Get the parsing strategy for the Unstructured API.
+
+    Reads UNSTRUCTURED_STRATEGY, ignoring case and surrounding whitespace.
+    Valid values: 'auto' (default), 'fast' (no OCR), 'hi_res' (with OCR).
+
+    Returns:
+        The configured strategy, or 'auto' when the variable is unset,
+        empty, or not a valid value (a warning is logged in that case).
+    """
+    default_strategy = "auto"
+    valid_strategies = ("auto", "fast", "hi_res")
+    strategy = os.getenv("UNSTRUCTURED_STRATEGY", "").strip().lower() or default_strategy
+
+    if strategy not in valid_strategies:
+        # Fall back to the documented default rather than a different strategy.
+        logging.warning(
+            f"Invalid UNSTRUCTURED_STRATEGY '{strategy}'. Using '{default_strategy}'. "
+            f"Valid options: {', '.join(valid_strategies)}"
+        )
+        return default_strategy
+    return strategy
+
+
+def get_unstructured_languages() -> list[str]:
+    """Return the OCR language codes for the Unstructured API.
+
+    Reads UNSTRUCTURED_LANGUAGES as a comma-separated list of codes; when the
+    variable is unset, 'eng,deu' (English and German) is used instead.
+    Whitespace around each code is stripped and empty entries are skipped.
+
+    Codes are expected as ISO 639-3 identifiers, for example:
+    - eng: English, deu: German, fra: French
+    - spa: Spanish, ita: Italian, por: Portuguese
+    - rus: Russian, ara: Arabic, zho: Chinese
+    - jpn: Japanese, kor: Korean
+
+    Returns:
+        The non-empty codes in the order given, or ['eng', 'deu'] (after
+        logging a warning) when the variable contains no codes at all.
+    """
+    fallback = ["eng", "deu"]
+    raw_setting = os.getenv("UNSTRUCTURED_LANGUAGES", "eng,deu")
+
+    # Keep every non-blank entry, so input like "eng, ,deu," still works
+    codes = []
+    for entry in raw_setting.split(","):
+        code = entry.strip()
+        if code:
+            codes.append(code)
+
+    if codes:
+        return codes
+
+    # Only reachable when the variable is set but holds no codes
+    # (for example "" or ","); the unset case already defaults to eng,deu.
+    logging.warning("No languages specified in UNSTRUCTURED_LANGUAGES. Using default: eng,deu")
+    return fallback
@@ -86,11 +86,12 @@ async def parse_document(
        try:
            from nextcloud_mcp_server.client.unstructured_client import UnstructuredClient
            client = UnstructuredClient()
+            # The client will automatically use environment configuration
+            # (UNSTRUCTURED_STRATEGY and UNSTRUCTURED_LANGUAGES)
            return await client.partition_document(
                content=content,
                filename=filename or f"document.{doc_type}",
                content_type=content_type,
-                strategy="auto"
            )
        except Exception as e:
            logger.error(f"Unstructured API parsing failed: {e}")