Merge pull request #190 from yuisheaven/feature/introduce_files_parsing_with_unstructured_service_for_webdav_files_retrieval

Introduce files parsing with "unstructured" service for webdav files retrieval
2025-10-25 19:11:27 +02:00
parent 553e84e5f2 f0e5333e43
commit a19017c686
12 changed files with 1767 additions and 949 deletions
@@ -38,6 +38,8 @@ uv run pytest -m integration -v
 uv run pytest -m "not integration" -v
 ```

+> **Hint:** If tests fail because of missing environment variables, the `.env` file has usually not been created yet or is not configured correctly.
+
 ### Load Testing
 ```bash
 # Run benchmark with default settings (10 workers, 30 seconds)
@@ -51,18 +51,33 @@ services:
      - ./tests/fixtures/test_recipe.html:/usr/share/nginx/html/test_recipe.html:ro
      - ./tests/fixtures/nginx.conf:/etc/nginx/nginx.conf:ro

+  unstructured:
+    image: downloads.unstructured.io/unstructured-io/unstructured-api:latest
+    restart: always
+    ports:
+      - 127.0.0.1:8002:8000
+    # Unstructured API runs on port 8000 internally
+    # We expose it on 8002 externally to avoid conflict
+
  mcp:
    build: .
    command: ["--transport", "streamable-http"]
    restart: always
    depends_on:
      - app
+      - unstructured
    ports:
      - 127.0.0.1:8000:8000
    environment:
      - NEXTCLOUD_HOST=http://app:80
      - NEXTCLOUD_USERNAME=admin
      - NEXTCLOUD_PASSWORD=admin
+      - ENABLE_UNSTRUCTURED_PARSING=true
+      - UNSTRUCTURED_API_URL=http://unstructured:8000
+      - UNSTRUCTURED_STRATEGY=hi_res
+      - UNSTRUCTURED_LANGUAGES=deu,eng
+    #volumes:
+      #- ./nextcloud_mcp_server:/app/nextcloud_mcp_server:ro

  mcp-oauth:
    build: .
@@ -21,3 +21,27 @@ NEXTCLOUD_MCP_SERVER_URL=http://localhost:8000
 # - If these are set, OAuth mode is disabled
 NEXTCLOUD_USERNAME=
 NEXTCLOUD_PASSWORD=
+
+# Document Parsing Configuration
+# Enable/disable unstructured parsing for documents (PDF, DOCX, etc.)
+ENABLE_UNSTRUCTURED_PARSING=true
+
+# Unstructured API endpoint (default for docker-compose setup)
+UNSTRUCTURED_API_URL=http://unstructured:8000
+
+# Parsing strategy for the Unstructured service
+# Valid values: auto, fast, hi_res (an invalid value falls back to hi_res)
+# - auto: Automatically choose the best strategy based on document type (used when the variable is unset)
+# - fast: Fast parsing without OCR - best for simple text documents
+# - hi_res: High-resolution parsing with OCR - best for scanned documents, images, and complex layouts
+UNSTRUCTURED_STRATEGY=hi_res
+
+# Languages for OCR and document parsing (comma-separated ISO 639-3 language codes)
+# Default: eng,deu (English and German)
+# Common language codes:
+#   eng = English       deu = German        fra = French
+#   spa = Spanish       ita = Italian       por = Portuguese
+#   rus = Russian       ara = Arabic        zho = Chinese
+#   jpn = Japanese      kor = Korean
+# Example for English, German, and French: UNSTRUCTURED_LANGUAGES=eng,deu,fra
+UNSTRUCTURED_LANGUAGES=eng,deu
@@ -0,0 +1,170 @@
+"""HTTP client for Unstructured API."""
+
+import io
+import logging
+from typing import Optional, Tuple
+
+import httpx
+
+from nextcloud_mcp_server.config import (
+    get_unstructured_api_url,
+    get_unstructured_languages,
+    get_unstructured_strategy,
+)
+
+logger = logging.getLogger(__name__)
+
+
+class UnstructuredClient:
+    """Client for interacting with the Unstructured API.
+
+    The Unstructured API provides document parsing capabilities for various formats
+    including PDF, DOCX, images with OCR, and more.
+
+    API Documentation: https://docs.unstructured.io/api-reference/api-services/api-parameters
+    """
+
+    def __init__(self, api_url: Optional[str] = None, timeout: int = 120):
+        """Initialize the Unstructured API client.
+
+        Args:
+            api_url: Base URL of the Unstructured API. If None, will use config.
+            timeout: Request timeout in seconds (default: 120 for large documents)
+
+        Raises:
+            ValueError: If neither the argument nor the configuration yields a URL.
+        """
+        # Trailing slashes are dropped so that appending an endpoint path never
+        # produces "//" in the request URL.
+        self.api_url = (api_url or get_unstructured_api_url() or "").rstrip("/")
+        self.timeout = timeout
+
+        if not self.api_url:
+            raise ValueError(
+                "Unstructured API URL not configured. "
+                "Set ENABLE_UNSTRUCTURED_PARSING=true and UNSTRUCTURED_API_URL in environment."
+            )
+
+        logger.info("Initialized UnstructuredClient with API URL: %s", self.api_url)
+
+    async def partition_document(
+        self,
+        content: bytes,
+        filename: str,
+        content_type: Optional[str] = None,
+        strategy: Optional[str] = None,
+        languages: Optional[list[str]] = None,
+        extract_image_block_types: Optional[list[str]] = None,
+    ) -> Tuple[str, dict]:
+        """Parse a document using the Unstructured API.
+
+        Args:
+            content: The document content as bytes
+            filename: The filename sent with the upload
+            content_type: Optional MIME type; "application/octet-stream" is sent
+                         when it is missing.
+            strategy: Parsing strategy - "auto", "fast", or "hi_res" (OCR-based).
+                     If None, uses the value from UNSTRUCTURED_STRATEGY env var.
+            languages: List of language codes for OCR (e.g., ["eng", "deu"]).
+                      If None, uses the value from UNSTRUCTURED_LANGUAGES env var.
+            extract_image_block_types: Types of elements to extract from images
+
+        Returns:
+            Tuple of (parsed_text, metadata) where:
+            - parsed_text: The non-empty "text" fields of all returned elements,
+              joined by blank lines
+            - metadata: Element count, text length, per-type element counts and
+              the strategy/languages that were used
+
+        Raises:
+            Exception: If the HTTP request fails (chained from the underlying
+                httpx.HTTPError) or the response cannot be processed.
+        """
+        # Use environment configuration as defaults
+        if strategy is None:
+            strategy = get_unstructured_strategy()
+
+        if languages is None:
+            languages = get_unstructured_languages()
+
+        # Prepare the multipart form data
+        files = {
+            "files": (
+                filename,
+                io.BytesIO(content),
+                content_type or "application/octet-stream",
+            )
+        }
+
+        # Prepare the request data
+        # NOTE(review): list values are sent as a single comma-separated form
+        # field - confirm the deployed API version accepts this encoding.
+        data = {
+            "strategy": strategy,
+            "languages": ",".join(languages),
+        }
+
+        if extract_image_block_types:
+            data["extract_image_block_types"] = ",".join(extract_image_block_types)
+
+        logger.debug(
+            "Partitioning document '%s' with strategy '%s', languages: %s",
+            filename,
+            strategy,
+            languages,
+        )
+
+        try:
+            async with httpx.AsyncClient(timeout=self.timeout) as client:
+                response = await client.post(
+                    f"{self.api_url}/general/v0/general",
+                    files=files,
+                    data=data,
+                )
+                response.raise_for_status()
+
+                # Parse the response
+                elements = response.json()
+
+                # Fail with a clear message instead of an opaque attribute or
+                # type error further down when the payload has another shape.
+                if not isinstance(elements, list):
+                    raise ValueError(
+                        "Unexpected response payload of type "
+                        f"{type(elements).__name__}; expected a list of elements"
+                    )
+
+                # Extract text from elements
+                # Each element has a "text" field
+                texts = []
+                element_types = {}
+
+                for element in elements:
+                    text = element.get("text")
+                    if text:
+                        texts.append(text)
+
+                    # Track element types
+                    el_type = element.get("type", "unknown")
+                    element_types[el_type] = element_types.get(el_type, 0) + 1
+
+                parsed_text = "\n\n".join(texts)
+
+                # Collect metadata
+                metadata = {
+                    "element_count": len(elements),
+                    "text_length": len(parsed_text),
+                    "element_types": element_types,
+                    "strategy": strategy,
+                    "languages": languages,
+                    "parsing_method": "unstructured_api",
+                }
+
+                logger.debug(
+                    "Successfully parsed document: %d elements, %d characters",
+                    len(elements),
+                    len(parsed_text),
+                )
+
+                return parsed_text, metadata
+
+        except httpx.HTTPError as e:
+            logger.error("HTTP error calling Unstructured API: %s", e)
+            raise Exception(
+                f"Failed to parse document via Unstructured API: {str(e)}"
+            ) from e
+        except Exception as e:
+            logger.error("Unexpected error parsing document: %s", e)
+            raise Exception(f"Failed to parse document: {str(e)}") from e
+
+    async def health_check(self) -> bool:
+        """Check if the Unstructured API is available.
+
+        Sends a GET request to the "/healthcheck" endpoint with a 5 second
+        timeout. Any exception is logged as a warning and reported as unhealthy.
+
+        Returns:
+            True if the endpoint answered with status 200, False otherwise.
+        """
+        try:
+            async with httpx.AsyncClient(timeout=5) as client:
+                response = await client.get(f"{self.api_url}/healthcheck")
+                return response.status_code == 200
+        except Exception as e:
+            logger.warning("Unstructured API health check failed: %s", e)
+            return False
@@ -1,4 +1,6 @@
 import logging.config
+import os
+from typing import Optional

 LOGGING_CONFIG = {
    "version": 1,
@@ -51,3 +53,88 @@ LOGGING_CONFIG = {

 def setup_logging():
    logging.config.dictConfig(LOGGING_CONFIG)
+
+
+# Document Parsing Configuration
+def get_unstructured_api_url() -> Optional[str]:
+    """Get the Unstructured API URL from environment variables.
+
+    Returns:
+        The value of UNSTRUCTURED_API_URL (or "http://unstructured:8000" when
+        the variable is unset) if parsing is enabled, None otherwise.
+    """
+    # Reuse the flag helper so both functions always agree on what "enabled" means.
+    if not is_unstructured_parsing_enabled():
+        return None
+
+    return os.getenv("UNSTRUCTURED_API_URL", "http://unstructured:8000")
+
+
+def is_unstructured_parsing_enabled() -> bool:
+    """Check if unstructured document parsing is enabled.
+
+    Reads ENABLE_UNSTRUCTURED_PARSING; the comparison ignores case and
+    surrounding whitespace. Parsing is enabled when the variable is unset.
+
+    Returns:
+        True if the value is "true", False for any other value.
+    """
+    flag = os.getenv("ENABLE_UNSTRUCTURED_PARSING", "true")
+    return flag.strip().lower() == "true"
+
+
+def get_unstructured_strategy() -> str:
+    """Get the parsing strategy for the Unstructured API.
+
+    The value is read from UNSTRUCTURED_STRATEGY and compared ignoring case
+    and surrounding whitespace.
+
+    Valid values are:
+    - 'auto': Automatically choose the best strategy (used when the variable is unset)
+    - 'fast': Fast parsing without OCR
+    - 'hi_res': High-resolution parsing with OCR for better accuracy
+
+    Returns:
+        The parsing strategy to use. Any other value is reported with a
+        warning and replaced by 'hi_res'.
+    """
+    # Strip whitespace so a value such as "fast " is not treated as invalid.
+    strategy = os.getenv("UNSTRUCTURED_STRATEGY", "auto").strip().lower()
+    valid_strategies = ("auto", "fast", "hi_res")
+
+    if strategy not in valid_strategies:
+        logging.warning(
+            "Invalid UNSTRUCTURED_STRATEGY '%s'. Using 'hi_res'. Valid options: %s",
+            strategy,
+            ", ".join(valid_strategies),
+        )
+        return "hi_res"
+
+    return strategy
+
+
+def get_unstructured_languages() -> list[str]:
+    """Return the OCR language codes configured for the Unstructured API.
+
+    The codes are read from UNSTRUCTURED_LANGUAGES as a comma-separated list
+    of ISO 639-3 codes (e.g., 'eng', 'deu', 'fra'). Whitespace around each
+    code is removed and empty entries are dropped.
+
+    When the variable is unset, or contains no usable code, English (eng)
+    and German (deu) are used; the latter case also logs a warning.
+
+    Common language codes:
+    - eng: English
+    - deu: German
+    - fra: French
+    - spa: Spanish
+    - ita: Italian
+    - por: Portuguese
+    - rus: Russian
+    - ara: Arabic
+    - zho: Chinese
+    - jpn: Japanese
+    - kor: Korean
+
+    Returns:
+        List of language codes for OCR processing.
+    """
+    raw_value = os.getenv("UNSTRUCTURED_LANGUAGES", "eng,deu")
+
+    # Trim every comma-separated piece and keep only the non-empty ones
+    codes = [code for code in map(str.strip, raw_value.split(",")) if code]
+    if codes:
+        return codes
+
+    logging.warning(
+        "No languages specified in UNSTRUCTURED_LANGUAGES. Using default: eng,deu"
+    )
+    return ["eng", "deu"]
@@ -2,6 +2,12 @@ import logging

 from mcp.server.fastmcp import Context, FastMCP

+from nextcloud_mcp_server.client import NextcloudClient
+from nextcloud_mcp_server.utils.document_parser import (
+    is_parseable_document,
+    parse_document,
+)
+from nextcloud_mcp_server.config import is_unstructured_parsing_enabled
 from nextcloud_mcp_server.auth import require_scopes
 from nextcloud_mcp_server.context import get_client
 from nextcloud_mcp_server.models import DirectoryListing, FileInfo, SearchFilesResponse
@@ -53,12 +59,49 @@ def configure_webdav_tools(mcp: FastMCP):
            path: Full path to the file to read

        Returns:
-            Dict with path, content, content_type, size, and encoding (if binary)
-            Text files are decoded to UTF-8, binary files are base64 encoded
+            Dict with path, content, content_type, size, and optional parsing metadata
+            - Text files are decoded to UTF-8
+            - Documents (PDF, DOCX, etc.) are parsed and text is extracted
+            - Other binary files are base64 encoded
+
+        Examples:
+            # Read a text file
+            result = await nc_webdav_read_file("Documents/readme.txt")
+            logger.info(result['content'])  # Decoded text content
+
+            # Read a PDF document (automatically parsed)
+            result = await nc_webdav_read_file("Documents/report.pdf")
+            logger.info(result['content'])  # Extracted text from PDF
+            logger.info(result['parsing_metadata'])  # Document parsing info
+
+            # Read a binary file
+            result = await nc_webdav_read_file("Images/photo.jpg")
+            logger.info(result['encoding'])  # 'base64'
        """
        client = get_client(ctx)
        content, content_type = await client.webdav.read_file(path)

+        # Check if this is a parseable document (PDF, DOCX, etc.)
+        if is_unstructured_parsing_enabled() and is_parseable_document(content_type):
+            try:
+                logger.info(f"Parsing document '{path}' of type '{content_type}'")
+                parsed_text, metadata = await parse_document(
+                    content, content_type, filename=path
+                )
+                return {
+                    "path": path,
+                    "content": parsed_text,
+                    "content_type": content_type,
+                    "size": len(content),
+                    "parsed": True,
+                    "parsing_metadata": metadata,
+                }
+            except Exception as e:
+                logger.warning(
+                    f"Failed to parse document '{path}', falling back to base64: {e}"
+                )
+                # Fall through to base64 encoding on parse failure
+
        # For text files, decode content for easier viewing
        if content_type and content_type.startswith("text/"):
            try:
@@ -0,0 +1 @@
+"""Utility functions for the Nextcloud MCP server."""
@@ -0,0 +1,130 @@
+"""Document parsing utilities based on the "unstructured" microservice"""
+
+import logging
+from typing import Optional, Tuple
+
+from nextcloud_mcp_server.config import is_unstructured_parsing_enabled
+
+logger = logging.getLogger(__name__)
+
+# Mapping of supported MIME types to a short document-type label.
+# The keys decide whether a file is sent to the parser at all; the label is
+# used in log messages and fallback metadata, and as the extension of the
+# default upload filename ("document.<label>") when no filename is given.
+PARSEABLE_MIME_TYPES = {
+    # PDF documents
+    "application/pdf": "pdf",
+    # Microsoft Word documents
+    "application/vnd.openxmlformats-officedocument.wordprocessingml.document": "docx",
+    "application/msword": "doc",
+    # Microsoft PowerPoint
+    "application/vnd.openxmlformats-officedocument.presentationml.presentation": "pptx",
+    "application/vnd.ms-powerpoint": "ppt",
+    # Microsoft Excel
+    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": "xlsx",
+    "application/vnd.ms-excel": "xls",
+    # Other document formats
+    "application/rtf": "rtf",
+    "text/rtf": "rtf",
+    "application/vnd.oasis.opendocument.text": "odt",
+    "application/epub+zip": "epub",
+    # Email formats
+    "message/rfc822": "eml",
+    "application/vnd.ms-outlook": "msg",
+    # Image formats (for OCR)
+    "image/jpeg": "image",
+    "image/png": "image",
+    "image/tiff": "image",
+    "image/bmp": "image",
+}
+
+
+def is_parseable_document(content_type: Optional[str]) -> bool:
+    """Tell whether a MIME type is one of the parseable document types.
+
+    Args:
+        content_type: The MIME type of the document; may carry parameters
+            and may be None or empty.
+
+    Returns:
+        True if the bare MIME type is a key of PARSEABLE_MIME_TYPES,
+        False otherwise (including for a missing content type).
+    """
+    if content_type:
+        # Ignore any parameters (e.g., "application/pdf; charset=utf-8")
+        mime_type, _, _ = content_type.partition(";")
+        return mime_type.strip().lower() in PARSEABLE_MIME_TYPES
+
+    return False
+
+
+def _build_fallback_result(
+    content: bytes,
+    doc_type: str,
+    content_type: Optional[str],
+    error: Optional[str] = None,
+) -> Tuple[str, dict]:
+    """Build the placeholder result returned when a document was not parsed."""
+    import base64
+
+    # 150 bytes encode to exactly 200 base64 characters, so only the part of
+    # the file that ends up in the preview is encoded.
+    preview = base64.b64encode(content[:150]).decode("ascii")
+    parsed_text = f"Document could not be parsed. Base64 content: {preview}..."
+    metadata = {
+        "document_type": doc_type,
+        "mime_type": content_type,
+        "element_count": 0,
+        "text_length": len(parsed_text),
+        "parsing_method": "fallback_base64",
+    }
+    if error is not None:
+        metadata["error"] = error
+    return parsed_text, metadata
+
+
+async def parse_document(
+    content: bytes, content_type: Optional[str], filename: Optional[str] = None
+) -> Tuple[str, dict]:
+    """Parse a document using the Unstructured API.
+
+    Args:
+        content: The document content as bytes
+        content_type: The MIME type of the document
+        filename: Optional filename sent with the upload; defaults to
+            "document.<type label>" when omitted
+
+    Returns:
+        Tuple of (parsed_text, metadata) where:
+        - parsed_text: The extracted text content
+        - metadata: Additional metadata about the parsing
+
+        When parsing is disabled or the API call fails, no exception is
+        raised; instead parsed_text is a placeholder containing a truncated
+        base64 preview and metadata["parsing_method"] is "fallback_base64"
+        (with the failure message under metadata["error"] if there was one).
+
+    Raises:
+        ValueError: If the document type is not supported
+    """
+    if not is_parseable_document(content_type):
+        raise ValueError(f"Document type '{content_type}' is not supported for parsing")
+
+    base_content_type = (
+        content_type.split(";")[0].strip().lower() if content_type else ""
+    )
+    doc_type = PARSEABLE_MIME_TYPES.get(base_content_type, "unknown")
+
+    logger.debug(f"Parsing document of type '{doc_type}' (MIME: {content_type})")
+
+    # Check if unstructured parsing is enabled via environment
+    if not is_unstructured_parsing_enabled():
+        logger.debug(
+            "Unstructured parsing is disabled, returning base64 encoded content as fallback"
+        )
+        return _build_fallback_result(content, doc_type, content_type)
+
+    logger.debug("Using Unstructured API for parsing")
+    try:
+        from nextcloud_mcp_server.client.unstructured_client import (
+            UnstructuredClient,
+        )
+
+        client = UnstructuredClient()
+        # The client will automatically use environment configuration
+        # (UNSTRUCTURED_STRATEGY and UNSTRUCTURED_LANGUAGES)
+        return await client.partition_document(
+            content=content,
+            filename=filename or f"document.{doc_type}",
+            content_type=content_type,
+        )
+    except Exception as e:
+        logger.error(f"Unstructured API parsing failed: {e}")
+        # NOTE(review): the failure is converted into a placeholder result, so
+        # callers can only detect it via metadata["parsing_method"].
+        return _build_fallback_result(content, doc_type, content_type, error=str(e))
@@ -91,6 +91,7 @@ dev = [
    "pytest-playwright-asyncio>=0.7.1",
    "pytest-timeout>=2.3.1",
    "ruff>=0.11.13",
+    "reportlab>=4.0.0",
 ]

 [project.scripts]
@@ -0,0 +1,148 @@
+"""Integration tests for Unstructured API functionality."""
+
+import json
+import logging
+import uuid
+from io import BytesIO
+
+import pytest
+from mcp.client.session import ClientSession
+from reportlab.lib.pagesizes import letter
+from reportlab.pdfgen import canvas
+
+from nextcloud_mcp_server.client import NextcloudClient
+
+logger = logging.getLogger(__name__)
+
+
+@pytest.fixture
+async def test_base_path(nc_client: NextcloudClient):
+    """Create a uniquely named test directory and yield its path.
+
+    The directory name carries a random 8-character hex suffix so separate
+    runs do not share a directory. After the test the directory is deleted
+    on a best-effort basis.
+    """
+    test_dir = f"mcp_test_unstructured_{uuid.uuid4().hex[:8]}"
+    await nc_client.webdav.create_directory(test_dir)
+    yield test_dir
+    # Teardown: a failed delete must not turn a passing test into an error
+    try:
+        await nc_client.webdav.delete_resource(test_dir)
+    except Exception:
+        pass  # Ignore cleanup errors
+
+
+def create_test_pdf(text: str) -> bytes:
+    """Create a simple PDF document with the given text.
+
+    Args:
+        text: The string drawn at position (100, 750) on a letter-sized page.
+
+    Returns:
+        The bytes of the generated PDF file.
+    """
+    buffer = BytesIO()
+    c = canvas.Canvas(buffer, pagesize=letter)
+    c.drawString(100, 750, text)
+    c.save()
+    # getvalue() returns the whole buffer regardless of the stream position,
+    # so no seek is needed first.
+    return buffer.getvalue()
+
+
+@pytest.mark.integration
+async def test_unstructured_api_enabled_parsing(
+    nc_client: NextcloudClient, test_base_path: str, nc_mcp_client: ClientSession
+):
+    """Test that documents are parsed using the Unstructured API when enabled.
+
+    Uploads a generated one-line PDF over WebDAV, reads it back through the
+    "nc_webdav_read_file" MCP tool, and checks that the result contains
+    extracted text plus parsing metadata naming "unstructured_api".
+    """
+    test_file = f"{test_base_path}/test_unstructured_pdf.pdf"
+    test_text = "This is a test PDF document for Unstructured API parsing"
+
+    try:
+        # Create a simple PDF
+        pdf_content = create_test_pdf(test_text)
+
+        # Upload the PDF
+        await nc_client.webdav.write_file(
+            test_file, pdf_content, content_type="application/pdf"
+        )
+        logger.info(f"Uploaded PDF file: {test_file}")
+
+        # Read the PDF using MCP tool (should parse via Unstructured API)
+        mcp_result = await nc_mcp_client.call_tool(
+            "nc_webdav_read_file", arguments={"path": test_file}
+        )
+
+        # Extract content from the MCP result
+        if hasattr(mcp_result.content[0], "text"):
+            result_text = mcp_result.content[0].text
+        else:
+            # Fallback for other content types
+            result_text = str(mcp_result.content[0])
+
+        # Parse the JSON response
+        result = json.loads(result_text)
+
+        # Verify the result structure
+        assert "path" in result
+        assert "content" in result
+        assert "content_type" in result
+        assert "parsed" in result  # Should be present when parsing succeeds
+
+        # The content should be readable text, not base64
+        content = result["content"]
+        assert isinstance(content, str)
+        assert len(content) > 0
+        # Only a single word is checked, case-insensitively, rather than the
+        # full sentence written into the PDF
+        assert "test" in content.lower()  # Should contain our test text
+
+        # Should have parsing metadata
+        assert "parsing_metadata" in result
+        parsing_metadata = result["parsing_metadata"]
+        assert parsing_metadata["parsing_method"] == "unstructured_api"
+
+        logger.info("Successfully parsed PDF using Unstructured API")
+
+    finally:
+        # Clean up the uploaded file on a best-effort basis
+        try:
+            await nc_client.webdav.delete_resource(test_file)
+        except Exception:
+            pass  # Ignore cleanup errors
+
+
+@pytest.mark.integration
+async def test_unstructured_api_with_docx(
+    nc_client: NextcloudClient, test_base_path: str, nc_mcp_client: ClientSession
+):
+    """Test reading a file uploaded with the DOCX content type.
+
+    The uploaded payload is plain bytes rather than a real DOCX archive, so
+    this test only verifies that the tool returns a structurally valid
+    result (path, content, content_type). It does not assert that parsing
+    succeeded or what the content is.
+    """
+    test_file = f"{test_base_path}/test_unstructured_docx.docx"
+    try:
+        # Create a simple DOCX-like file for testing purposes
+        # No DOCX library is used here; the payload is just a byte string
+        docx_content = (
+            b"This is a mock DOCX file content for testing Unstructured API parsing"
+        )
+
+        # Upload the file
+        await nc_client.webdav.write_file(
+            test_file,
+            docx_content,
+            content_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+        )
+        logger.info(f"Uploaded DOCX file: {test_file}")
+
+        # Read the file using MCP tool
+        mcp_result = await nc_mcp_client.call_tool(
+            "nc_webdav_read_file", arguments={"path": test_file}
+        )
+
+        # Extract content from the MCP result
+        if hasattr(mcp_result.content[0], "text"):
+            result_text = mcp_result.content[0].text
+        else:
+            # Fallback for other content types
+            result_text = str(mcp_result.content[0])
+
+        # Parse the JSON response
+        result = json.loads(result_text)
+
+        # Verify the result structure
+        assert "path" in result
+        assert "content" in result
+        assert "content_type" in result
+
+        logger.info("Successfully processed DOCX file with Unstructured API")
+
+    finally:
+        # Clean up the uploaded file on a best-effort basis
+        try:
+            await nc_client.webdav.delete_resource(test_file)
+        except Exception:
+            pass  # Ignore cleanup errors
@@ -0,0 +1,172 @@
+"""Unit tests for Unstructured API configuration."""
+
+import os
+
+import pytest
+
+from nextcloud_mcp_server.client.unstructured_client import UnstructuredClient
+from nextcloud_mcp_server.config import (
+    get_unstructured_languages,
+    get_unstructured_strategy,
+)
+
+
+class TestUnstructuredStrategy:
+    """Test strategy configuration.
+
+    Every test changes the environment through pytest's ``monkeypatch``
+    fixture, so a value that existed before the test is restored afterwards
+    instead of being removed for the rest of the session.
+    """
+
+    def test_strategy_default(self, monkeypatch):
+        """Test that strategy defaults to 'auto'."""
+        monkeypatch.delenv("UNSTRUCTURED_STRATEGY", raising=False)
+        assert get_unstructured_strategy() == "auto"
+
+    def test_strategy_custom_auto(self, monkeypatch):
+        """Test custom strategy 'auto'."""
+        monkeypatch.setenv("UNSTRUCTURED_STRATEGY", "auto")
+        assert get_unstructured_strategy() == "auto"
+
+    def test_strategy_custom_fast(self, monkeypatch):
+        """Test custom strategy 'fast'."""
+        monkeypatch.setenv("UNSTRUCTURED_STRATEGY", "fast")
+        assert get_unstructured_strategy() == "fast"
+
+    def test_strategy_custom_hi_res(self, monkeypatch):
+        """Test custom strategy 'hi_res'."""
+        monkeypatch.setenv("UNSTRUCTURED_STRATEGY", "hi_res")
+        assert get_unstructured_strategy() == "hi_res"
+
+    def test_strategy_invalid_fallback(self, monkeypatch, caplog):
+        """Test that invalid strategy falls back to 'hi_res'."""
+        import logging
+
+        monkeypatch.setenv("UNSTRUCTURED_STRATEGY", "invalid_strategy")
+        # Ensure logging is captured at WARNING level
+        with caplog.at_level(logging.WARNING):
+            strategy = get_unstructured_strategy()
+
+        assert strategy == "hi_res"
+        assert "Invalid UNSTRUCTURED_STRATEGY" in caplog.text
+
+    def test_strategy_case_insensitive(self, monkeypatch):
+        """Test that strategy is case-insensitive."""
+        monkeypatch.setenv("UNSTRUCTURED_STRATEGY", "HI_RES")
+        assert get_unstructured_strategy() == "hi_res"
+
+
+class TestUnstructuredLanguages:
+    """Test language configuration.
+
+    Every test changes the environment through pytest's ``monkeypatch``
+    fixture, so a value that existed before the test is restored afterwards
+    instead of being removed for the rest of the session.
+    """
+
+    def test_languages_default(self, monkeypatch):
+        """Test that languages default to English and German."""
+        monkeypatch.delenv("UNSTRUCTURED_LANGUAGES", raising=False)
+        assert get_unstructured_languages() == ["eng", "deu"]
+
+    def test_languages_single(self, monkeypatch):
+        """Test single language configuration."""
+        monkeypatch.setenv("UNSTRUCTURED_LANGUAGES", "eng")
+        assert get_unstructured_languages() == ["eng"]
+
+    def test_languages_multiple(self, monkeypatch):
+        """Test multiple languages configuration."""
+        monkeypatch.setenv("UNSTRUCTURED_LANGUAGES", "eng,fra,spa")
+        assert get_unstructured_languages() == ["eng", "fra", "spa"]
+
+    def test_languages_whitespace_trimming(self, monkeypatch):
+        """Test that whitespace is trimmed from language codes."""
+        monkeypatch.setenv("UNSTRUCTURED_LANGUAGES", "eng, deu , fra  ")
+        assert get_unstructured_languages() == ["eng", "deu", "fra"]
+
+    def test_languages_empty_fallback(self, monkeypatch, caplog):
+        """Test that empty languages string falls back to default."""
+        import logging
+
+        monkeypatch.setenv("UNSTRUCTURED_LANGUAGES", "")
+        with caplog.at_level(logging.WARNING):
+            languages = get_unstructured_languages()
+
+        assert languages == ["eng", "deu"]
+        assert "No languages specified" in caplog.text
+
+    def test_languages_only_whitespace_fallback(self, monkeypatch, caplog):
+        """Test that whitespace-only string falls back to default."""
+        import logging
+
+        monkeypatch.setenv("UNSTRUCTURED_LANGUAGES", "   ,  ,  ")
+        with caplog.at_level(logging.WARNING):
+            languages = get_unstructured_languages()
+
+        assert languages == ["eng", "deu"]
+        assert "No languages specified" in caplog.text
+
+
+class TestUnstructuredClientConfiguration:
+    """Test that UnstructuredClient respects configuration.
+
+    The environment is changed through pytest's ``monkeypatch`` fixture so
+    all values are restored after each test. ENABLE_UNSTRUCTURED_PARSING is
+    set explicitly because the client constructor raises ValueError when
+    parsing is disabled and no URL is passed in.
+    """
+
+    @pytest.mark.asyncio
+    async def test_client_uses_default_strategy(self, monkeypatch):
+        """Test that client uses default strategy from environment."""
+        monkeypatch.delenv("UNSTRUCTURED_STRATEGY", raising=False)
+        monkeypatch.setenv("ENABLE_UNSTRUCTURED_PARSING", "true")
+        monkeypatch.setenv("UNSTRUCTURED_API_URL", "http://test:8000")
+
+        client = UnstructuredClient()
+        assert client.api_url == "http://test:8000"
+        # The partition_document method should use get_unstructured_strategy() when strategy is None
+        # We can't test the actual call without a running API, but we can verify the config is read
+        assert get_unstructured_strategy() == "auto"
+
+    @pytest.mark.asyncio
+    async def test_client_uses_default_languages(self, monkeypatch):
+        """Test that client uses default languages from environment."""
+        monkeypatch.delenv("UNSTRUCTURED_LANGUAGES", raising=False)
+        monkeypatch.setenv("ENABLE_UNSTRUCTURED_PARSING", "true")
+        monkeypatch.setenv("UNSTRUCTURED_API_URL", "http://test:8000")
+
+        client = UnstructuredClient()
+        assert client.api_url == "http://test:8000"
+        # The partition_document method should use get_unstructured_languages() when languages is None
+        assert get_unstructured_languages() == ["eng", "deu"]
+
+    @pytest.mark.asyncio
+    async def test_client_uses_custom_configuration(self, monkeypatch):
+        """Test that client uses custom configuration from environment."""
+        monkeypatch.setenv("UNSTRUCTURED_STRATEGY", "hi_res")
+        monkeypatch.setenv("UNSTRUCTURED_LANGUAGES", "eng,fra,spa")
+        monkeypatch.setenv("ENABLE_UNSTRUCTURED_PARSING", "true")
+        monkeypatch.setenv("UNSTRUCTURED_API_URL", "http://test:8000")
+
+        client = UnstructuredClient()
+        assert client.api_url == "http://test:8000"
+        assert get_unstructured_strategy() == "hi_res"
+        assert get_unstructured_languages() == ["eng", "fra", "spa"]
				`@@ -0,0 +1 @@`
				`"""Utility functions for the Nextcloud MCP server."""`