ran ruff format via uv

2025-10-05 02:16:42 +02:00
parent c9a687171a
commit 3ff6346c03
5 changed files with 87 additions and 67 deletions
@@ -17,31 +17,31 @@ logger = logging.getLogger(__name__)

 class UnstructuredClient:
    """Client for interacting with the Unstructured API.
-    
+
    The Unstructured API provides document parsing capabilities for various formats
    including PDF, DOCX, images with OCR, and more.
-    
+
    API Documentation: https://docs.unstructured.io/api-reference/api-services/api-parameters
    """
-    
+
    def __init__(self, api_url: Optional[str] = None, timeout: int = 120):
        """Initialize the Unstructured API client.
-        
+
        Args:
            api_url: Base URL of the Unstructured API. If None, will use config.
            timeout: Request timeout in seconds (default: 120 for large documents)
        """
        self.api_url = api_url or get_unstructured_api_url()
        self.timeout = timeout
-        
+
        if not self.api_url:
            raise ValueError(
                "Unstructured API URL not configured. "
                "Set ENABLE_UNSTRUCTURED_PARSING=true and UNSTRUCTURED_API_URL in environment."
            )
-        
+
        logger.info(f"Initialized UnstructuredClient with API URL: {self.api_url}")
-    
+
    async def partition_document(
        self,
        content: bytes,
@@ -52,7 +52,7 @@ class UnstructuredClient:
        extract_image_block_types: Optional[list[str]] = None,
    ) -> Tuple[str, dict]:
        """Parse a document using the Unstructured API.
-        
+
        Args:
            content: The document content as bytes
            filename: The filename (used for format detection)
@@ -62,12 +62,12 @@ class UnstructuredClient:
            languages: List of language codes for OCR (e.g., ["eng", "deu"]).
                      If None, uses the value from UNSTRUCTURED_LANGUAGES env var.
            extract_image_block_types: Types of elements to extract from images
-            
+
        Returns:
            Tuple of (parsed_text, metadata) where:
            - parsed_text: The extracted text content
            - metadata: Additional metadata about the parsing
-            
+
        Raises:
            httpx.HTTPError: If the API request fails
            Exception: If parsing fails
@@ -75,29 +75,33 @@ class UnstructuredClient:
        # Use environment configuration as defaults
        if strategy is None:
            strategy = get_unstructured_strategy()
-        
+
        if languages is None:
            languages = get_unstructured_languages()
-        
+
        # Prepare the multipart form data
        files = {
-            "files": (filename, io.BytesIO(content), content_type or "application/octet-stream")
+            "files": (
+                filename,
+                io.BytesIO(content),
+                content_type or "application/octet-stream",
+            )
        }
-        
+
        # Prepare the request data
        data = {
            "strategy": strategy,
            "languages": ",".join(languages),
        }
-        
+
        if extract_image_block_types:
            data["extract_image_block_types"] = ",".join(extract_image_block_types)
-        
+
        logger.debug(
            f"Partitioning document '{filename}' with strategy '{strategy}', "
            f"languages: {languages}"
        )
-        
+
        try:
            async with httpx.AsyncClient(timeout=self.timeout) as client:
                response = await client.post(
@@ -106,25 +110,25 @@ class UnstructuredClient:
                    data=data,
                )
                response.raise_for_status()
-                
+
                # Parse the response
                elements = response.json()
-                
+
                # Extract text from elements
                # Each element has a "text" field
                texts = []
                element_types = {}
-                
+
                for element in elements:
                    if "text" in element and element["text"]:
                        texts.append(element["text"])
-                    
+
                    # Track element types
                    el_type = element.get("type", "unknown")
                    element_types[el_type] = element_types.get(el_type, 0) + 1
-                
+
                parsed_text = "\n\n".join(texts)
-                
+
                # Collect metadata
                metadata = {
                    "element_count": len(elements),
@@ -132,26 +136,28 @@ class UnstructuredClient:
                    "element_types": element_types,
                    "strategy": strategy,
                    "languages": languages,
-                    "parsing_method": "unstructured_api"
+                    "parsing_method": "unstructured_api",
                }
-                
+
                logger.debug(
                    f"Successfully parsed document: {len(elements)} elements, "
                    f"{len(parsed_text)} characters"
                )
-                
+
                return parsed_text, metadata
-                
+
        except httpx.HTTPError as e:
            logger.error(f"HTTP error calling Unstructured API: {e}")
-            raise Exception(f"Failed to parse document via Unstructured API: {str(e)}") from e
+            raise Exception(
+                f"Failed to parse document via Unstructured API: {str(e)}"
+            ) from e
        except Exception as e:
            logger.error(f"Unexpected error parsing document: {e}")
            raise Exception(f"Failed to parse document: {str(e)}") from e
-    
+
    async def health_check(self) -> bool:
        """Check if the Unstructured API is available.
-        
+
        Returns:
            True if the API is healthy, False otherwise.
        """
@@ -161,4 +167,4 @@ class UnstructuredClient:
                return response.status_code == 200
        except Exception as e:
            logger.warning(f"Unstructured API health check failed: {e}")
-            return False
+            return False
@@ -42,20 +42,20 @@ def setup_logging():
 # Document Parsing Configuration
 def get_unstructured_api_url() -> Optional[str]:
    """Get the Unstructured API URL from environment variables.
-    
+
    Returns:
        The Unstructured API URL if parsing is enabled, None otherwise.
    """
    enabled = os.getenv("ENABLE_UNSTRUCTURED_PARSING", "true").lower() == "true"
    if not enabled:
        return None
-    
+
    return os.getenv("UNSTRUCTURED_API_URL", "http://unstructured:8000")


 def is_unstructured_parsing_enabled() -> bool:
    """Check if unstructured document parsing is enabled.
-    
+
    Returns:
        True if enabled, False otherwise.
    """
@@ -64,36 +64,36 @@ def is_unstructured_parsing_enabled() -> bool:

 def get_unstructured_strategy() -> str:
    """Get the parsing strategy for the Unstructured API.
-    
+
    Valid values are:
    - 'auto': Automatically choose the best strategy (default)
    - 'fast': Fast parsing without OCR
    - 'hi_res': High-resolution parsing with OCR for better accuracy
-    
+
    Returns:
        The parsing strategy to use.
    """
    strategy = os.getenv("UNSTRUCTURED_STRATEGY", "auto").lower()
    valid_strategies = ["auto", "fast", "hi_res"]
-    
+
    if strategy not in valid_strategies:
        logging.warning(
            f"Invalid UNSTRUCTURED_STRATEGY '{strategy}'. Using 'hi_res'. "
            f"Valid options: {', '.join(valid_strategies)}"
        )
        return "hi_res"
-    
+
    return strategy


 def get_unstructured_languages() -> list[str]:
    """Get the OCR languages for the Unstructured API.
-    
+
    Languages should be specified as ISO 639-3 codes (e.g., 'eng', 'deu', 'fra').
    Multiple languages can be specified separated by commas.
-    
+
    Default languages: English (eng) and German (deu)
-    
+
    Common language codes:
    - eng: English
    - deu: German
@@ -106,17 +106,19 @@ def get_unstructured_languages() -> list[str]:
    - zho: Chinese
    - jpn: Japanese
    - kor: Korean
-    
+
    Returns:
        List of language codes for OCR processing.
    """
    languages_str = os.getenv("UNSTRUCTURED_LANGUAGES", "eng,deu")
-    
+
    # Split by comma and clean up whitespace
    languages = [lang.strip() for lang in languages_str.split(",") if lang.strip()]
-    
+
    if not languages:
-        logging.warning("No languages specified in UNSTRUCTURED_LANGUAGES. Using default: eng,deu")
+        logging.warning(
+            "No languages specified in UNSTRUCTURED_LANGUAGES. Using default: eng,deu"
+        )
        return ["eng", "deu"]
-    
+
    return languages
@@ -3,7 +3,10 @@ import logging
 from mcp.server.fastmcp import Context, FastMCP

 from nextcloud_mcp_server.client import NextcloudClient
-from nextcloud_mcp_server.utils.document_parser import is_parseable_document, parse_document
+from nextcloud_mcp_server.utils.document_parser import (
+    is_parseable_document,
+    parse_document,
+)
 from nextcloud_mcp_server.config import is_unstructured_parsing_enabled

 logger = logging.getLogger(__name__)
@@ -62,7 +65,7 @@ def configure_webdav_tools(mcp: FastMCP):
        content, content_type = await client.webdav.read_file(path)

        # Check if this is a parseable document (PDF, DOCX, etc.)
-        if (is_unstructured_parsing_enabled() and is_parseable_document(content_type)):
+        if is_unstructured_parsing_enabled() and is_parseable_document(content_type):
            try:
                logger.info(f"Parsing document '{path}' of type '{content_type}'")
                parsed_text, metadata = await parse_document(
@@ -1 +1 @@
-"""Utility functions for the Nextcloud MCP server."""
+"""Utility functions for the Nextcloud MCP server."""
@@ -35,56 +35,61 @@ PARSEABLE_MIME_TYPES = {
    "image/bmp": "image",
 }

+
 def is_parseable_document(content_type: Optional[str]) -> bool:
    """Check if a document type can be parsed.
-    
+
    Args:
        content_type: The MIME type of the document
-         
+
    Returns:
        True if the document can be parsed, False otherwise
    """
    if not content_type:
        return False
-    
+
    # Handle content types with additional parameters (e.g., "application/pdf; charset=utf-8")
    base_content_type = content_type.split(";")[0].strip().lower()
    return base_content_type in PARSEABLE_MIME_TYPES

+
 async def parse_document(
-    content: bytes,
-    content_type: Optional[str],
-    filename: Optional[str] = None
+    content: bytes, content_type: Optional[str], filename: Optional[str] = None
 ) -> Tuple[str, dict]:
    """Parse a document using the Unstructured API.
-    
+
    Args:
        content: The document content as bytes
        content_type: The MIME type of the document
        filename: Optional filename to help with format detection
-         
+
    Returns:
        Tuple of (parsed_text, metadata) where:
        - parsed_text: The extracted text content
        - metadata: Additional metadata about the parsing
-         
+
    Raises:
        ValueError: If the document type is not supported
        Exception: If parsing fails
    """
    if not is_parseable_document(content_type):
        raise ValueError(f"Document type '{content_type}' is not supported for parsing")
-    
-    base_content_type = content_type.split(";")[0].strip().lower() if content_type else ""
+
+    base_content_type = (
+        content_type.split(";")[0].strip().lower() if content_type else ""
+    )
    doc_type = PARSEABLE_MIME_TYPES.get(base_content_type, "unknown")
-    
+
    logger.debug(f"Parsing document of type '{doc_type}' (MIME: {content_type})")
-    
+
    # Check if unstructured parsing is enabled via environment
    if is_unstructured_parsing_enabled():
        logger.debug("Using Unstructured API for parsing")
        try:
-            from nextcloud_mcp_server.client.unstructured_client import UnstructuredClient
+            from nextcloud_mcp_server.client.unstructured_client import (
+                UnstructuredClient,
+            )
+
            client = UnstructuredClient()
            # The client will automatically use environment configuration
            # (UNSTRUCTURED_STRATEGY and UNSTRUCTURED_LANGUAGES)
@@ -97,6 +102,7 @@ async def parse_document(
            logger.error(f"Unstructured API parsing failed: {e}")
            # If unstructured parsing fails, return base64 as fallback
            import base64
+
            parsed_text = f"Document could not be parsed. Base64 content: {base64.b64encode(content).decode('ascii')[:200]}..."
            metadata = {
                "document_type": doc_type,
@@ -104,18 +110,21 @@ async def parse_document(
                "element_count": 0,
                "text_length": len(parsed_text),
                "parsing_method": "fallback_base64",
-                "error": str(e)
+                "error": str(e),
            }
            return parsed_text, metadata
    else:
-        logger.debug("Unstructured parsing is disabled, returning base64 encoded content as fallback")
+        logger.debug(
+            "Unstructured parsing is disabled, returning base64 encoded content as fallback"
+        )
        import base64
+
        parsed_text = f"Document could not be parsed. Base64 content: {base64.b64encode(content).decode('ascii')[:200]}..."
        metadata = {
            "document_type": doc_type,
            "mime_type": content_type,
            "element_count": 0,
            "text_length": len(parsed_text),
-            "parsing_method": "fallback_base64"
+            "parsing_method": "fallback_base64",
        }
-        return parsed_text, metadata
+        return parsed_text, metadata