From 3ff6346c03728ba366a52dcb1b48065de2b1b750 Mon Sep 17 00:00:00 2001 From: yuisheaven <91854357+yuisheaven@users.noreply.github.com> Date: Sun, 5 Oct 2025 02:16:42 +0200 Subject: [PATCH] ran ruff format via uv --- .../client/unstructured_client.py | 68 ++++++++++--------- nextcloud_mcp_server/config.py | 32 +++++---- nextcloud_mcp_server/server/webdav.py | 7 +- nextcloud_mcp_server/utils/__init__.py | 2 +- nextcloud_mcp_server/utils/document_parser.py | 45 +++++++----- 5 files changed, 87 insertions(+), 67 deletions(-) diff --git a/nextcloud_mcp_server/client/unstructured_client.py b/nextcloud_mcp_server/client/unstructured_client.py index dd6a289..b425a0c 100644 --- a/nextcloud_mcp_server/client/unstructured_client.py +++ b/nextcloud_mcp_server/client/unstructured_client.py @@ -17,31 +17,31 @@ logger = logging.getLogger(__name__) class UnstructuredClient: """Client for interacting with the Unstructured API. - + The Unstructured API provides document parsing capabilities for various formats including PDF, DOCX, images with OCR, and more. - + API Documentation: https://docs.unstructured.io/api-reference/api-services/api-parameters """ - + def __init__(self, api_url: Optional[str] = None, timeout: int = 120): """Initialize the Unstructured API client. - + Args: api_url: Base URL of the Unstructured API. If None, will use config. timeout: Request timeout in seconds (default: 120 for large documents) """ self.api_url = api_url or get_unstructured_api_url() self.timeout = timeout - + if not self.api_url: raise ValueError( "Unstructured API URL not configured. " "Set ENABLE_UNSTRUCTURED_PARSING=true and UNSTRUCTURED_API_URL in environment." ) - + logger.info(f"Initialized UnstructuredClient with API URL: {self.api_url}") - + async def partition_document( self, content: bytes, @@ -52,7 +52,7 @@ class UnstructuredClient: extract_image_block_types: Optional[list[str]] = None, ) -> Tuple[str, dict]: """Parse a document using the Unstructured API. - + Args: content: The document content as bytes filename: The filename (used for format detection) @@ -62,12 +62,12 @@ class UnstructuredClient: languages: List of language codes for OCR (e.g., ["eng", "deu"]). If None, uses the value from UNSTRUCTURED_LANGUAGES env var. extract_image_block_types: Types of elements to extract from images - + Returns: Tuple of (parsed_text, metadata) where: - parsed_text: The extracted text content - metadata: Additional metadata about the parsing - + Raises: httpx.HTTPError: If the API request fails Exception: If parsing fails @@ -75,29 +75,33 @@ class UnstructuredClient: # Use environment configuration as defaults if strategy is None: strategy = get_unstructured_strategy() - + if languages is None: languages = get_unstructured_languages() - + # Prepare the multipart form data files = { - "files": (filename, io.BytesIO(content), content_type or "application/octet-stream") + "files": ( + filename, + io.BytesIO(content), + content_type or "application/octet-stream", + ) } - + # Prepare the request data data = { "strategy": strategy, "languages": ",".join(languages), } - + if extract_image_block_types: data["extract_image_block_types"] = ",".join(extract_image_block_types) - + logger.debug( f"Partitioning document '{filename}' with strategy '{strategy}', " f"languages: {languages}" ) - + try: async with httpx.AsyncClient(timeout=self.timeout) as client: response = await client.post( @@ -106,25 +110,25 @@ class UnstructuredClient: data=data, ) response.raise_for_status() - + # Parse the response elements = response.json() - + # Extract text from elements # Each element has a "text" field texts = [] element_types = {} - + for element in elements: if "text" in element and element["text"]: texts.append(element["text"]) - + # Track element types el_type = element.get("type", "unknown") element_types[el_type] = element_types.get(el_type, 0) + 1 - + parsed_text = "\n\n".join(texts) - + # Collect metadata metadata = { "element_count": len(elements), @@ -132,26 +136,28 @@ class UnstructuredClient: "element_types": element_types, "strategy": strategy, "languages": languages, - "parsing_method": "unstructured_api" + "parsing_method": "unstructured_api", } - + logger.debug( f"Successfully parsed document: {len(elements)} elements, " f"{len(parsed_text)} characters" ) - + return parsed_text, metadata - + except httpx.HTTPError as e: logger.error(f"HTTP error calling Unstructured API: {e}") - raise Exception(f"Failed to parse document via Unstructured API: {str(e)}") from e + raise Exception( + f"Failed to parse document via Unstructured API: {str(e)}" + ) from e except Exception as e: logger.error(f"Unexpected error parsing document: {e}") raise Exception(f"Failed to parse document: {str(e)}") from e - + async def health_check(self) -> bool: """Check if the Unstructured API is available. - + Returns: True if the API is healthy, False otherwise. """ @@ -161,4 +167,4 @@ class UnstructuredClient: return response.status_code == 200 except Exception as e: logger.warning(f"Unstructured API health check failed: {e}") - return False \ No newline at end of file + return False diff --git a/nextcloud_mcp_server/config.py b/nextcloud_mcp_server/config.py index 0108990..d82310e 100644 --- a/nextcloud_mcp_server/config.py +++ b/nextcloud_mcp_server/config.py @@ -42,20 +42,20 @@ def setup_logging(): # Document Parsing Configuration def get_unstructured_api_url() -> Optional[str]: """Get the Unstructured API URL from environment variables. - + Returns: The Unstructured API URL if parsing is enabled, None otherwise. """ enabled = os.getenv("ENABLE_UNSTRUCTURED_PARSING", "true").lower() == "true" if not enabled: return None - + return os.getenv("UNSTRUCTURED_API_URL", "http://unstructured:8000") def is_unstructured_parsing_enabled() -> bool: """Check if unstructured document parsing is enabled. - + Returns: True if enabled, False otherwise. """ @@ -64,36 +64,36 @@ def is_unstructured_parsing_enabled() -> bool: def get_unstructured_strategy() -> str: """Get the parsing strategy for the Unstructured API. - + Valid values are: - 'auto': Automatically choose the best strategy (default) - 'fast': Fast parsing without OCR - 'hi_res': High-resolution parsing with OCR for better accuracy - + Returns: The parsing strategy to use. """ strategy = os.getenv("UNSTRUCTURED_STRATEGY", "auto").lower() valid_strategies = ["auto", "fast", "hi_res"] - + if strategy not in valid_strategies: logging.warning( f"Invalid UNSTRUCTURED_STRATEGY '{strategy}'. Using 'hi_res'. " f"Valid options: {', '.join(valid_strategies)}" ) return "hi_res" - + return strategy def get_unstructured_languages() -> list[str]: """Get the OCR languages for the Unstructured API. - + Languages should be specified as ISO 639-3 codes (e.g., 'eng', 'deu', 'fra'). Multiple languages can be specified separated by commas. - + Default languages: English (eng) and German (deu) - + Common language codes: - eng: English - deu: German @@ -106,17 +106,19 @@ def get_unstructured_languages() -> list[str]: - zho: Chinese - jpn: Japanese - kor: Korean - + Returns: List of language codes for OCR processing. """ languages_str = os.getenv("UNSTRUCTURED_LANGUAGES", "eng,deu") - + # Split by comma and clean up whitespace languages = [lang.strip() for lang in languages_str.split(",") if lang.strip()] - + if not languages: - logging.warning("No languages specified in UNSTRUCTURED_LANGUAGES. Using default: eng,deu") + logging.warning( + "No languages specified in UNSTRUCTURED_LANGUAGES. Using default: eng,deu" + ) return ["eng", "deu"] - + return languages diff --git a/nextcloud_mcp_server/server/webdav.py b/nextcloud_mcp_server/server/webdav.py index 66433e3..0ceaa96 100644 --- a/nextcloud_mcp_server/server/webdav.py +++ b/nextcloud_mcp_server/server/webdav.py @@ -3,7 +3,10 @@ import logging from mcp.server.fastmcp import Context, FastMCP from nextcloud_mcp_server.client import NextcloudClient -from nextcloud_mcp_server.utils.document_parser import is_parseable_document, parse_document +from nextcloud_mcp_server.utils.document_parser import ( + is_parseable_document, + parse_document, +) from nextcloud_mcp_server.config import is_unstructured_parsing_enabled logger = logging.getLogger(__name__) @@ -62,7 +65,7 @@ def configure_webdav_tools(mcp: FastMCP): content, content_type = await client.webdav.read_file(path) # Check if this is a parseable document (PDF, DOCX, etc.) - if (is_unstructured_parsing_enabled() and is_parseable_document(content_type)): + if is_unstructured_parsing_enabled() and is_parseable_document(content_type): try: logger.info(f"Parsing document '{path}' of type '{content_type}'") parsed_text, metadata = await parse_document( diff --git a/nextcloud_mcp_server/utils/__init__.py b/nextcloud_mcp_server/utils/__init__.py index 816a761..8a4b271 100644 --- a/nextcloud_mcp_server/utils/__init__.py +++ b/nextcloud_mcp_server/utils/__init__.py @@ -1 +1 @@ -"""Utility functions for the Nextcloud MCP server.""" \ No newline at end of file +"""Utility functions for the Nextcloud MCP server.""" diff --git a/nextcloud_mcp_server/utils/document_parser.py b/nextcloud_mcp_server/utils/document_parser.py index 145c61d..b7c809f 100644 --- a/nextcloud_mcp_server/utils/document_parser.py +++ b/nextcloud_mcp_server/utils/document_parser.py @@ -35,56 +35,61 @@ PARSEABLE_MIME_TYPES = { "image/bmp": "image", } + def is_parseable_document(content_type: Optional[str]) -> bool: """Check if a document type can be parsed. - + Args: content_type: The MIME type of the document - + Returns: True if the document can be parsed, False otherwise """ if not content_type: return False - + # Handle content types with additional parameters (e.g., "application/pdf; charset=utf-8") base_content_type = content_type.split(";")[0].strip().lower() return base_content_type in PARSEABLE_MIME_TYPES + async def parse_document( - content: bytes, - content_type: Optional[str], - filename: Optional[str] = None + content: bytes, content_type: Optional[str], filename: Optional[str] = None ) -> Tuple[str, dict]: """Parse a document using the Unstructured API. - + Args: content: The document content as bytes content_type: The MIME type of the document filename: Optional filename to help with format detection - + Returns: Tuple of (parsed_text, metadata) where: - parsed_text: The extracted text content - metadata: Additional metadata about the parsing - + Raises: ValueError: If the document type is not supported Exception: If parsing fails """ if not is_parseable_document(content_type): raise ValueError(f"Document type '{content_type}' is not supported for parsing") - - base_content_type = content_type.split(";")[0].strip().lower() if content_type else "" + + base_content_type = ( + content_type.split(";")[0].strip().lower() if content_type else "" + ) doc_type = PARSEABLE_MIME_TYPES.get(base_content_type, "unknown") - + logger.debug(f"Parsing document of type '{doc_type}' (MIME: {content_type})") - + # Check if unstructured parsing is enabled via environment if is_unstructured_parsing_enabled(): logger.debug("Using Unstructured API for parsing") try: - from nextcloud_mcp_server.client.unstructured_client import UnstructuredClient + from nextcloud_mcp_server.client.unstructured_client import ( + UnstructuredClient, + ) + client = UnstructuredClient() # The client will automatically use environment configuration # (UNSTRUCTURED_STRATEGY and UNSTRUCTURED_LANGUAGES) @@ -97,6 +102,7 @@ async def parse_document( logger.error(f"Unstructured API parsing failed: {e}") # If unstructured parsing fails, return base64 as fallback import base64 + parsed_text = f"Document could not be parsed. Base64 content: {base64.b64encode(content).decode('ascii')[:200]}..." metadata = { "document_type": doc_type, @@ -104,18 +110,21 @@ async def parse_document( "element_count": 0, "text_length": len(parsed_text), "parsing_method": "fallback_base64", - "error": str(e) + "error": str(e), } return parsed_text, metadata else: - logger.debug("Unstructured parsing is disabled, returning base64 encoded content as fallback") + logger.debug( + "Unstructured parsing is disabled, returning base64 encoded content as fallback" + ) import base64 + parsed_text = f"Document could not be parsed. Base64 content: {base64.b64encode(content).decode('ascii')[:200]}..." metadata = { "document_type": doc_type, "mime_type": content_type, "element_count": 0, "text_length": len(parsed_text), - "parsing_method": "fallback_base64" + "parsing_method": "fallback_base64", } - return parsed_text, metadata \ No newline at end of file + return parsed_text, metadata