diff --git a/nextcloud_mcp_server/client/unstructured_client.py b/nextcloud_mcp_server/client/unstructured_client.py
new file mode 100644
index 0000000..eac13c5
--- /dev/null
+++ b/nextcloud_mcp_server/client/unstructured_client.py
@@ -0,0 +1,163 @@
+"""HTTP client for Unstructured API."""
+
+import logging
+from collections import Counter
+from typing import Optional, Tuple
+
+import httpx
+
+from nextcloud_mcp_server.config import get_unstructured_api_url
+
+logger = logging.getLogger(__name__)
+
+
+class UnstructuredClient:
+    """Client for interacting with the Unstructured API.
+
+    The Unstructured API provides document parsing capabilities for various formats
+    including PDF, DOCX, images with OCR, and more.
+
+    API Documentation: https://docs.unstructured.io/api-reference/api-services/api-parameters
+    """
+
+    def __init__(self, api_url: Optional[str] = None, timeout: int = 120):
+        """Initialize the Unstructured API client.
+
+        Args:
+            api_url: Base URL of the Unstructured API. If None, will use config.
+            timeout: Request timeout in seconds (default: 120 for large documents)
+
+        Raises:
+            ValueError: If no API URL is given and none is configured.
+        """
+        resolved_url = api_url or get_unstructured_api_url()
+        if not resolved_url:
+            raise ValueError(
+                "Unstructured API URL not configured. "
+                "Set ENABLE_UNSTRUCTURED_PARSING=true and UNSTRUCTURED_API_URL in environment."
+            )
+
+        # Strip trailing slashes so the endpoint paths appended below never
+        # produce a double slash (e.g. "http://host:8000//general/v0/general").
+        self.api_url = resolved_url.rstrip("/")
+        self.timeout = timeout
+
+        logger.info(f"Initialized UnstructuredClient with API URL: {self.api_url}")
+
+    async def partition_document(
+        self,
+        content: bytes,
+        filename: str,
+        content_type: Optional[str] = None,
+        strategy: str = "auto",
+        languages: Optional[list[str]] = None,
+        extract_image_block_types: Optional[list[str]] = None,
+    ) -> Tuple[str, dict]:
+        """Parse a document using the Unstructured API.
+
+        Args:
+            content: The document content as bytes
+            filename: The filename (used for format detection)
+            content_type: Optional MIME type
+            strategy: Parsing strategy - "auto", "fast", or "hi_res" (OCR-based)
+            languages: List of language codes for OCR (e.g., ["eng", "deu"])
+            extract_image_block_types: Types of elements to extract from images
+
+        Returns:
+            Tuple of (parsed_text, metadata) where:
+            - parsed_text: The extracted text content
+            - metadata: Additional metadata about the parsing
+
+        Raises:
+            Exception: If the API request fails or the response cannot be
+                processed; the underlying error is chained as ``__cause__``.
+        """
+        if languages is None:
+            languages = ["eng"]  # Default to English
+
+        # Prepare the multipart form data (httpx accepts raw bytes as file content)
+        files = {
+            "files": (filename, content, content_type or "application/octet-stream")
+        }
+
+        # Prepare the request data
+        data = {
+            "strategy": strategy,
+            "languages": ",".join(languages),
+        }
+
+        if extract_image_block_types:
+            data["extract_image_block_types"] = ",".join(extract_image_block_types)
+
+        logger.debug(
+            f"Partitioning document '{filename}' with strategy '{strategy}', "
+            f"languages: {languages}"
+        )
+
+        try:
+            async with httpx.AsyncClient(timeout=self.timeout) as client:
+                response = await client.post(
+                    f"{self.api_url}/general/v0/general",
+                    files=files,
+                    data=data,
+                )
+                response.raise_for_status()
+
+                # Parse the response
+                elements = response.json()
+
+            # The loop below expects a JSON array of element objects; reject
+            # any other payload instead of iterating it by accident.
+            if not isinstance(elements, list):
+                raise ValueError(
+                    f"Unexpected response payload of type {type(elements).__name__}"
+                )
+
+            # Extract text from elements and track how often each type occurs
+            texts = []
+            element_types = Counter()
+            for element in elements:
+                text = element.get("text")
+                if text:
+                    texts.append(text)
+                element_types[element.get("type", "unknown")] += 1
+
+            parsed_text = "\n\n".join(texts)
+
+            # Collect metadata
+            metadata = {
+                "element_count": len(elements),
+                "text_length": len(parsed_text),
+                "element_types": dict(element_types),
+                "strategy": strategy,
+                "languages": languages,
+                "parsing_method": "unstructured_api",
+            }
+
+            logger.debug(
+                f"Successfully parsed document: {len(elements)} elements, "
+                f"{len(parsed_text)} characters"
+            )
+
+            return parsed_text, metadata
+
+        except httpx.HTTPError as e:
+            logger.error(f"HTTP error calling Unstructured API: {e}")
+            raise Exception(f"Failed to parse document via Unstructured API: {str(e)}") from e
+        except Exception as e:
+            logger.error(f"Unexpected error parsing document: {e}")
+            raise Exception(f"Failed to parse document: {str(e)}") from e
+
+    async def health_check(self) -> bool:
+        """Check if the Unstructured API is available.
+
+        Returns:
+            True if the API is healthy, False otherwise.
+        """
+        try:
+            async with httpx.AsyncClient(timeout=5) as client:
+                response = await client.get(f"{self.api_url}/healthcheck")
+                return response.status_code == 200
+        except Exception as e:
+            logger.warning(f"Unstructured API health check failed: {e}")
+            return False
diff --git a/nextcloud_mcp_server/config.py b/nextcloud_mcp_server/config.py
index c37ce94..6996259 100644
--- a/nextcloud_mcp_server/config.py
+++ b/nextcloud_mcp_server/config.py
@@ -1,4 +1,6 @@
 import logging.config
+import os
+from typing import Optional
 
 LOGGING_CONFIG = {
     "version": 1,
@@ -35,3 +37,28 @@ LOGGING_CONFIG = {
 
 def setup_logging():
     logging.config.dictConfig(LOGGING_CONFIG)
+
+
+# Document Parsing Configuration
+def get_unstructured_api_url() -> Optional[str]:
+    """Get the Unstructured API URL from environment variables.
+
+    Returns:
+        The Unstructured API URL if parsing is enabled, None otherwise.
+    """
+    # Single source of truth for the feature flag: reuse the helper below
+    # rather than re-reading ENABLE_UNSTRUCTURED_PARSING here.
+    if not is_unstructured_parsing_enabled():
+        return None
+
+    return os.getenv("UNSTRUCTURED_API_URL", "http://unstructured:8000")
+
+
+def is_unstructured_parsing_enabled() -> bool:
+    """Check if unstructured document parsing is enabled.
+
+    Returns:
+        True if ENABLE_UNSTRUCTURED_PARSING (default "true") equals "true"
+        case-insensitively, False otherwise.
+    """
+    return os.getenv("ENABLE_UNSTRUCTURED_PARSING", "true").lower() == "true"
diff --git a/nextcloud_mcp_server/server/webdav.py b/nextcloud_mcp_server/server/webdav.py
index 6fa6db6..66433e3 100644
--- a/nextcloud_mcp_server/server/webdav.py
+++ b/nextcloud_mcp_server/server/webdav.py
@@ -3,6 +3,8 @@ import logging
 from mcp.server.fastmcp import Context, FastMCP
 
 from nextcloud_mcp_server.client import NextcloudClient
+from nextcloud_mcp_server.utils.document_parser import is_parseable_document, parse_document
+from nextcloud_mcp_server.config import is_unstructured_parsing_enabled
 
 logger = logging.getLogger(__name__)
 
@@ -37,14 +39,21 @@ def configure_webdav_tools(mcp: FastMCP):
             path: Full path to the file to read
 
         Returns:
-            Dict with path, content, content_type, size, and encoding (if binary)
-            Text files are decoded to UTF-8, binary files are base64 encoded
+            Dict with path, content, content_type, size, and optional parsing metadata
+            - Text files are decoded to UTF-8
+            - Documents (PDF, DOCX, etc.) are parsed and text is extracted
+            - Other binary files are base64 encoded
 
         Examples:
             # Read a text file
             result = await nc_webdav_read_file("Documents/readme.txt")
             logger.info(result['content'])  # Decoded text content
 
+            # Read a PDF document (automatically parsed)
+            result = await nc_webdav_read_file("Documents/report.pdf")
+            logger.info(result['content'])  # Extracted text from PDF
+            logger.info(result['parsing_metadata'])  # Document parsing info
+
             # Read a binary file
             result = await nc_webdav_read_file("Images/photo.jpg")
             logger.info(result['encoding'])  # 'base64'
@@ -52,6 +61,27 @@ def configure_webdav_tools(mcp: FastMCP):
         client: NextcloudClient = ctx.request_context.lifespan_context.client
         content, content_type = await client.webdav.read_file(path)
 
+        # Check if this is a parseable document (PDF, DOCX, etc.)
+        if is_unstructured_parsing_enabled() and is_parseable_document(content_type):
+            try:
+                logger.info(f"Parsing document '{path}' of type '{content_type}'")
+                parsed_text, metadata = await parse_document(
+                    content, content_type, filename=path
+                )
+                return {
+                    "path": path,
+                    "content": parsed_text,
+                    "content_type": content_type,
+                    "size": len(content),
+                    "parsed": True,
+                    "parsing_metadata": metadata,
+                }
+            except Exception as e:
+                logger.warning(
+                    f"Failed to parse document '{path}', falling back to base64: {e}"
+                )
+                # Fall through to base64 encoding on parse failure
+
         # For text files, decode content for easier viewing
         if content_type and content_type.startswith("text/"):
             try:
diff --git a/nextcloud_mcp_server/utils/__init__.py b/nextcloud_mcp_server/utils/__init__.py
new file mode 100644
index 0000000..816a761
--- /dev/null
+++ b/nextcloud_mcp_server/utils/__init__.py
@@ -0,0 +1 @@
+"""Utility functions for the Nextcloud MCP server."""
\ No newline at end of file
diff --git a/nextcloud_mcp_server/utils/document_parser.py b/nextcloud_mcp_server/utils/document_parser.py
new file mode 100644
index 0000000..69674a5
--- /dev/null
+++ b/nextcloud_mcp_server/utils/document_parser.py
@@ -0,0 +1,130 @@
+"""Document parsing utilities based on the "unstructured" microservice"""
+
+import base64
+import logging
+from typing import Optional, Tuple
+
+from nextcloud_mcp_server.config import is_unstructured_parsing_enabled
+
+logger = logging.getLogger(__name__)
+
+# Mapping of MIME types to a short document-type label. The label is only used
+# for logging, metadata and as the extension of a default filename.
+PARSEABLE_MIME_TYPES = {
+    # PDF documents
+    "application/pdf": "pdf",
+    # Microsoft Word documents
+    "application/vnd.openxmlformats-officedocument.wordprocessingml.document": "docx",
+    "application/msword": "doc",
+    # Microsoft PowerPoint
+    "application/vnd.openxmlformats-officedocument.presentationml.presentation": "pptx",
+    "application/vnd.ms-powerpoint": "ppt",
+    # Microsoft Excel
+    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": "xlsx",
+    "application/vnd.ms-excel": "xls",
+    # Other document formats
+    "application/rtf": "rtf",
+    "text/rtf": "rtf",
+    "application/vnd.oasis.opendocument.text": "odt",
+    "application/epub+zip": "epub",
+    # Email formats
+    "message/rfc822": "eml",
+    "application/vnd.ms-outlook": "msg",
+    # Image formats (for OCR)
+    "image/jpeg": "image",
+    "image/png": "image",
+    "image/tiff": "image",
+    "image/bmp": "image",
+}
+
+# Number of raw bytes shown in the placeholder text; 150 bytes encode to
+# exactly 200 base64 characters.
+_FALLBACK_PREVIEW_BYTES = 150
+
+
+def _base_content_type(content_type: Optional[str]) -> str:
+    """Return the lower-cased MIME type without parameters ("" if missing)."""
+    if not content_type:
+        return ""
+    # Handle content types with additional parameters (e.g., "application/pdf; charset=utf-8")
+    return content_type.split(";")[0].strip().lower()
+
+
+def is_parseable_document(content_type: Optional[str]) -> bool:
+    """Check if a document type can be parsed.
+
+    Args:
+        content_type: The MIME type of the document
+
+    Returns:
+        True if the document can be parsed, False otherwise
+    """
+    return _base_content_type(content_type) in PARSEABLE_MIME_TYPES
+
+
+async def parse_document(
+    content: bytes,
+    content_type: Optional[str],
+    filename: Optional[str] = None
+) -> Tuple[str, dict]:
+    """Parse a document using the Unstructured API.
+
+    Args:
+        content: The document content as bytes
+        content_type: The MIME type of the document
+        filename: Optional filename to help with format detection
+
+    Returns:
+        Tuple of (parsed_text, metadata) where:
+        - parsed_text: The extracted text content
+        - metadata: Additional metadata about the parsing
+
+        If unstructured parsing is disabled, no request is made; a placeholder
+        text and metadata with parsing_method "fallback_base64" are returned.
+
+    Raises:
+        ValueError: If the document type is not supported
+        Exception: If parsing fails
+    """
+    base_content_type = _base_content_type(content_type)
+    if base_content_type not in PARSEABLE_MIME_TYPES:
+        raise ValueError(f"Document type '{content_type}' is not supported for parsing")
+
+    doc_type = PARSEABLE_MIME_TYPES[base_content_type]
+
+    logger.debug(f"Parsing document of type '{doc_type}' (MIME: {content_type})")
+
+    # Check if unstructured parsing is enabled via environment
+    if not is_unstructured_parsing_enabled():
+        logger.debug("Unstructured parsing is disabled, returning base64 encoded content as fallback")
+        # Encode only the previewed prefix instead of the whole document.
+        preview = base64.b64encode(content[:_FALLBACK_PREVIEW_BYTES]).decode("ascii")
+        parsed_text = f"Document could not be parsed. Base64 content: {preview}..."
+        metadata = {
+            "document_type": doc_type,
+            "mime_type": content_type,
+            "element_count": 0,
+            "text_length": len(parsed_text),
+            "parsing_method": "fallback_base64",
+        }
+        return parsed_text, metadata
+
+    logger.debug("Using Unstructured API for parsing")
+    # Imported lazily, as before, so importing this module does not pull in
+    # the HTTP client.
+    from nextcloud_mcp_server.client.unstructured_client import UnstructuredClient
+
+    try:
+        client = UnstructuredClient()
+        return await client.partition_document(
+            content=content,
+            filename=filename or f"document.{doc_type}",
+            content_type=base_content_type,
+            strategy="auto",
+        )
+    except Exception as e:
+        # Re-raise instead of returning placeholder text: callers such as the
+        # WebDAV read tool catch the failure and fall back to base64 themselves;
+        # returning a truncated preview here would be reported as parsed content.
+        logger.error(f"Unstructured API parsing failed: {e}")
+        raise