ran ruff format via uv

This commit is contained in:
yuisheaven
2025-10-05 02:16:42 +02:00
parent c9a687171a
commit 3ff6346c03
5 changed files with 87 additions and 67 deletions
@@ -17,31 +17,31 @@ logger = logging.getLogger(__name__)
class UnstructuredClient:
"""Client for interacting with the Unstructured API.
The Unstructured API provides document parsing capabilities for various formats
including PDF, DOCX, images with OCR, and more.
API Documentation: https://docs.unstructured.io/api-reference/api-services/api-parameters
"""
def __init__(self, api_url: Optional[str] = None, timeout: int = 120):
    """Create a client bound to a single Unstructured API endpoint.

    Args:
        api_url: Base URL of the Unstructured API. When falsy (None or an
            empty string) the URL is taken from get_unstructured_api_url().
        timeout: Request timeout in seconds (default: 120 for large documents).

    Raises:
        ValueError: If neither the argument nor the configuration lookup
            yields a URL.
    """
    resolved_url = api_url if api_url else get_unstructured_api_url()

    # Attributes are assigned before validation, so they are set even when
    # the constructor goes on to raise.
    self.api_url = resolved_url
    self.timeout = timeout

    if not resolved_url:
        raise ValueError(
            "Unstructured API URL not configured. "
            "Set ENABLE_UNSTRUCTURED_PARSING=true and UNSTRUCTURED_API_URL in environment."
        )

    logger.info(f"Initialized UnstructuredClient with API URL: {self.api_url}")
async def partition_document(
self,
content: bytes,
@@ -52,7 +52,7 @@ class UnstructuredClient:
extract_image_block_types: Optional[list[str]] = None,
) -> Tuple[str, dict]:
"""Parse a document using the Unstructured API.
Args:
content: The document content as bytes
filename: The filename (used for format detection)
@@ -62,12 +62,12 @@ class UnstructuredClient:
languages: List of language codes for OCR (e.g., ["eng", "deu"]).
If None, uses the value from UNSTRUCTURED_LANGUAGES env var.
extract_image_block_types: Types of elements to extract from images
Returns:
Tuple of (parsed_text, metadata) where:
- parsed_text: The extracted text content
- metadata: Additional metadata about the parsing
Raises:
httpx.HTTPError: If the API request fails
Exception: If parsing fails
@@ -75,29 +75,33 @@ class UnstructuredClient:
# Use environment configuration as defaults
if strategy is None:
strategy = get_unstructured_strategy()
if languages is None:
languages = get_unstructured_languages()
# Prepare the multipart form data
files = {
"files": (filename, io.BytesIO(content), content_type or "application/octet-stream")
"files": (
filename,
io.BytesIO(content),
content_type or "application/octet-stream",
)
}
# Prepare the request data
data = {
"strategy": strategy,
"languages": ",".join(languages),
}
if extract_image_block_types:
data["extract_image_block_types"] = ",".join(extract_image_block_types)
logger.debug(
f"Partitioning document '{filename}' with strategy '{strategy}', "
f"languages: {languages}"
)
try:
async with httpx.AsyncClient(timeout=self.timeout) as client:
response = await client.post(
@@ -106,25 +110,25 @@ class UnstructuredClient:
data=data,
)
response.raise_for_status()
# Parse the response
elements = response.json()
# Extract text from elements
# Each element has a "text" field
texts = []
element_types = {}
for element in elements:
if "text" in element and element["text"]:
texts.append(element["text"])
# Track element types
el_type = element.get("type", "unknown")
element_types[el_type] = element_types.get(el_type, 0) + 1
parsed_text = "\n\n".join(texts)
# Collect metadata
metadata = {
"element_count": len(elements),
@@ -132,26 +136,28 @@ class UnstructuredClient:
"element_types": element_types,
"strategy": strategy,
"languages": languages,
"parsing_method": "unstructured_api"
"parsing_method": "unstructured_api",
}
logger.debug(
f"Successfully parsed document: {len(elements)} elements, "
f"{len(parsed_text)} characters"
)
return parsed_text, metadata
except httpx.HTTPError as e:
logger.error(f"HTTP error calling Unstructured API: {e}")
raise Exception(f"Failed to parse document via Unstructured API: {str(e)}") from e
raise Exception(
f"Failed to parse document via Unstructured API: {str(e)}"
) from e
except Exception as e:
logger.error(f"Unexpected error parsing document: {e}")
raise Exception(f"Failed to parse document: {str(e)}") from e
async def health_check(self) -> bool:
"""Check if the Unstructured API is available.
Returns:
True if the API is healthy, False otherwise.
"""
@@ -161,4 +167,4 @@ class UnstructuredClient:
return response.status_code == 200
except Exception as e:
logger.warning(f"Unstructured API health check failed: {e}")
return False
return False
+17 -15
View File
@@ -42,20 +42,20 @@ def setup_logging():
# Document Parsing Configuration
def get_unstructured_api_url() -> Optional[str]:
    """Resolve the Unstructured API endpoint from environment variables.

    Parsing counts as enabled when ENABLE_UNSTRUCTURED_PARSING is unset or
    equals "true" (compared case-insensitively); any other value disables it.

    Returns:
        The value of UNSTRUCTURED_API_URL (or "http://unstructured:8000" when
        that variable is unset) if parsing is enabled, None otherwise.
    """
    flag = os.getenv("ENABLE_UNSTRUCTURED_PARSING", "true")
    if flag.lower() != "true":
        return None
    return os.getenv("UNSTRUCTURED_API_URL", "http://unstructured:8000")
def is_unstructured_parsing_enabled() -> bool:
"""Check if unstructured document parsing is enabled.
Returns:
True if enabled, False otherwise.
"""
@@ -64,36 +64,36 @@ def is_unstructured_parsing_enabled() -> bool:
def get_unstructured_strategy() -> str:
    """Get the parsing strategy for the Unstructured API.

    The value is read from the UNSTRUCTURED_STRATEGY environment variable and
    normalized by trimming surrounding whitespace and lower-casing it.

    Valid values are:
    - 'auto': Automatically choose the best strategy (used when the variable is unset)
    - 'fast': Fast parsing without OCR
    - 'hi_res': High-resolution parsing with OCR for better accuracy

    Returns:
        The normalized strategy when it is one of the valid values. Any other
        value (including an empty string) logs a warning and yields 'hi_res'.
    """
    # A tuple keeps the order used in the warning message stable.
    valid_strategies = ("auto", "fast", "hi_res")

    # Strip before validating: a stray space or newline around an otherwise
    # valid value (e.g. "fast ") must not silently switch to the hi_res fallback.
    strategy = os.getenv("UNSTRUCTURED_STRATEGY", "auto").strip().lower()
    if strategy in valid_strategies:
        return strategy

    logging.warning(
        f"Invalid UNSTRUCTURED_STRATEGY '{strategy}'. Using 'hi_res'. "
        f"Valid options: {', '.join(valid_strategies)}"
    )
    return "hi_res"
def get_unstructured_languages() -> list[str]:
"""Get the OCR languages for the Unstructured API.
Languages should be specified as ISO 639-3 codes (e.g., 'eng', 'deu', 'fra').
Multiple languages can be specified separated by commas.
Default languages: English (eng) and German (deu)
Common language codes:
- eng: English
- deu: German
@@ -106,17 +106,19 @@ def get_unstructured_languages() -> list[str]:
- zho: Chinese
- jpn: Japanese
- kor: Korean
Returns:
List of language codes for OCR processing.
"""
languages_str = os.getenv("UNSTRUCTURED_LANGUAGES", "eng,deu")
# Split by comma and clean up whitespace
languages = [lang.strip() for lang in languages_str.split(",") if lang.strip()]
if not languages:
logging.warning("No languages specified in UNSTRUCTURED_LANGUAGES. Using default: eng,deu")
logging.warning(
"No languages specified in UNSTRUCTURED_LANGUAGES. Using default: eng,deu"
)
return ["eng", "deu"]
return languages
+5 -2
View File
@@ -3,7 +3,10 @@ import logging
from mcp.server.fastmcp import Context, FastMCP
from nextcloud_mcp_server.client import NextcloudClient
from nextcloud_mcp_server.utils.document_parser import is_parseable_document, parse_document
from nextcloud_mcp_server.utils.document_parser import (
is_parseable_document,
parse_document,
)
from nextcloud_mcp_server.config import is_unstructured_parsing_enabled
logger = logging.getLogger(__name__)
@@ -62,7 +65,7 @@ def configure_webdav_tools(mcp: FastMCP):
content, content_type = await client.webdav.read_file(path)
# Check if this is a parseable document (PDF, DOCX, etc.)
if (is_unstructured_parsing_enabled() and is_parseable_document(content_type)):
if is_unstructured_parsing_enabled() and is_parseable_document(content_type):
try:
logger.info(f"Parsing document '{path}' of type '{content_type}'")
parsed_text, metadata = await parse_document(
+1 -1
View File
@@ -1 +1 @@
"""Utility functions for the Nextcloud MCP server."""
"""Utility functions for the Nextcloud MCP server."""
+27 -18
View File
@@ -35,56 +35,61 @@ PARSEABLE_MIME_TYPES = {
"image/bmp": "image",
}
def is_parseable_document(content_type: Optional[str]) -> bool:
    """Report whether a MIME type is one the document parser handles.

    Args:
        content_type: The MIME type of the document; may carry parameters
            after a semicolon, or be None/empty.

    Returns:
        True if the base MIME type is a key of PARSEABLE_MIME_TYPES,
        False otherwise (including when content_type is None or empty).
    """
    if content_type:
        # Ignore any parameters after the first ';'
        # (e.g., "application/pdf; charset=utf-8").
        mime, _, _ = content_type.partition(";")
        return mime.strip().lower() in PARSEABLE_MIME_TYPES
    return False
async def parse_document(
content: bytes,
content_type: Optional[str],
filename: Optional[str] = None
content: bytes, content_type: Optional[str], filename: Optional[str] = None
) -> Tuple[str, dict]:
"""Parse a document using the Unstructured API.
Args:
content: The document content as bytes
content_type: The MIME type of the document
filename: Optional filename to help with format detection
Returns:
Tuple of (parsed_text, metadata) where:
- parsed_text: The extracted text content
- metadata: Additional metadata about the parsing
Raises:
ValueError: If the document type is not supported
Exception: If parsing fails
"""
if not is_parseable_document(content_type):
raise ValueError(f"Document type '{content_type}' is not supported for parsing")
base_content_type = content_type.split(";")[0].strip().lower() if content_type else ""
base_content_type = (
content_type.split(";")[0].strip().lower() if content_type else ""
)
doc_type = PARSEABLE_MIME_TYPES.get(base_content_type, "unknown")
logger.debug(f"Parsing document of type '{doc_type}' (MIME: {content_type})")
# Check if unstructured parsing is enabled via environment
if is_unstructured_parsing_enabled():
logger.debug("Using Unstructured API for parsing")
try:
from nextcloud_mcp_server.client.unstructured_client import UnstructuredClient
from nextcloud_mcp_server.client.unstructured_client import (
UnstructuredClient,
)
client = UnstructuredClient()
# The client will automatically use environment configuration
# (UNSTRUCTURED_STRATEGY and UNSTRUCTURED_LANGUAGES)
@@ -97,6 +102,7 @@ async def parse_document(
logger.error(f"Unstructured API parsing failed: {e}")
# If unstructured parsing fails, return base64 as fallback
import base64
parsed_text = f"Document could not be parsed. Base64 content: {base64.b64encode(content).decode('ascii')[:200]}..."
metadata = {
"document_type": doc_type,
@@ -104,18 +110,21 @@ async def parse_document(
"element_count": 0,
"text_length": len(parsed_text),
"parsing_method": "fallback_base64",
"error": str(e)
"error": str(e),
}
return parsed_text, metadata
else:
logger.debug("Unstructured parsing is disabled, returning base64 encoded content as fallback")
logger.debug(
"Unstructured parsing is disabled, returning base64 encoded content as fallback"
)
import base64
parsed_text = f"Document could not be parsed. Base64 content: {base64.b64encode(content).decode('ascii')[:200]}..."
metadata = {
"document_type": doc_type,
"mime_type": content_type,
"element_count": 0,
"text_length": len(parsed_text),
"parsing_method": "fallback_base64"
"parsing_method": "fallback_base64",
}
return parsed_text, metadata
return parsed_text, metadata