Merge pull request #190 from yuisheaven/feature/introduce_files_parsing_with_unstructured_service_for_webdav_files_retrieval

Introduce files parsing with "unstructured" service for webdav files retrieval
This commit is contained in:
Chris Coutinho
2025-10-25 19:11:27 +02:00
committed by GitHub
12 changed files with 1767 additions and 949 deletions
+2
View File
@@ -38,6 +38,8 @@ uv run pytest -m integration -v
uv run pytest -m "not integration" -v
```
! Hint: If the tests fail because of missing environment variables, the `.env` file has usually not been created yet or is not configured correctly.
### Load Testing
```bash
# Run benchmark with default settings (10 workers, 30 seconds)
+15
View File
@@ -51,18 +51,33 @@ services:
- ./tests/fixtures/test_recipe.html:/usr/share/nginx/html/test_recipe.html:ro
- ./tests/fixtures/nginx.conf:/etc/nginx/nginx.conf:ro
unstructured:
image: downloads.unstructured.io/unstructured-io/unstructured-api:latest
restart: always
ports:
- 127.0.0.1:8002:8000
# The Unstructured API listens on port 8000 inside its container.
# It is published on host port 8002 to avoid a conflict with the mcp service, which is published on host port 8000.
mcp:
build: .
command: ["--transport", "streamable-http"]
restart: always
depends_on:
- app
- unstructured
ports:
- 127.0.0.1:8000:8000
environment:
- NEXTCLOUD_HOST=http://app:80
- NEXTCLOUD_USERNAME=admin
- NEXTCLOUD_PASSWORD=admin
- ENABLE_UNSTRUCTURED_PARSING=true
- UNSTRUCTURED_API_URL=http://unstructured:8000
- UNSTRUCTURED_STRATEGY=hi_res
- UNSTRUCTURED_LANGUAGES=deu,eng
#volumes:
#- ./nextcloud_mcp_server:/app/nextcloud_mcp_server:ro
mcp-oauth:
build: .
+24
View File
@@ -21,3 +21,27 @@ NEXTCLOUD_MCP_SERVER_URL=http://localhost:8000
# - If these are set, OAuth mode is disabled
NEXTCLOUD_USERNAME=
NEXTCLOUD_PASSWORD=
# Document Parsing Configuration
# Enable/disable unstructured parsing for documents (PDF, DOCX, etc.)
ENABLE_UNSTRUCTURED_PARSING=true
# Unstructured API endpoint (default for docker-compose setup)
UNSTRUCTURED_API_URL=http://unstructured:8000
# Parsing strategy for the Unstructured service
# Valid values: auto, fast, hi_res
# - auto: Automatically choose the best strategy based on document type (used when this variable is unset)
# - fast: Fast parsing without OCR - best for simple text documents
# - hi_res: High-resolution parsing with OCR - best for scanned documents, images, and complex layouts (also used as the fallback when an invalid value is given)
UNSTRUCTURED_STRATEGY=hi_res
# Languages for OCR and document parsing (comma-separated ISO 639-3 language codes)
# Default: eng,deu (English and German)
# Common language codes:
# eng = English deu = German fra = French
# spa = Spanish ita = Italian por = Portuguese
# rus = Russian ara = Arabic zho = Chinese
# jpn = Japanese kor = Korean
# Example for English, German, and French: UNSTRUCTURED_LANGUAGES=eng,deu,fra
UNSTRUCTURED_LANGUAGES=eng,deu
@@ -0,0 +1,170 @@
"""HTTP client for Unstructured API."""
import io
import logging
from typing import Optional, Tuple
import httpx
from nextcloud_mcp_server.config import (
get_unstructured_api_url,
get_unstructured_languages,
get_unstructured_strategy,
)
logger = logging.getLogger(__name__)
class UnstructuredClient:
    """Client for interacting with the Unstructured API.

    The Unstructured API provides document parsing capabilities for various formats
    including PDF, DOCX, images with OCR, and more.

    API Documentation: https://docs.unstructured.io/api-reference/api-services/api-parameters
    """

    def __init__(self, api_url: Optional[str] = None, timeout: int = 120):
        """Initialize the Unstructured API client.

        Args:
            api_url: Base URL of the Unstructured API. If None (or empty), the
                value returned by get_unstructured_api_url() is used.
            timeout: Request timeout in seconds (default: 120 for large documents)

        Raises:
            ValueError: If no API URL was passed and none is configured.
        """
        resolved_url = api_url or get_unstructured_api_url()
        # Drop trailing slashes so that the endpoint paths appended below never
        # produce a double slash (e.g. "http://host:8000//general/v0/general").
        self.api_url = resolved_url.rstrip("/") if resolved_url else resolved_url
        self.timeout = timeout

        if not self.api_url:
            raise ValueError(
                "Unstructured API URL not configured. "
                "Set ENABLE_UNSTRUCTURED_PARSING=true and UNSTRUCTURED_API_URL in environment."
            )

        logger.info(f"Initialized UnstructuredClient with API URL: {self.api_url}")

    async def partition_document(
        self,
        content: bytes,
        filename: str,
        content_type: Optional[str] = None,
        strategy: Optional[str] = None,
        languages: Optional[list[str]] = None,
        extract_image_block_types: Optional[list[str]] = None,
    ) -> Tuple[str, dict]:
        """Parse a document using the Unstructured API.

        Args:
            content: The document content as bytes
            filename: The filename (used for format detection)
            content_type: Optional MIME type; "application/octet-stream" is sent
                when it is missing.
            strategy: Parsing strategy - "auto", "fast", or "hi_res" (OCR-based).
                If None, uses the value from UNSTRUCTURED_STRATEGY env var.
            languages: List of language codes for OCR (e.g., ["eng", "deu"]).
                If None, uses the value from UNSTRUCTURED_LANGUAGES env var.
            extract_image_block_types: Types of elements to extract from images

        Returns:
            Tuple of (parsed_text, metadata) where:
            - parsed_text: The non-empty "text" fields of all returned elements,
              joined with blank lines
            - metadata: Element counts, text length, strategy, languages and
              the parsing method ("unstructured_api")

        Raises:
            Exception: If the HTTP request fails, the API answers with an error
                status, or the response cannot be processed. The original
                error is attached as ``__cause__``.
        """
        # Use environment configuration as defaults
        if strategy is None:
            strategy = get_unstructured_strategy()
        if languages is None:
            languages = get_unstructured_languages()

        # Prepare the multipart form data
        files = {
            "files": (
                filename,
                io.BytesIO(content),
                content_type or "application/octet-stream",
            )
        }

        # Prepare the request data; list values are sent comma-separated
        data = {
            "strategy": strategy,
            "languages": ",".join(languages),
        }
        if extract_image_block_types:
            data["extract_image_block_types"] = ",".join(extract_image_block_types)

        logger.debug(
            f"Partitioning document '{filename}' with strategy '{strategy}', "
            f"languages: {languages}"
        )

        try:
            async with httpx.AsyncClient(timeout=self.timeout) as client:
                response = await client.post(
                    f"{self.api_url}/general/v0/general",
                    files=files,
                    data=data,
                )
                response.raise_for_status()

                # Parse the response
                elements = response.json()
                if not isinstance(elements, list):
                    # Fail with a clear message instead of iterating over the
                    # keys of an unexpected JSON object.
                    raise ValueError(
                        "Unexpected response from Unstructured API: expected a "
                        f"list of elements, got {type(elements).__name__}"
                    )

                # Extract text from elements
                # Each element has a "text" field
                texts = []
                element_types = {}
                for element in elements:
                    if "text" in element and element["text"]:
                        texts.append(element["text"])

                    # Track element types (counted for every element, with or
                    # without text, so the counts add up to element_count)
                    el_type = element.get("type", "unknown")
                    element_types[el_type] = element_types.get(el_type, 0) + 1

                parsed_text = "\n\n".join(texts)

                # Collect metadata
                metadata = {
                    "element_count": len(elements),
                    "text_length": len(parsed_text),
                    "element_types": element_types,
                    "strategy": strategy,
                    "languages": languages,
                    "parsing_method": "unstructured_api",
                }

                logger.debug(
                    f"Successfully parsed document: {len(elements)} elements, "
                    f"{len(parsed_text)} characters"
                )

                return parsed_text, metadata

        except httpx.HTTPError as e:
            logger.error(f"HTTP error calling Unstructured API: {e}")
            raise Exception(
                f"Failed to parse document via Unstructured API: {str(e)}"
            ) from e
        except Exception as e:
            logger.error(f"Unexpected error parsing document: {e}")
            raise Exception(f"Failed to parse document: {str(e)}") from e

    async def health_check(self) -> bool:
        """Check if the Unstructured API is available.

        Sends a GET request to ``<api_url>/healthcheck`` with a 5 second timeout.

        Returns:
            True if the endpoint answers with HTTP 200, False on any other
            status code or on any error (which is logged as a warning).
        """
        try:
            async with httpx.AsyncClient(timeout=5) as client:
                response = await client.get(f"{self.api_url}/healthcheck")
                return response.status_code == 200
        except Exception as e:
            logger.warning(f"Unstructured API health check failed: {e}")
            return False
+87
View File
@@ -1,4 +1,6 @@
import logging.config
import os
from typing import Optional
LOGGING_CONFIG = {
"version": 1,
@@ -51,3 +53,88 @@ LOGGING_CONFIG = {
def setup_logging():
    """Configure the logging system from the module-level LOGGING_CONFIG dict."""
    logging.config.dictConfig(LOGGING_CONFIG)
# Document Parsing Configuration
def get_unstructured_api_url() -> Optional[str]:
    """Resolve the base URL of the Unstructured API from the environment.

    Parsing counts as enabled when ENABLE_UNSTRUCTURED_PARSING is unset or
    equals "true" (case-insensitive).

    Returns:
        The value of UNSTRUCTURED_API_URL (falling back to
        "http://unstructured:8000") when parsing is enabled, None otherwise.
    """
    flag = os.getenv("ENABLE_UNSTRUCTURED_PARSING", "true")
    if flag.lower() != "true":
        return None
    return os.getenv("UNSTRUCTURED_API_URL", "http://unstructured:8000")
def is_unstructured_parsing_enabled() -> bool:
    """Tell whether document parsing via the Unstructured API is switched on.

    Returns:
        True when ENABLE_UNSTRUCTURED_PARSING is unset or equals "true"
        (case-insensitive), False for any other value.
    """
    flag = os.getenv("ENABLE_UNSTRUCTURED_PARSING", "true")
    return flag.lower() == "true"
def get_unstructured_strategy() -> str:
    """Get the parsing strategy for the Unstructured API.

    The value is read from UNSTRUCTURED_STRATEGY; surrounding whitespace is
    ignored and the comparison is case-insensitive.

    Valid values are:
    - 'auto': Automatically choose the best strategy (used when the variable is unset)
    - 'fast': Fast parsing without OCR
    - 'hi_res': High-resolution parsing with OCR for better accuracy

    Returns:
        The parsing strategy to use. An invalid value is reported with a
        warning and replaced by 'hi_res' (not by the unset default 'auto').
    """
    # Strip before validating so that values such as "fast " (easy to produce
    # in .env or compose files) are not rejected as invalid.
    strategy = os.getenv("UNSTRUCTURED_STRATEGY", "auto").strip().lower()
    valid_strategies = ["auto", "fast", "hi_res"]

    if strategy not in valid_strategies:
        logging.warning(
            f"Invalid UNSTRUCTURED_STRATEGY '{strategy}'. Using 'hi_res'. "
            f"Valid options: {', '.join(valid_strategies)}"
        )
        return "hi_res"

    return strategy
def get_unstructured_languages() -> list[str]:
    """Read the OCR language list for the Unstructured API from the environment.

    UNSTRUCTURED_LANGUAGES holds comma-separated ISO 639-3 codes, for example
    "eng,deu,fra". When the variable is unset, "eng,deu" (English and German)
    is used.

    Frequently used codes: eng (English), deu (German), fra (French),
    spa (Spanish), ita (Italian), por (Portuguese), rus (Russian),
    ara (Arabic), zho (Chinese), jpn (Japanese), kor (Korean).

    Returns:
        The language codes with surrounding whitespace removed and empty
        entries dropped. If nothing remains, a warning is logged and
        ["eng", "deu"] is returned.
    """
    raw_value = os.getenv("UNSTRUCTURED_LANGUAGES", "eng,deu")

    codes = []
    for piece in raw_value.split(","):
        code = piece.strip()
        if code:
            codes.append(code)

    if codes:
        return codes

    logging.warning(
        "No languages specified in UNSTRUCTURED_LANGUAGES. Using default: eng,deu"
    )
    return ["eng", "deu"]
+45 -2
View File
@@ -2,6 +2,12 @@ import logging
from mcp.server.fastmcp import Context, FastMCP
from nextcloud_mcp_server.client import NextcloudClient
from nextcloud_mcp_server.utils.document_parser import (
is_parseable_document,
parse_document,
)
from nextcloud_mcp_server.config import is_unstructured_parsing_enabled
from nextcloud_mcp_server.auth import require_scopes
from nextcloud_mcp_server.context import get_client
from nextcloud_mcp_server.models import DirectoryListing, FileInfo, SearchFilesResponse
@@ -53,12 +59,49 @@ def configure_webdav_tools(mcp: FastMCP):
path: Full path to the file to read
Returns:
Dict with path, content, content_type, size, and encoding (if binary)
Text files are decoded to UTF-8, binary files are base64 encoded
Dict with path, content, content_type, size, and optional parsing metadata
- Text files are decoded to UTF-8
- Documents (PDF, DOCX, etc.) are parsed and text is extracted
- Other binary files are base64 encoded
Examples:
# Read a text file
result = await nc_webdav_read_file("Documents/readme.txt")
logger.info(result['content']) # Decoded text content
# Read a PDF document (automatically parsed)
result = await nc_webdav_read_file("Documents/report.pdf")
logger.info(result['content']) # Extracted text from PDF
logger.info(result['parsing_metadata']) # Document parsing info
# Read a binary file
result = await nc_webdav_read_file("Images/photo.jpg")
logger.info(result['encoding']) # 'base64'
"""
client = get_client(ctx)
content, content_type = await client.webdav.read_file(path)
# Check if this is a parseable document (PDF, DOCX, etc.)
if is_unstructured_parsing_enabled() and is_parseable_document(content_type):
try:
logger.info(f"Parsing document '{path}' of type '{content_type}'")
parsed_text, metadata = await parse_document(
content, content_type, filename=path
)
return {
"path": path,
"content": parsed_text,
"content_type": content_type,
"size": len(content),
"parsed": True,
"parsing_metadata": metadata,
}
except Exception as e:
logger.warning(
f"Failed to parse document '{path}', falling back to base64: {e}"
)
# Fall through to base64 encoding on parse failure
# For text files, decode content for easier viewing
if content_type and content_type.startswith("text/"):
try:
+1
View File
@@ -0,0 +1 @@
"""Utility functions for the Nextcloud MCP server."""
@@ -0,0 +1,130 @@
"""Document parsing utilities based on the "unstructured" microservice"""
import logging
from typing import Optional, Tuple
from nextcloud_mcp_server.config import is_unstructured_parsing_enabled
logger = logging.getLogger(__name__)
# Mapping of supported MIME types to a short document-type label.
# Membership in this mapping decides whether a document is considered parseable;
# the label is used for log messages, fallback metadata, and as the extension of
# the placeholder filename ("document.<label>") when no filename is given.
PARSEABLE_MIME_TYPES = {
    # PDF documents
    "application/pdf": "pdf",
    # Microsoft Word documents
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document": "docx",
    "application/msword": "doc",
    # Microsoft PowerPoint
    "application/vnd.openxmlformats-officedocument.presentationml.presentation": "pptx",
    "application/vnd.ms-powerpoint": "ppt",
    # Microsoft Excel
    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": "xlsx",
    "application/vnd.ms-excel": "xls",
    # Other document formats
    "application/rtf": "rtf",
    "text/rtf": "rtf",
    "application/vnd.oasis.opendocument.text": "odt",
    "application/epub+zip": "epub",
    # Email formats
    "message/rfc822": "eml",
    "application/vnd.ms-outlook": "msg",
    # Image formats (for OCR)
    "image/jpeg": "image",
    "image/png": "image",
    "image/tiff": "image",
    "image/bmp": "image",
}
def is_parseable_document(content_type: Optional[str]) -> bool:
    """Report whether a MIME type is listed in PARSEABLE_MIME_TYPES.

    Args:
        content_type: The MIME type of the document, optionally followed by
            parameters (e.g., "application/pdf; charset=utf-8")

    Returns:
        True if the MIME type (without parameters, case-insensitive) is
        supported, False otherwise or when no content type is given
    """
    if not content_type:
        return False

    # Only the part before the first ";" identifies the media type.
    mime_type, _, _ = content_type.partition(";")
    return mime_type.strip().lower() in PARSEABLE_MIME_TYPES
def _fallback_result(
    content: bytes,
    content_type: Optional[str],
    doc_type: str,
    error: Optional[Exception] = None,
) -> Tuple[str, dict]:
    """Build the placeholder (text, metadata) pair used when a document is not parsed."""
    import base64

    # Only the first 200 base64 characters are kept. 150 input bytes encode to
    # exactly 200 characters, so encoding the prefix yields the same preview
    # without base64-encoding the whole (possibly large) document.
    preview = base64.b64encode(content[:150]).decode("ascii")[:200]
    parsed_text = f"Document could not be parsed. Base64 content: {preview}..."
    metadata = {
        "document_type": doc_type,
        "mime_type": content_type,
        "element_count": 0,
        "text_length": len(parsed_text),
        "parsing_method": "fallback_base64",
    }
    if error is not None:
        metadata["error"] = str(error)
    return parsed_text, metadata


async def parse_document(
    content: bytes, content_type: Optional[str], filename: Optional[str] = None
) -> Tuple[str, dict]:
    """Parse a document using the Unstructured API.

    Args:
        content: The document content as bytes
        content_type: The MIME type of the document
        filename: Optional filename to help with format detection; when
            omitted, "document.<type label>" is sent instead

    Returns:
        Tuple of (parsed_text, metadata) where:
        - parsed_text: The extracted text content
        - metadata: Additional metadata about the parsing

        When parsing is disabled or the Unstructured API call fails, no
        exception is raised. Instead a placeholder text containing a truncated
        base64 preview is returned and metadata["parsing_method"] is
        "fallback_base64" (plus metadata["error"] on failure).

    Raises:
        ValueError: If the document type is not supported
    """
    # NOTE(review): because failures are returned rather than raised, callers
    # must inspect metadata["parsing_method"] to tell real text from the
    # placeholder.
    if not is_parseable_document(content_type):
        raise ValueError(f"Document type '{content_type}' is not supported for parsing")

    base_content_type = (
        content_type.split(";")[0].strip().lower() if content_type else ""
    )
    doc_type = PARSEABLE_MIME_TYPES.get(base_content_type, "unknown")

    logger.debug(f"Parsing document of type '{doc_type}' (MIME: {content_type})")

    # Check if unstructured parsing is enabled via environment
    if not is_unstructured_parsing_enabled():
        logger.debug(
            "Unstructured parsing is disabled, returning base64 encoded content as fallback"
        )
        return _fallback_result(content, content_type, doc_type)

    logger.debug("Using Unstructured API for parsing")
    try:
        # Imported inside the try block so that import or construction errors
        # are handled by the same fallback as request failures.
        from nextcloud_mcp_server.client.unstructured_client import (
            UnstructuredClient,
        )

        client = UnstructuredClient()
        # The client will automatically use environment configuration
        # (UNSTRUCTURED_STRATEGY and UNSTRUCTURED_LANGUAGES)
        return await client.partition_document(
            content=content,
            filename=filename or f"document.{doc_type}",
            content_type=content_type,
        )
    except Exception as e:
        logger.error(f"Unstructured API parsing failed: {e}")
        # If unstructured parsing fails, return base64 as fallback
        return _fallback_result(content, content_type, doc_type, error=e)
+1
View File
@@ -91,6 +91,7 @@ dev = [
"pytest-playwright-asyncio>=0.7.1",
"pytest-timeout>=2.3.1",
"ruff>=0.11.13",
"reportlab>=4.0.0",
]
[project.scripts]
+148
View File
@@ -0,0 +1,148 @@
"""Integration tests for Unstructured API functionality."""
import json
import logging
import uuid
from io import BytesIO
import pytest
from mcp.client.session import ClientSession
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas
from nextcloud_mcp_server.client import NextcloudClient
logger = logging.getLogger(__name__)
@pytest.fixture
async def test_base_path(nc_client: NextcloudClient):
    """Create a uniquely named WebDAV test directory and remove it afterwards.

    Yields:
        The path of the created directory.
    """
    test_dir = f"mcp_test_unstructured_{uuid.uuid4().hex[:8]}"
    await nc_client.webdav.create_directory(test_dir)
    yield test_dir
    try:
        await nc_client.webdav.delete_resource(test_dir)
    except Exception as e:
        # Cleanup is best effort and must not fail the test, but leave a trace
        # so that leftover directories on the server can be explained.
        logger.warning(f"Failed to clean up test directory '{test_dir}': {e}")
def create_test_pdf(text: str) -> bytes:
    """Render *text* onto a single letter-sized PDF page and return the PDF bytes."""
    pdf_stream = BytesIO()
    page = canvas.Canvas(pdf_stream, pagesize=letter)
    page.drawString(100, 750, text)
    page.save()
    # getvalue() returns the whole buffer regardless of the stream position,
    # so no rewind is needed before reading it.
    return pdf_stream.getvalue()
@pytest.mark.integration
async def test_unstructured_api_enabled_parsing(
    nc_client: NextcloudClient, test_base_path: str, nc_mcp_client: ClientSession
):
    """Upload a generated PDF and check that the MCP read tool returns parsed text."""
    pdf_path = f"{test_base_path}/test_unstructured_pdf.pdf"
    expected_text = "This is a test PDF document for Unstructured API parsing"

    try:
        # Generate the PDF and store it in Nextcloud
        await nc_client.webdav.write_file(
            pdf_path, create_test_pdf(expected_text), content_type="application/pdf"
        )
        logger.info(f"Uploaded PDF file: {pdf_path}")

        # Reading through the MCP tool should route the file to the Unstructured API
        response = await nc_mcp_client.call_tool(
            "nc_webdav_read_file", arguments={"path": pdf_path}
        )

        # The tool result is expected to carry JSON in a text item; any other
        # item type is stringified
        first_item = response.content[0]
        raw_json = first_item.text if hasattr(first_item, "text") else str(first_item)
        payload = json.loads(raw_json)

        # Result structure ("parsed" is only present when parsing succeeded)
        for key in ("path", "content", "content_type", "parsed"):
            assert key in payload

        # The content must be readable text rather than base64
        extracted = payload["content"]
        assert isinstance(extracted, str)
        assert len(extracted) > 0
        assert "test" in extracted.lower()  # Part of the text written to the PDF

        # Parsing metadata must name the Unstructured API as the source
        assert "parsing_metadata" in payload
        assert payload["parsing_metadata"]["parsing_method"] == "unstructured_api"

        logger.info("Successfully parsed PDF using Unstructured API")

    finally:
        # Remove the uploaded file; failures here are deliberately ignored
        try:
            await nc_client.webdav.delete_resource(pdf_path)
        except Exception:
            pass
@pytest.mark.integration
async def test_unstructured_api_with_docx(
    nc_client: NextcloudClient, test_base_path: str, nc_mcp_client: ClientSession
):
    """Read a file uploaded with the DOCX MIME type and check the result structure.

    The uploaded bytes are plain text, not a real DOCX archive, so only the
    presence of the basic result keys is verified.
    """
    docx_path = f"{test_base_path}/test_unstructured_docx.docx"
    docx_mime = (
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    )

    try:
        # Mock payload: there is no python-docx dependency to build a real document
        mock_bytes = (
            b"This is a mock DOCX file content for testing Unstructured API parsing"
        )
        await nc_client.webdav.write_file(
            docx_path,
            mock_bytes,
            content_type=docx_mime,
        )
        logger.info(f"Uploaded DOCX file: {docx_path}")

        # Read the file back through the MCP tool
        response = await nc_mcp_client.call_tool(
            "nc_webdav_read_file", arguments={"path": docx_path}
        )

        # The tool result is expected to carry JSON in a text item; any other
        # item type is stringified
        first_item = response.content[0]
        raw_json = first_item.text if hasattr(first_item, "text") else str(first_item)
        payload = json.loads(raw_json)

        # Result structure
        for key in ("path", "content", "content_type"):
            assert key in payload

        logger.info("Successfully processed DOCX file with Unstructured API")

    finally:
        # Remove the uploaded file; failures here are deliberately ignored
        try:
            await nc_client.webdav.delete_resource(docx_path)
        except Exception:
            pass
+172
View File
@@ -0,0 +1,172 @@
"""Unit tests for Unstructured API configuration."""
import os
import pytest
from nextcloud_mcp_server.client.unstructured_client import UnstructuredClient
from nextcloud_mcp_server.config import (
get_unstructured_languages,
get_unstructured_strategy,
)
class TestUnstructuredStrategy:
    """Test strategy configuration.

    Environment changes go through pytest's ``monkeypatch`` fixture so that any
    value present before a test is restored afterwards instead of being removed.
    """

    def test_strategy_default(self, monkeypatch):
        """Test that strategy defaults to 'auto'."""
        monkeypatch.delenv("UNSTRUCTURED_STRATEGY", raising=False)
        assert get_unstructured_strategy() == "auto"

    def test_strategy_custom_auto(self, monkeypatch):
        """Test custom strategy 'auto'."""
        monkeypatch.setenv("UNSTRUCTURED_STRATEGY", "auto")
        assert get_unstructured_strategy() == "auto"

    def test_strategy_custom_fast(self, monkeypatch):
        """Test custom strategy 'fast'."""
        monkeypatch.setenv("UNSTRUCTURED_STRATEGY", "fast")
        assert get_unstructured_strategy() == "fast"

    def test_strategy_custom_hi_res(self, monkeypatch):
        """Test custom strategy 'hi_res'."""
        monkeypatch.setenv("UNSTRUCTURED_STRATEGY", "hi_res")
        assert get_unstructured_strategy() == "hi_res"

    def test_strategy_invalid_fallback(self, monkeypatch, caplog):
        """Test that invalid strategy falls back to 'hi_res'."""
        import logging

        monkeypatch.setenv("UNSTRUCTURED_STRATEGY", "invalid_strategy")
        # Ensure logging is captured at WARNING level
        with caplog.at_level(logging.WARNING):
            strategy = get_unstructured_strategy()
        assert strategy == "hi_res"
        assert "Invalid UNSTRUCTURED_STRATEGY" in caplog.text

    def test_strategy_case_insensitive(self, monkeypatch):
        """Test that strategy is case-insensitive."""
        monkeypatch.setenv("UNSTRUCTURED_STRATEGY", "HI_RES")
        assert get_unstructured_strategy() == "hi_res"
class TestUnstructuredLanguages:
    """Test language configuration.

    Environment changes go through pytest's ``monkeypatch`` fixture so that any
    value present before a test is restored afterwards instead of being removed.
    """

    def test_languages_default(self, monkeypatch):
        """Test that languages default to English and German."""
        monkeypatch.delenv("UNSTRUCTURED_LANGUAGES", raising=False)
        assert get_unstructured_languages() == ["eng", "deu"]

    def test_languages_single(self, monkeypatch):
        """Test single language configuration."""
        monkeypatch.setenv("UNSTRUCTURED_LANGUAGES", "eng")
        assert get_unstructured_languages() == ["eng"]

    def test_languages_multiple(self, monkeypatch):
        """Test multiple languages configuration."""
        monkeypatch.setenv("UNSTRUCTURED_LANGUAGES", "eng,fra,spa")
        assert get_unstructured_languages() == ["eng", "fra", "spa"]

    def test_languages_whitespace_trimming(self, monkeypatch):
        """Test that whitespace is trimmed from language codes."""
        monkeypatch.setenv("UNSTRUCTURED_LANGUAGES", "eng, deu , fra ")
        assert get_unstructured_languages() == ["eng", "deu", "fra"]

    def test_languages_empty_fallback(self, monkeypatch, caplog):
        """Test that empty languages string falls back to default."""
        import logging

        monkeypatch.setenv("UNSTRUCTURED_LANGUAGES", "")
        with caplog.at_level(logging.WARNING):
            languages = get_unstructured_languages()
        assert languages == ["eng", "deu"]
        assert "No languages specified" in caplog.text

    def test_languages_only_whitespace_fallback(self, monkeypatch, caplog):
        """Test that whitespace-only string falls back to default."""
        import logging

        monkeypatch.setenv("UNSTRUCTURED_LANGUAGES", " , , ")
        with caplog.at_level(logging.WARNING):
            languages = get_unstructured_languages()
        assert languages == ["eng", "deu"]
        assert "No languages specified" in caplog.text
class TestUnstructuredClientConfiguration:
    """Test that UnstructuredClient respects configuration.

    Environment changes go through pytest's ``monkeypatch`` fixture so that any
    value present before a test is restored afterwards. Parsing is enabled
    explicitly because the client refuses to initialise when
    ENABLE_UNSTRUCTURED_PARSING disables it and no URL is passed in.
    """

    @pytest.mark.asyncio
    async def test_client_uses_default_strategy(self, monkeypatch):
        """Test that client uses default strategy from environment."""
        monkeypatch.delenv("UNSTRUCTURED_STRATEGY", raising=False)
        monkeypatch.setenv("ENABLE_UNSTRUCTURED_PARSING", "true")
        monkeypatch.setenv("UNSTRUCTURED_API_URL", "http://test:8000")

        client = UnstructuredClient()
        assert client.api_url == "http://test:8000"
        # The partition_document method should use get_unstructured_strategy() when strategy is None
        # We can't test the actual call without a running API, but we can verify the config is read
        assert get_unstructured_strategy() == "auto"

    @pytest.mark.asyncio
    async def test_client_uses_default_languages(self, monkeypatch):
        """Test that client uses default languages from environment."""
        monkeypatch.delenv("UNSTRUCTURED_LANGUAGES", raising=False)
        monkeypatch.setenv("ENABLE_UNSTRUCTURED_PARSING", "true")
        monkeypatch.setenv("UNSTRUCTURED_API_URL", "http://test:8000")

        client = UnstructuredClient()
        assert client.api_url == "http://test:8000"
        # The partition_document method should use get_unstructured_languages() when languages is None
        assert get_unstructured_languages() == ["eng", "deu"]

    @pytest.mark.asyncio
    async def test_client_uses_custom_configuration(self, monkeypatch):
        """Test that client uses custom configuration from environment."""
        monkeypatch.setenv("UNSTRUCTURED_STRATEGY", "hi_res")
        monkeypatch.setenv("UNSTRUCTURED_LANGUAGES", "eng,fra,spa")
        monkeypatch.setenv("ENABLE_UNSTRUCTURED_PARSING", "true")
        monkeypatch.setenv("UNSTRUCTURED_API_URL", "http://test:8000")

        client = UnstructuredClient()
        assert client.api_url == "http://test:8000"
        assert get_unstructured_strategy() == "hi_res"
        assert get_unstructured_languages() == ["eng", "fra", "spa"]
Generated
+972 -947
View File
File diff suppressed because it is too large Load Diff