156 lines
5.0 KiB
Python
156 lines
5.0 KiB
Python
"""Integration tests for Unstructured API functionality."""
|
|
|
|
import json
|
|
import logging
|
|
import os
|
|
import uuid
|
|
from io import BytesIO
|
|
|
|
import pytest
|
|
from mcp.client.session import ClientSession
|
|
from reportlab.lib.pagesizes import letter
|
|
from reportlab.pdfgen import canvas
|
|
|
|
from nextcloud_mcp_server.client import NextcloudClient
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
@pytest.fixture
|
|
async def test_base_path(nc_client: NextcloudClient):
|
|
"""Base path for test files/directories."""
|
|
test_dir = f"mcp_test_unstructured_{uuid.uuid4().hex[:8]}"
|
|
await nc_client.webdav.create_directory(test_dir)
|
|
yield test_dir
|
|
try:
|
|
await nc_client.webdav.delete_resource(test_dir)
|
|
except Exception:
|
|
pass # Ignore cleanup errors
|
|
|
|
|
|
def create_test_pdf(text: str) -> bytes:
|
|
"""Create a simple PDF document with the given text."""
|
|
buffer = BytesIO()
|
|
c = canvas.Canvas(buffer, pagesize=letter)
|
|
c.drawString(100, 750, text)
|
|
c.save()
|
|
buffer.seek(0)
|
|
return buffer.getvalue()
|
|
|
|
|
|
@pytest.mark.skipif(
|
|
condition=os.getenv("ENABLE_UNSTRUCTURED", "false") != "true",
|
|
reason="Unstructured is not enabled",
|
|
)
|
|
async def test_unstructured_api_enabled_parsing(
|
|
nc_client: NextcloudClient, test_base_path: str, nc_mcp_client: ClientSession
|
|
):
|
|
"""Test that documents are parsed using the Unstructured API when enabled."""
|
|
test_file = f"{test_base_path}/test_unstructured_pdf.pdf"
|
|
test_text = "This is a test PDF document for Unstructured API parsing"
|
|
|
|
try:
|
|
# Create a simple PDF
|
|
pdf_content = create_test_pdf(test_text)
|
|
|
|
# Upload the PDF
|
|
await nc_client.webdav.write_file(
|
|
test_file, pdf_content, content_type="application/pdf"
|
|
)
|
|
logger.info(f"Uploaded PDF file: {test_file}")
|
|
|
|
# Read the PDF using MCP tool (should parse via Unstructured API)
|
|
mcp_result = await nc_mcp_client.call_tool(
|
|
"nc_webdav_read_file", arguments={"path": test_file}
|
|
)
|
|
|
|
# Extract content from the MCP result
|
|
if hasattr(mcp_result.content[0], "text"):
|
|
result_text = mcp_result.content[0].text
|
|
else:
|
|
# Fallback for other content types
|
|
result_text = str(mcp_result.content[0])
|
|
|
|
# Parse the JSON response
|
|
result = json.loads(result_text)
|
|
|
|
# Verify the result structure
|
|
assert "path" in result
|
|
assert "content" in result
|
|
assert "content_type" in result
|
|
assert "parsed" in result # Should be present when parsing succeeds
|
|
|
|
# The content should be readable text, not base64
|
|
content = result["content"]
|
|
assert isinstance(content, str)
|
|
assert len(content) > 0
|
|
assert "test" in content.lower() # Should contain our test text
|
|
|
|
# Should have parsing metadata
|
|
assert "parsing_metadata" in result
|
|
parsing_metadata = result["parsing_metadata"]
|
|
assert parsing_metadata["parsing_method"] == "unstructured_api"
|
|
|
|
logger.info("Successfully parsed PDF using Unstructured API")
|
|
|
|
finally:
|
|
# Clean up
|
|
try:
|
|
await nc_client.webdav.delete_resource(test_file)
|
|
except Exception:
|
|
pass # Ignore cleanup errors
|
|
|
|
|
|
@pytest.mark.skipif(
|
|
condition=os.getenv("ENABLE_UNSTRUCTURED", "false") != "true",
|
|
reason="Unstructured is not enabled",
|
|
)
|
|
async def test_unstructured_api_with_docx(
|
|
nc_client: NextcloudClient, test_base_path: str, nc_mcp_client: ClientSession
|
|
):
|
|
"""Test Unstructured API with DOCX files."""
|
|
test_file = f"{test_base_path}/test_unstructured_docx.docx"
|
|
try:
|
|
# Create a simple DOCX-like file for testing purposes
|
|
# Since we're removing python-docx dependency, we'll create a simple file
|
|
docx_content = (
|
|
b"This is a mock DOCX file content for testing Unstructured API parsing"
|
|
)
|
|
|
|
# Upload the file
|
|
await nc_client.webdav.write_file(
|
|
test_file,
|
|
docx_content,
|
|
content_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
|
)
|
|
logger.info(f"Uploaded DOCX file: {test_file}")
|
|
|
|
# Read the file using MCP tool
|
|
mcp_result = await nc_mcp_client.call_tool(
|
|
"nc_webdav_read_file", arguments={"path": test_file}
|
|
)
|
|
|
|
# Extract content from the MCP result
|
|
if hasattr(mcp_result.content[0], "text"):
|
|
result_text = mcp_result.content[0].text
|
|
else:
|
|
# Fallback for other content types
|
|
result_text = str(mcp_result.content[0])
|
|
|
|
# Parse the JSON response
|
|
result = json.loads(result_text)
|
|
|
|
# Verify the result structure
|
|
assert "path" in result
|
|
assert "content" in result
|
|
assert "content_type" in result
|
|
|
|
logger.info("Successfully processed DOCX file with Unstructured API")
|
|
|
|
finally:
|
|
# Clean up
|
|
try:
|
|
await nc_client.webdav.delete_resource(test_file)
|
|
except Exception:
|
|
pass # Ignore cleanup errors
|