added new tests for unstructured api (pdf and docx workflow)
This commit is contained in:
@@ -0,0 +1,150 @@
|
||||
"""Integration tests for Unstructured API functionality."""
|
||||
|
||||
import base64
|
||||
import json
|
||||
import logging
|
||||
import uuid
|
||||
from typing import Any, Dict
|
||||
from io import BytesIO
|
||||
|
||||
import pytest
|
||||
from mcp.client.session import ClientSession
|
||||
from reportlab.lib.pagesizes import letter
|
||||
from reportlab.pdfgen import canvas
|
||||
|
||||
from nextcloud_mcp_server.client import NextcloudClient
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
async def test_base_path(nc_client: NextcloudClient):
|
||||
"""Base path for test files/directories."""
|
||||
test_dir = f"mcp_test_unstructured_{uuid.uuid4().hex[:8]}"
|
||||
await nc_client.webdav.create_directory(test_dir)
|
||||
yield test_dir
|
||||
try:
|
||||
await nc_client.webdav.delete_resource(test_dir)
|
||||
except Exception:
|
||||
pass # Ignore cleanup errors
|
||||
|
||||
|
||||
def create_test_pdf(text: str) -> bytes:
|
||||
"""Create a simple PDF document with the given text."""
|
||||
buffer = BytesIO()
|
||||
c = canvas.Canvas(buffer, pagesize=letter)
|
||||
c.drawString(100, 750, text)
|
||||
c.save()
|
||||
buffer.seek(0)
|
||||
return buffer.getvalue()
|
||||
|
||||
|
||||
@pytest.mark.integration
|
||||
async def test_unstructured_api_enabled_parsing(
|
||||
nc_client: NextcloudClient, test_base_path: str, nc_mcp_client: ClientSession
|
||||
):
|
||||
"""Test that documents are parsed using the Unstructured API when enabled."""
|
||||
test_file = f"{test_base_path}/test_unstructured_pdf.pdf"
|
||||
test_text = "This is a test PDF document for Unstructured API parsing"
|
||||
|
||||
try:
|
||||
# Create a simple PDF
|
||||
pdf_content = create_test_pdf(test_text)
|
||||
|
||||
# Upload the PDF
|
||||
await nc_client.webdav.write_file(
|
||||
test_file, pdf_content, content_type="application/pdf"
|
||||
)
|
||||
logger.info(f"Uploaded PDF file: {test_file}")
|
||||
|
||||
# Read the PDF using MCP tool (should parse via Unstructured API)
|
||||
mcp_result = await nc_mcp_client.call_tool(
|
||||
"nc_webdav_read_file", arguments={"path": test_file}
|
||||
)
|
||||
|
||||
# Extract content from the MCP result
|
||||
if hasattr(mcp_result.content[0], "text"):
|
||||
result_text = mcp_result.content[0].text
|
||||
else:
|
||||
# Fallback for other content types
|
||||
result_text = str(mcp_result.content[0])
|
||||
|
||||
# Parse the JSON response
|
||||
result = json.loads(result_text)
|
||||
|
||||
# Verify the result structure
|
||||
assert "path" in result
|
||||
assert "content" in result
|
||||
assert "content_type" in result
|
||||
assert "parsed" in result # Should be present when parsing succeeds
|
||||
|
||||
# The content should be readable text, not base64
|
||||
content = result["content"]
|
||||
assert isinstance(content, str)
|
||||
assert len(content) > 0
|
||||
assert "test" in content.lower() # Should contain our test text
|
||||
|
||||
# Should have parsing metadata
|
||||
assert "parsing_metadata" in result
|
||||
parsing_metadata = result["parsing_metadata"]
|
||||
assert parsing_metadata["parsing_method"] == "unstructured_api"
|
||||
|
||||
logger.info("Successfully parsed PDF using Unstructured API")
|
||||
|
||||
finally:
|
||||
# Clean up
|
||||
try:
|
||||
await nc_client.webdav.delete_resource(test_file)
|
||||
except Exception:
|
||||
pass # Ignore cleanup errors
|
||||
|
||||
|
||||
@pytest.mark.integration
|
||||
async def test_unstructured_api_with_docx(
|
||||
nc_client: NextcloudClient, test_base_path: str, nc_mcp_client: ClientSession
|
||||
):
|
||||
"""Test Unstructured API with DOCX files."""
|
||||
test_file = f"{test_base_path}/test_unstructured_docx.docx"
|
||||
try:
|
||||
# Create a simple DOCX-like file for testing purposes
|
||||
# Since we're removing python-docx dependency, we'll create a simple file
|
||||
docx_content = (
|
||||
b"This is a mock DOCX file content for testing Unstructured API parsing"
|
||||
)
|
||||
|
||||
# Upload the file
|
||||
await nc_client.webdav.write_file(
|
||||
test_file,
|
||||
docx_content,
|
||||
content_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
)
|
||||
logger.info(f"Uploaded DOCX file: {test_file}")
|
||||
|
||||
# Read the file using MCP tool
|
||||
mcp_result = await nc_mcp_client.call_tool(
|
||||
"nc_webdav_read_file", arguments={"path": test_file}
|
||||
)
|
||||
|
||||
# Extract content from the MCP result
|
||||
if hasattr(mcp_result.content[0], "text"):
|
||||
result_text = mcp_result.content[0].text
|
||||
else:
|
||||
# Fallback for other content types
|
||||
result_text = str(mcp_result.content[0])
|
||||
|
||||
# Parse the JSON response
|
||||
result = json.loads(result_text)
|
||||
|
||||
# Verify the result structure
|
||||
assert "path" in result
|
||||
assert "content" in result
|
||||
assert "content_type" in result
|
||||
|
||||
logger.info("Successfully processed DOCX file with Unstructured API")
|
||||
|
||||
finally:
|
||||
# Clean up
|
||||
try:
|
||||
await nc_client.webdav.delete_resource(test_file)
|
||||
except Exception:
|
||||
pass # Ignore cleanup errors
|
||||
Reference in New Issue
Block a user