# Recovered from a line-collapsed `git format-patch` mail. Original header:
#   From 6730dd4a4b913888f9014f546ec446e726ba4936 Mon Sep 17 00:00:00 2001
#   From: yuisheaven <91854357+yuisheaven@users.noreply.github.com>
#   Date: Thu, 23 Oct 2025 22:38:27 +0200
#   Subject: [PATCH] added new tests for unstructured api (pdf and docx workflow)
#   Target: tests/integration/test_unstructured_api.py (new file)
"""Integration tests for Unstructured API functionality."""

import base64
import json
import logging
import uuid
from io import BytesIO
from typing import Any, AsyncIterator, Dict

import pytest
from mcp.client.session import ClientSession
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas

from nextcloud_mcp_server.client import NextcloudClient

logger = logging.getLogger(__name__)


async def _delete_quietly(nc_client: NextcloudClient, path: str) -> None:
    """Delete *path* over WebDAV, logging (never raising) on failure.

    Cleanup is best-effort: a failed delete must not mask the real outcome
    of a test, but it should still leave a trace in the log.
    """
    try:
        await nc_client.webdav.delete_resource(path)
    except Exception as exc:  # deliberate catch-all: cleanup must not fail a test
        logger.warning("Cleanup of %s failed: %s", path, exc)


def _parse_tool_result(mcp_result: Any) -> Dict[str, Any]:
    """Decode the JSON payload carried by the first content item of a tool result.

    Args:
        mcp_result: The object returned by ``ClientSession.call_tool``.

    Returns:
        The decoded JSON document.

    Raises:
        AssertionError: If the tool reported an error or returned no content.
        json.JSONDecodeError: If the first content item is not valid JSON.
    """
    assert not getattr(mcp_result, "isError", False), (
        f"Tool call failed: {mcp_result.content}"
    )
    assert mcp_result.content, "Tool call returned no content"

    first_item = mcp_result.content[0]
    if hasattr(first_item, "text"):
        result_text = first_item.text
    else:
        # Fallback for other content types
        result_text = str(first_item)
    return json.loads(result_text)


async def _read_file_via_mcp(nc_mcp_client: ClientSession, path: str) -> Dict[str, Any]:
    """Call the ``nc_webdav_read_file`` tool for *path* and return its decoded JSON."""
    mcp_result = await nc_mcp_client.call_tool(
        "nc_webdav_read_file", arguments={"path": path}
    )
    return _parse_tool_result(mcp_result)


@pytest.fixture
async def test_base_path(nc_client: NextcloudClient) -> AsyncIterator[str]:
    """Create a uniquely named WebDAV directory for a test and remove it afterwards.

    Yields:
        The directory path (relative to the WebDAV root) that the test may use.
    """
    test_dir = f"mcp_test_unstructured_{uuid.uuid4().hex[:8]}"
    await nc_client.webdav.create_directory(test_dir)
    yield test_dir
    await _delete_quietly(nc_client, test_dir)


def create_test_pdf(text: str) -> bytes:
    """Create a simple single-page PDF document containing *text*.

    Args:
        text: The string drawn onto the page.

    Returns:
        The raw bytes of the generated PDF.
    """
    buffer = BytesIO()
    pdf = canvas.Canvas(buffer, pagesize=letter)
    pdf.drawString(100, 750, text)
    pdf.save()
    # getvalue() returns the whole buffer regardless of the stream position.
    return buffer.getvalue()


@pytest.mark.integration
async def test_unstructured_api_enabled_parsing(
    nc_client: NextcloudClient, test_base_path: str, nc_mcp_client: ClientSession
):
    """Test that documents are parsed using the Unstructured API when enabled."""
    test_file = f"{test_base_path}/test_unstructured_pdf.pdf"
    test_text = "This is a test PDF document for Unstructured API parsing"

    try:
        # Create and upload a simple PDF
        pdf_content = create_test_pdf(test_text)
        await nc_client.webdav.write_file(
            test_file, pdf_content, content_type="application/pdf"
        )
        logger.info("Uploaded PDF file: %s", test_file)

        # Read the PDF using the MCP tool (should parse via Unstructured API)
        result = await _read_file_via_mcp(nc_mcp_client, test_file)

        # Verify the result structure
        assert "path" in result
        assert "content" in result
        assert "content_type" in result
        assert "parsed" in result  # Should be present when parsing succeeds

        # The content should be readable text, not base64
        content = result["content"]
        assert isinstance(content, str)
        assert len(content) > 0
        assert "test" in content.lower()  # Should contain our test text

        # Should have parsing metadata
        assert "parsing_metadata" in result
        parsing_metadata = result["parsing_metadata"]
        assert parsing_metadata["parsing_method"] == "unstructured_api"

        logger.info("Successfully parsed PDF using Unstructured API")

    finally:
        await _delete_quietly(nc_client, test_file)


@pytest.mark.integration
async def test_unstructured_api_with_docx(
    nc_client: NextcloudClient, test_base_path: str, nc_mcp_client: ClientSession
):
    """Test Unstructured API with DOCX files.

    The uploaded payload is plain bytes with a DOCX content type, not a real
    DOCX document, so only the shape of the response is asserted here.
    """
    test_file = f"{test_base_path}/test_unstructured_docx.docx"
    try:
        # Create a simple DOCX-like file for testing purposes; python-docx is
        # intentionally not a dependency, so no real document is generated.
        docx_content = (
            b"This is a mock DOCX file content for testing Unstructured API parsing"
        )

        # Upload the file
        await nc_client.webdav.write_file(
            test_file,
            docx_content,
            content_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        )
        logger.info("Uploaded DOCX file: %s", test_file)

        # Read the file using the MCP tool
        result = await _read_file_via_mcp(nc_mcp_client, test_file)

        # Verify the result structure
        assert "path" in result
        assert "content" in result
        assert "content_type" in result

        logger.info("Successfully processed DOCX file with Unstructured API")

    finally:
        await _delete_quietly(nc_client, test_file)