added new tests for unstructured api (pdf and docx workflow)

2025-10-23 22:38:27 +02:00
parent 8734c4b292
commit 6730dd4a4b
1 changed files with 150 additions and 0 deletions
@@ -0,0 +1,150 @@
+"""Integration tests for Unstructured API functionality."""
+
+import base64
+import json
+import logging
+import uuid
+from typing import Any, Dict
+from io import BytesIO
+
+import pytest
+from mcp.client.session import ClientSession
+from reportlab.lib.pagesizes import letter
+from reportlab.pdfgen import canvas
+
+from nextcloud_mcp_server.client import NextcloudClient
+
+# Module-level logger named after this module; the tests use it for progress messages.
+logger = logging.getLogger(__name__)
+
+
+@pytest.fixture
+async def test_base_path(nc_client: NextcloudClient):
+    """Base path for test files/directories."""
+    test_dir = f"mcp_test_unstructured_{uuid.uuid4().hex[:8]}"
+    await nc_client.webdav.create_directory(test_dir)
+    yield test_dir
+    try:
+        await nc_client.webdav.delete_resource(test_dir)
+    except Exception:
+        pass  # Ignore cleanup errors
+
+
+def create_test_pdf(text: str) -> bytes:
+    """Create a simple PDF document with the given text."""
+    buffer = BytesIO()
+    c = canvas.Canvas(buffer, pagesize=letter)
+    c.drawString(100, 750, text)
+    c.save()
+    buffer.seek(0)
+    return buffer.getvalue()
+
+
+@pytest.mark.integration
+async def test_unstructured_api_enabled_parsing(
+    nc_client: NextcloudClient, test_base_path: str, nc_mcp_client: ClientSession
+):
+    """Test that documents are parsed using the Unstructured API when enabled."""
+    test_file = f"{test_base_path}/test_unstructured_pdf.pdf"
+    test_text = "This is a test PDF document for Unstructured API parsing"
+
+    try:
+        # Create a simple PDF
+        pdf_content = create_test_pdf(test_text)
+
+        # Upload the PDF
+        await nc_client.webdav.write_file(
+            test_file, pdf_content, content_type="application/pdf"
+        )
+        logger.info(f"Uploaded PDF file: {test_file}")
+
+        # Read the PDF using MCP tool (should parse via Unstructured API)
+        mcp_result = await nc_mcp_client.call_tool(
+            "nc_webdav_read_file", arguments={"path": test_file}
+        )
+
+        # Extract content from the MCP result
+        if hasattr(mcp_result.content[0], "text"):
+            result_text = mcp_result.content[0].text
+        else:
+            # Fallback for other content types
+            result_text = str(mcp_result.content[0])
+
+        # Parse the JSON response
+        result = json.loads(result_text)
+
+        # Verify the result structure
+        assert "path" in result
+        assert "content" in result
+        assert "content_type" in result
+        assert "parsed" in result  # Should be present when parsing succeeds
+
+        # The content should be readable text, not base64
+        content = result["content"]
+        assert isinstance(content, str)
+        assert len(content) > 0
+        assert "test" in content.lower()  # Should contain our test text
+
+        # Should have parsing metadata
+        assert "parsing_metadata" in result
+        parsing_metadata = result["parsing_metadata"]
+        assert parsing_metadata["parsing_method"] == "unstructured_api"
+
+        logger.info("Successfully parsed PDF using Unstructured API")
+
+    finally:
+        # Clean up
+        try:
+            await nc_client.webdav.delete_resource(test_file)
+        except Exception:
+            pass  # Ignore cleanup errors
+
+
+@pytest.mark.integration
+async def test_unstructured_api_with_docx(
+    nc_client: NextcloudClient, test_base_path: str, nc_mcp_client: ClientSession
+):
+    """Test Unstructured API with DOCX files."""
+    test_file = f"{test_base_path}/test_unstructured_docx.docx"
+    try:
+        # Create a simple DOCX-like file for testing purposes
+        # Since we're removing python-docx dependency, we'll create a simple file
+        docx_content = (
+            b"This is a mock DOCX file content for testing Unstructured API parsing"
+        )
+
+        # Upload the file
+        await nc_client.webdav.write_file(
+            test_file,
+            docx_content,
+            content_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+        )
+        logger.info(f"Uploaded DOCX file: {test_file}")
+
+        # Read the file using MCP tool
+        mcp_result = await nc_mcp_client.call_tool(
+            "nc_webdav_read_file", arguments={"path": test_file}
+        )
+
+        # Extract content from the MCP result
+        if hasattr(mcp_result.content[0], "text"):
+            result_text = mcp_result.content[0].text
+        else:
+            # Fallback for other content types
+            result_text = str(mcp_result.content[0])
+
+        # Parse the JSON response
+        result = json.loads(result_text)
+
+        # Verify the result structure
+        assert "path" in result
+        assert "content" in result
+        assert "content_type" in result
+
+        logger.info("Successfully processed DOCX file with Unstructured API")
+
+    finally:
+        # Clean up
+        try:
+            await nc_client.webdav.delete_resource(test_file)
+        except Exception:
+            pass  # Ignore cleanup errors