nextcloud-mcp-server/tests/integration/test_unstructured_api.py

"""Integration tests for Unstructured API functionality."""

import json
import logging
import os
import uuid
from io import BytesIO

import pytest
from mcp.client.session import ClientSession
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas

from nextcloud_mcp_server.client import NextcloudClient

logger = logging.getLogger(__name__)


@pytest.fixture
async def test_base_path(nc_client: NextcloudClient):
    """Base path for test files/directories."""
    test_dir = f"mcp_test_unstructured_{uuid.uuid4().hex[:8]}"
    await nc_client.webdav.create_directory(test_dir)
    yield test_dir
    try:
        await nc_client.webdav.delete_resource(test_dir)
    except Exception:
        pass  # Ignore cleanup errors


def create_test_pdf(text: str) -> bytes:
    """Create a simple PDF document with the given text."""
    buffer = BytesIO()
    c = canvas.Canvas(buffer, pagesize=letter)
    c.drawString(100, 750, text)
    c.save()
    buffer.seek(0)
    return buffer.getvalue()


@pytest.mark.skipif(
    condition=os.getenv("ENABLE_UNSTRUCTURED", "false") != "true",
    reason="Unstructured is not enabled",
)
async def test_unstructured_api_enabled_parsing(
    nc_client: NextcloudClient, test_base_path: str, nc_mcp_client: ClientSession
):
    """Test that documents are parsed using the Unstructured API when enabled."""
    test_file = f"{test_base_path}/test_unstructured_pdf.pdf"
    test_text = "This is a test PDF document for Unstructured API parsing"

    try:
        # Create a simple PDF
        pdf_content = create_test_pdf(test_text)

        # Upload the PDF
        await nc_client.webdav.write_file(
            test_file, pdf_content, content_type="application/pdf"
        )
        logger.info(f"Uploaded PDF file: {test_file}")

        # Read the PDF using MCP tool (should parse via Unstructured API)
        mcp_result = await nc_mcp_client.call_tool(
            "nc_webdav_read_file", arguments={"path": test_file}
        )

        # Extract content from the MCP result
        if hasattr(mcp_result.content[0], "text"):
            result_text = mcp_result.content[0].text
        else:
            # Fallback for other content types
            result_text = str(mcp_result.content[0])

        # Parse the JSON response
        result = json.loads(result_text)

        # Verify the result structure
        assert "path" in result
        assert "content" in result
        assert "content_type" in result
        assert "parsed" in result  # Should be present when parsing succeeds

        # The content should be readable text, not base64
        content = result["content"]
        assert isinstance(content, str)
        assert len(content) > 0
        assert "test" in content.lower()  # Should contain our test text

        # Should have parsing metadata
        assert "parsing_metadata" in result
        parsing_metadata = result["parsing_metadata"]
        assert parsing_metadata["parsing_method"] == "unstructured_api"

        logger.info("Successfully parsed PDF using Unstructured API")

    finally:
        # Clean up
        try:
            await nc_client.webdav.delete_resource(test_file)
        except Exception:
            pass  # Ignore cleanup errors


@pytest.mark.skipif(
    condition=os.getenv("ENABLE_UNSTRUCTURED", "false") != "true",
    reason="Unstructured is not enabled",
)
async def test_unstructured_api_with_docx(
    nc_client: NextcloudClient, test_base_path: str, nc_mcp_client: ClientSession
):
    """Test Unstructured API with DOCX files."""
    test_file = f"{test_base_path}/test_unstructured_docx.docx"
    try:
        # Create a simple DOCX-like file for testing purposes
        # Since we're removing python-docx dependency, we'll create a simple file
        docx_content = (
            b"This is a mock DOCX file content for testing Unstructured API parsing"
        )

        # Upload the file
        await nc_client.webdav.write_file(
            test_file,
            docx_content,
            content_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        )
        logger.info(f"Uploaded DOCX file: {test_file}")

        # Read the file using MCP tool
        mcp_result = await nc_mcp_client.call_tool(
            "nc_webdav_read_file", arguments={"path": test_file}
        )

        # Extract content from the MCP result
        if hasattr(mcp_result.content[0], "text"):
            result_text = mcp_result.content[0].text
        else:
            # Fallback for other content types
            result_text = str(mcp_result.content[0])

        # Parse the JSON response
        result = json.loads(result_text)

        # Verify the result structure
        assert "path" in result
        assert "content" in result
        assert "content_type" in result

        logger.info("Successfully processed DOCX file with Unstructured API")

    finally:
        # Clean up
        try:
            await nc_client.webdav.delete_resource(test_file)
        except Exception:
            pass  # Ignore cleanup errors