Files
Chris Coutinho b8010270c1 fix: Add async/await, PDF metadata, and type safety fixes
This commit addresses multiple issues with async operations, PDF metadata
extraction, and type safety in document processing and search.

## Async/Await Fixes
- processor.py:259 - Added await for chunker.chunk_text(content)
- processor.py:270 - Added await for bm25_service.encode_batch(chunk_texts)
- tests/unit/test_document_chunker.py - Converted all 12 test methods to async

## PDF Metadata Enhancement
- pymupdf.py:143 - Added file_size metadata extraction
- pymupdf.py:145-206 - Refactored to extract text page-by-page
  - Manually loop through pages instead of using page_chunks=True
  - Generate page_boundaries metadata for precise page tracking
  - Works around pymupdf.layout.activate() breaking page_chunks=True
- processor.py:32-66 - Added assign_page_numbers() helper function
  - Assigns page numbers to chunks based on overlap with page boundaries
  - Handles chunks spanning multiple pages
- processor.py:298-300 - Call assign_page_numbers() for PDF files

## Type Safety Fixes
- bm25_hybrid.py:184 - Removed int() conversion of doc_id
- semantic.py:131 - Removed int() conversion of doc_id
- viz_routes.py:275 - Removed int() conversion of doc_id
- Added comments documenting that doc_id can be int (notes) or str (file paths)

## Testing
- All 18 tests passing (12 unit + 6 integration)
- No type errors in modified files
- Container logs show successful processing
- Vector viz searches working correctly

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-20 02:37:07 +01:00

362 lines
12 KiB
Python

"""Integration tests for PDF document indexing and semantic search.
These tests validate the complete PDF processing flow:
1. Process PDF with PyMuPDFProcessor
2. Chunk extracted text with page numbers
3. Index chunks into Qdrant with metadata
4. Perform semantic search on PDF content
5. Verify page numbers and metadata are preserved
"""
import pymupdf
import pytest
from qdrant_client import AsyncQdrantClient
from qdrant_client.models import Distance, PointStruct, VectorParams
from nextcloud_mcp_server.document_processors.pymupdf import PyMuPDFProcessor
from nextcloud_mcp_server.embedding import SimpleEmbeddingProvider
from nextcloud_mcp_server.vector.document_chunker import (
ChunkWithPosition,
RecursiveCharacterTextSplitter,
)
# Apply the ``integration`` marker to every test in this module.
pytestmark = pytest.mark.integration
def create_test_pdf() -> bytes:
    """Create a small three-page test PDF and return it as raw bytes.

    Every page is A4-sized (595 x 842) and receives one block of text
    inserted at position (50, 50): an introduction, an installation
    chapter and a configuration chapter.

    Returns:
        The serialized PDF document.
    """
    page_texts = (
        # Page 1: Introduction
        (
            "Nextcloud Administration Guide\n\n"
            "Chapter 1: Introduction\n\n"
            "Nextcloud is a self-hosted file sharing and collaboration platform. "
            "It provides secure file storage, sharing, and synchronization across devices. "
            "This guide covers installation, configuration, and maintenance of Nextcloud."
        ),
        # Page 2: Installation
        (
            "Chapter 2: Installation\n\n"
            "System Requirements:\n"
            "- PHP 8.0 or higher\n"
            "- MySQL 8.0 or MariaDB 10.5\n"
            "- Apache or Nginx web server\n\n"
            "Installation steps:\n"
            "1. Download Nextcloud package\n"
            "2. Extract to web server directory\n"
            "3. Configure database connection\n"
            "4. Run installation wizard"
        ),
        # Page 3: Configuration
        (
            "Chapter 3: Configuration\n\n"
            "Database Configuration:\n"
            "Edit config/config.php to set database parameters. "
            "Configure database host, username, password, and database name. "
            "For optimal performance, use MySQL or MariaDB.\n\n"
            "Security Settings:\n"
            "Enable HTTPS, configure trusted domains, and set up firewall rules."
        ),
    )

    doc = pymupdf.open()
    try:
        for text in page_texts:
            page = doc.new_page(width=595, height=842)  # A4 size
            page.insert_text((50, 50), text)
        # Serialize while the document is still open.
        return doc.tobytes()
    finally:
        # Release the document even if page creation or serialization fails.
        doc.close()
@pytest.fixture
async def simple_embedding_provider():
    """Provide a ``SimpleEmbeddingProvider`` configured with ``dimension=384``."""
    provider = SimpleEmbeddingProvider(dimension=384)
    return provider
@pytest.fixture
async def qdrant_test_client():
    """Yield an ``AsyncQdrantClient`` opened on ``":memory:"`` and close it afterwards."""
    in_memory_client = AsyncQdrantClient(":memory:")
    yield in_memory_client
    # Teardown: executed after the consuming test has finished.
    await in_memory_client.close()
@pytest.fixture
async def test_collection(qdrant_test_client: AsyncQdrantClient):
    """Create the ``test_pdf_indexing`` collection and yield its name.

    The collection is created with vectors of size 384 and cosine distance.
    After the test, the collection is deleted on a best-effort basis.

    Args:
        qdrant_test_client: Client used to create and delete the collection.

    Yields:
        The name of the created collection.
    """
    import logging

    collection_name = "test_pdf_indexing"

    await qdrant_test_client.create_collection(
        collection_name=collection_name,
        vectors_config=VectorParams(size=384, distance=Distance.COSINE),
    )

    yield collection_name

    # Cleanup stays best-effort so a failed delete never fails the test,
    # but the failure is logged instead of being silently discarded.
    try:
        await qdrant_test_client.delete_collection(collection_name)
    except Exception:
        logging.getLogger(__name__).warning(
            "Failed to delete test collection %r", collection_name, exc_info=True
        )
@pytest.fixture
def pymupdf_processor():
    """Provide a ``PyMuPDFProcessor`` constructed with ``extract_images=False``."""
    processor = PyMuPDFProcessor(extract_images=False)
    return processor
async def test_pymupdf_processor_extracts_text_and_metadata(pymupdf_processor):
    """Process the generated test PDF and check the extracted text and metadata."""
    pdf_bytes = create_test_pdf()
    outcome = await pymupdf_processor.process(
        content=pdf_bytes,
        content_type="application/pdf",
        filename="test-admin-guide.pdf",
    )

    # Shape of the processing result.
    assert outcome.success is True
    assert outcome.processor == "pymupdf"
    assert outcome.text is not None
    assert len(outcome.text) > 0

    # Text from every page of the generated PDF must be present.
    expected_snippets = (
        "Nextcloud Administration Guide",
        "Chapter 1: Introduction",
        "Chapter 2: Installation",
        "Chapter 3: Configuration",
        "PHP 8.0 or higher",
        "MySQL",
    )
    for snippet in expected_snippets:
        assert snippet in outcome.text

    # Document-level metadata.
    assert outcome.metadata is not None
    assert outcome.metadata["page_count"] == 3
    assert outcome.metadata["filename"] == "test-admin-guide.pdf"
    assert "format" in outcome.metadata
async def test_document_chunker_preserves_page_numbers():
    """Check that ``ChunkWithPosition`` keeps the page number it was built with."""
    # One row per chunk: (text, start_offset, end_offset, page_number).
    # NOTE(review): every span is 28 wide while each text is 27 characters
    # long — confirm the offset convention expected by ChunkWithPosition.
    chunk_specs = [
        ("Chapter 1 content on page 1", 0, 28, 1),
        ("Chapter 2 content on page 2", 29, 57, 2),
        ("Chapter 3 content on page 3", 58, 86, 3),
    ]
    chunks = [
        ChunkWithPosition(
            text=text,
            start_offset=start,
            end_offset=end,
            page_number=page,
        )
        for text, start, end, page in chunk_specs
    ]

    # Each chunk must report the page number it was constructed with.
    assert [chunk.page_number for chunk in chunks] == [1, 2, 3]
async def test_pdf_indexing_and_search_flow(
    pymupdf_processor: PyMuPDFProcessor,
    qdrant_test_client: AsyncQdrantClient,
    test_collection: str,
    simple_embedding_provider: SimpleEmbeddingProvider,
):
    """Exercise the PDF flow end to end: process, chunk, index, then search."""
    pdf_path = "/Documents/admin-guide.pdf"

    async def search(text):
        """Embed *text* and return up to three matching points."""
        vector = await simple_embedding_provider.embed(text)
        found = await qdrant_test_client.query_points(
            collection_name=test_collection,
            query=vector,
            limit=3,
            score_threshold=0.0,
        )
        return found.points

    # Step 1: run the generated PDF through the processor.
    pdf_bytes = create_test_pdf()
    processed = await pymupdf_processor.process(
        content=pdf_bytes,
        content_type="application/pdf",
        filename=pdf_path,
    )
    assert processed.success is True
    assert processed.metadata["page_count"] == 3

    # Step 2: split the extracted text into chunks.
    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    pieces = splitter.split_text(processed.text)
    assert len(pieces) > 0

    # Step 3: index the chunks with PDF metadata. Page numbers are not
    # tracked through the splitter in this test; they are approximated from
    # chapter keywords in each chunk, falling back to page 1.
    page_markers = (
        (2, ("Chapter 2", "Installation")),
        (3, ("Chapter 3", "Configuration")),
    )
    points = []
    for position, piece in enumerate(pieces):
        vector = await simple_embedding_provider.embed(piece)
        page_number = next(
            (
                page
                for page, markers in page_markers
                if any(marker in piece for marker in markers)
            ),
            1,
        )
        payload = {
            "user_id": "admin",
            "doc_id": pdf_path,
            "doc_type": "file",
            "title": "Nextcloud Administration Guide",
            "file_path": pdf_path,
            "mime_type": "application/pdf",
            "page_number": page_number,
            "page_count": processed.metadata["page_count"],
            "chunk_index": position,
            "excerpt": piece[:200],
        }
        points.append(PointStruct(id=position, vector=vector, payload=payload))

    await qdrant_test_client.upsert(
        collection_name=test_collection, points=points, wait=True
    )

    # Step 4: search for installation instructions.
    hits = await search("how to install Nextcloud system requirements")
    assert len(hits) > 0

    # The best hit is expected to come from the installation chapter (page 2).
    best = hits[0].payload
    assert best["doc_type"] == "file"
    assert best["file_path"] == pdf_path
    assert "Installation" in best["excerpt"] or best["page_number"] == 2

    # Page information must survive the round trip through Qdrant.
    assert best["page_number"] in [1, 2, 3]
    assert best["page_count"] == 3

    # Step 5: search for configuration; one of the top two hits should be
    # from the configuration chapter (page 3).
    hits = await search("database configuration settings MySQL")
    assert len(hits) > 0
    assert any(
        "Configuration" in hit.payload["excerpt"] or hit.payload["page_number"] == 3
        for hit in hits[:2]
    )
async def test_pdf_search_with_filters(
    pymupdf_processor: PyMuPDFProcessor,
    qdrant_test_client: AsyncQdrantClient,
    test_collection: str,
    simple_embedding_provider: SimpleEmbeddingProvider,
):
    """Test PDF search restricted by a metadata filter.

    The generated test PDF is processed, chunked and indexed with
    ``doc_type="file"`` payloads. A query filtered on that ``doc_type`` must
    then return at least one point, and every returned point must carry the
    expected ``doc_type`` and ``mime_type``.
    """
    from qdrant_client.models import FieldCondition, Filter, MatchValue

    # Process the PDF.
    pdf_bytes = create_test_pdf()
    processed = await pymupdf_processor.process(
        content=pdf_bytes,
        content_type="application/pdf",
        filename="/Documents/admin-guide.pdf",
    )
    # Fail fast with a clear message instead of an obscure error further down
    # if processing did not succeed.
    assert processed.success is True

    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    chunks = splitter.split_text(processed.text)
    assert len(chunks) > 0

    # Index every chunk with its metadata payload.
    points = []
    for idx, chunk_text in enumerate(chunks):
        embedding = await simple_embedding_provider.embed(chunk_text)
        points.append(
            PointStruct(
                id=idx,
                vector=embedding,
                payload={
                    "user_id": "admin",
                    "doc_id": "/Documents/admin-guide.pdf",
                    "doc_type": "file",
                    "mime_type": "application/pdf",
                    "excerpt": chunk_text[:200],
                },
            )
        )

    await qdrant_test_client.upsert(
        collection_name=test_collection, points=points, wait=True
    )

    # Search, restricting results to points whose doc_type is "file".
    query = "Nextcloud installation"
    query_embedding = await simple_embedding_provider.embed(query)
    response = await qdrant_test_client.query_points(
        collection_name=test_collection,
        query=query_embedding,
        query_filter=Filter(
            must=[FieldCondition(key="doc_type", match=MatchValue(value="file"))]
        ),
        limit=3,
    )

    # All results should be from file documents. The loop variable is named
    # ``point`` so it does not shadow the processing result above.
    assert len(response.points) > 0
    for point in response.points:
        assert point.payload["doc_type"] == "file"
        assert point.payload["mime_type"] == "application/pdf"
async def test_pymupdf_health_check(pymupdf_processor: PyMuPDFProcessor):
    """The processor's health check must report exactly ``True``."""
    assert (await pymupdf_processor.health_check()) is True
async def test_pymupdf_supports_pdf_mime_type(pymupdf_processor: PyMuPDFProcessor):
    """The processor must list ``application/pdf`` and be named ``pymupdf``."""
    advertised_types = pymupdf_processor.supported_mime_types
    assert "application/pdf" in advertised_types

    processor_name = pymupdf_processor.name
    assert processor_name == "pymupdf"