b8010270c1
This commit addresses multiple issues with async operations, PDF metadata extraction, and type safety in document processing and search. ## Async/Await Fixes - processor.py:259 - Added await for chunker.chunk_text(content) - processor.py:270 - Added await for bm25_service.encode_batch(chunk_texts) - tests/unit/test_document_chunker.py - Converted all 12 test methods to async ## PDF Metadata Enhancement - pymupdf.py:143 - Added file_size metadata extraction - pymupdf.py:145-206 - Refactored to extract text page-by-page - Manually loop through pages instead of using page_chunks=True - Generate page_boundaries metadata for precise page tracking - Works around pymupdf.layout.activate() breaking page_chunks=True - processor.py:32-66 - Added assign_page_numbers() helper function - Assigns page numbers to chunks based on overlap with page boundaries - Handles chunks spanning multiple pages - processor.py:298-300 - Call assign_page_numbers() for PDF files ## Type Safety Fixes - bm25_hybrid.py:184 - Removed int() conversion of doc_id - semantic.py:131 - Removed int() conversion of doc_id - viz_routes.py:275 - Removed int() conversion of doc_id - Added comments documenting that doc_id can be int (notes) or str (file paths) ## Testing - All 18 tests passing (12 unit + 6 integration) - No type errors in modified files - Container logs show successful processing - Vector viz searches working correctly 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
362 lines
12 KiB
Python
362 lines
12 KiB
Python
"""Integration tests for PDF document indexing and semantic search.
|
|
|
|
These tests validate the complete PDF processing flow:
|
|
1. Process PDF with PyMuPDFProcessor
|
|
2. Chunk extracted text with page numbers
|
|
3. Index chunks into Qdrant with metadata
|
|
4. Perform semantic search on PDF content
|
|
5. Verify page numbers and metadata are preserved
|
|
"""
|
|
|
|
import pymupdf
|
|
import pytest
|
|
from qdrant_client import AsyncQdrantClient
|
|
from qdrant_client.models import Distance, PointStruct, VectorParams
|
|
|
|
from nextcloud_mcp_server.document_processors.pymupdf import PyMuPDFProcessor
|
|
from nextcloud_mcp_server.embedding import SimpleEmbeddingProvider
|
|
from nextcloud_mcp_server.vector.document_chunker import (
|
|
ChunkWithPosition,
|
|
RecursiveCharacterTextSplitter,
|
|
)
|
|
|
|
pytestmark = pytest.mark.integration
|
|
|
|
|
|
def create_test_pdf() -> bytes:
|
|
"""Create a small test PDF with multiple pages."""
|
|
doc = pymupdf.open()
|
|
|
|
# Page 1: Introduction
|
|
page1 = doc.new_page(width=595, height=842) # A4 size
|
|
page1.insert_text(
|
|
(50, 50),
|
|
"Nextcloud Administration Guide\n\n"
|
|
"Chapter 1: Introduction\n\n"
|
|
"Nextcloud is a self-hosted file sharing and collaboration platform. "
|
|
"It provides secure file storage, sharing, and synchronization across devices. "
|
|
"This guide covers installation, configuration, and maintenance of Nextcloud.",
|
|
)
|
|
|
|
# Page 2: Installation
|
|
page2 = doc.new_page(width=595, height=842)
|
|
page2.insert_text(
|
|
(50, 50),
|
|
"Chapter 2: Installation\n\n"
|
|
"System Requirements:\n"
|
|
"- PHP 8.0 or higher\n"
|
|
"- MySQL 8.0 or MariaDB 10.5\n"
|
|
"- Apache or Nginx web server\n\n"
|
|
"Installation steps:\n"
|
|
"1. Download Nextcloud package\n"
|
|
"2. Extract to web server directory\n"
|
|
"3. Configure database connection\n"
|
|
"4. Run installation wizard",
|
|
)
|
|
|
|
# Page 3: Configuration
|
|
page3 = doc.new_page(width=595, height=842)
|
|
page3.insert_text(
|
|
(50, 50),
|
|
"Chapter 3: Configuration\n\n"
|
|
"Database Configuration:\n"
|
|
"Edit config/config.php to set database parameters. "
|
|
"Configure database host, username, password, and database name. "
|
|
"For optimal performance, use MySQL or MariaDB.\n\n"
|
|
"Security Settings:\n"
|
|
"Enable HTTPS, configure trusted domains, and set up firewall rules.",
|
|
)
|
|
|
|
# Convert to bytes
|
|
pdf_bytes = doc.tobytes()
|
|
doc.close()
|
|
|
|
return pdf_bytes
|
|
|
|
|
|
@pytest.fixture
|
|
async def simple_embedding_provider():
|
|
"""Simple in-process embedding provider for testing."""
|
|
return SimpleEmbeddingProvider(dimension=384)
|
|
|
|
|
|
@pytest.fixture
|
|
async def qdrant_test_client():
|
|
"""Qdrant client for testing (in-memory)."""
|
|
client = AsyncQdrantClient(":memory:")
|
|
yield client
|
|
await client.close()
|
|
|
|
|
|
@pytest.fixture
|
|
async def test_collection(qdrant_test_client: AsyncQdrantClient):
|
|
"""Create test collection in Qdrant."""
|
|
collection_name = "test_pdf_indexing"
|
|
|
|
# Create collection
|
|
await qdrant_test_client.create_collection(
|
|
collection_name=collection_name,
|
|
vectors_config=VectorParams(size=384, distance=Distance.COSINE),
|
|
)
|
|
|
|
yield collection_name
|
|
|
|
# Cleanup
|
|
try:
|
|
await qdrant_test_client.delete_collection(collection_name)
|
|
except Exception:
|
|
pass
|
|
|
|
|
|
@pytest.fixture
|
|
def pymupdf_processor():
|
|
"""PyMuPDF processor for testing (without image extraction)."""
|
|
return PyMuPDFProcessor(extract_images=False)
|
|
|
|
|
|
async def test_pymupdf_processor_extracts_text_and_metadata(pymupdf_processor):
|
|
"""Test PyMuPDF processor extracts text and metadata from PDF."""
|
|
pdf_bytes = create_test_pdf()
|
|
|
|
result = await pymupdf_processor.process(
|
|
content=pdf_bytes,
|
|
content_type="application/pdf",
|
|
filename="test-admin-guide.pdf",
|
|
)
|
|
|
|
# Verify result structure
|
|
assert result.success is True
|
|
assert result.processor == "pymupdf"
|
|
assert result.text is not None
|
|
assert len(result.text) > 0
|
|
|
|
# Verify extracted text contains expected content
|
|
assert "Nextcloud Administration Guide" in result.text
|
|
assert "Chapter 1: Introduction" in result.text
|
|
assert "Chapter 2: Installation" in result.text
|
|
assert "Chapter 3: Configuration" in result.text
|
|
assert "PHP 8.0 or higher" in result.text
|
|
assert "MySQL" in result.text
|
|
|
|
# Verify metadata
|
|
assert result.metadata is not None
|
|
assert result.metadata["page_count"] == 3
|
|
assert result.metadata["filename"] == "test-admin-guide.pdf"
|
|
assert "format" in result.metadata
|
|
|
|
|
|
async def test_document_chunker_preserves_page_numbers():
|
|
"""Test that document chunker can handle chunks with page number metadata."""
|
|
# Create chunks with page numbers
|
|
chunks = [
|
|
ChunkWithPosition(
|
|
text="Chapter 1 content on page 1",
|
|
start_offset=0,
|
|
end_offset=28,
|
|
page_number=1,
|
|
),
|
|
ChunkWithPosition(
|
|
text="Chapter 2 content on page 2",
|
|
start_offset=29,
|
|
end_offset=57,
|
|
page_number=2,
|
|
),
|
|
ChunkWithPosition(
|
|
text="Chapter 3 content on page 3",
|
|
start_offset=58,
|
|
end_offset=86,
|
|
page_number=3,
|
|
),
|
|
]
|
|
|
|
# Verify page numbers are preserved
|
|
assert chunks[0].page_number == 1
|
|
assert chunks[1].page_number == 2
|
|
assert chunks[2].page_number == 3
|
|
|
|
|
|
async def test_pdf_indexing_and_search_flow(
|
|
pymupdf_processor: PyMuPDFProcessor,
|
|
qdrant_test_client: AsyncQdrantClient,
|
|
test_collection: str,
|
|
simple_embedding_provider: SimpleEmbeddingProvider,
|
|
):
|
|
"""Test complete PDF indexing and semantic search flow."""
|
|
|
|
# Step 1: Process PDF with PyMuPDF
|
|
pdf_bytes = create_test_pdf()
|
|
result = await pymupdf_processor.process(
|
|
content=pdf_bytes,
|
|
content_type="application/pdf",
|
|
filename="/Documents/admin-guide.pdf",
|
|
)
|
|
|
|
assert result.success is True
|
|
assert result.metadata["page_count"] == 3
|
|
|
|
# Step 2: Chunk the extracted text
|
|
# Note: In real implementation, we'd track which chunk came from which page
|
|
# For this test, we'll simulate by creating chunks manually
|
|
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
|
|
chunks = splitter.split_text(result.text)
|
|
|
|
assert len(chunks) > 0
|
|
|
|
# Step 3: Index chunks into Qdrant with PDF metadata
|
|
points = []
|
|
for idx, chunk_text in enumerate(chunks):
|
|
embedding = await simple_embedding_provider.embed(chunk_text)
|
|
|
|
# Simulate page number assignment (in real implementation, this would be tracked)
|
|
# For simplicity, assign page based on content
|
|
page_number = 1
|
|
if "Chapter 2" in chunk_text or "Installation" in chunk_text:
|
|
page_number = 2
|
|
elif "Chapter 3" in chunk_text or "Configuration" in chunk_text:
|
|
page_number = 3
|
|
|
|
points.append(
|
|
PointStruct(
|
|
id=idx,
|
|
vector=embedding,
|
|
payload={
|
|
"user_id": "admin",
|
|
"doc_id": "/Documents/admin-guide.pdf",
|
|
"doc_type": "file",
|
|
"title": "Nextcloud Administration Guide",
|
|
"file_path": "/Documents/admin-guide.pdf",
|
|
"mime_type": "application/pdf",
|
|
"page_number": page_number,
|
|
"page_count": result.metadata["page_count"],
|
|
"chunk_index": idx,
|
|
"excerpt": chunk_text[:200],
|
|
},
|
|
)
|
|
)
|
|
|
|
await qdrant_test_client.upsert(
|
|
collection_name=test_collection, points=points, wait=True
|
|
)
|
|
|
|
# Step 4: Perform semantic search for installation instructions
|
|
query = "how to install Nextcloud system requirements"
|
|
query_embedding = await simple_embedding_provider.embed(query)
|
|
|
|
response = await qdrant_test_client.query_points(
|
|
collection_name=test_collection,
|
|
query=query_embedding,
|
|
limit=3,
|
|
score_threshold=0.0,
|
|
)
|
|
|
|
# Verify search results
|
|
assert len(response.points) > 0
|
|
|
|
# Top result should be from installation chapter (page 2)
|
|
top_result = response.points[0]
|
|
assert top_result.payload["doc_type"] == "file"
|
|
assert top_result.payload["file_path"] == "/Documents/admin-guide.pdf"
|
|
assert (
|
|
"Installation" in top_result.payload["excerpt"]
|
|
or top_result.payload["page_number"] == 2
|
|
)
|
|
|
|
# Verify page number is preserved
|
|
assert top_result.payload["page_number"] in [1, 2, 3]
|
|
assert top_result.payload["page_count"] == 3
|
|
|
|
# Step 5: Search for configuration
|
|
query = "database configuration settings MySQL"
|
|
query_embedding = await simple_embedding_provider.embed(query)
|
|
|
|
response = await qdrant_test_client.query_points(
|
|
collection_name=test_collection,
|
|
query=query_embedding,
|
|
limit=3,
|
|
score_threshold=0.0,
|
|
)
|
|
|
|
assert len(response.points) > 0
|
|
|
|
# Should find configuration chapter (page 3)
|
|
found_config = any(
|
|
"Configuration" in r.payload["excerpt"] or r.payload["page_number"] == 3
|
|
for r in response.points[:2]
|
|
)
|
|
assert found_config
|
|
|
|
|
|
async def test_pdf_search_with_filters(
|
|
pymupdf_processor: PyMuPDFProcessor,
|
|
qdrant_test_client: AsyncQdrantClient,
|
|
test_collection: str,
|
|
simple_embedding_provider: SimpleEmbeddingProvider,
|
|
):
|
|
"""Test PDF search with metadata filters."""
|
|
from qdrant_client.models import FieldCondition, Filter, MatchValue
|
|
|
|
# Process and index PDF
|
|
pdf_bytes = create_test_pdf()
|
|
result = await pymupdf_processor.process(
|
|
content=pdf_bytes,
|
|
content_type="application/pdf",
|
|
filename="/Documents/admin-guide.pdf",
|
|
)
|
|
|
|
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
|
|
chunks = splitter.split_text(result.text)
|
|
|
|
# Index with metadata
|
|
points = []
|
|
for idx, chunk_text in enumerate(chunks):
|
|
embedding = await simple_embedding_provider.embed(chunk_text)
|
|
|
|
points.append(
|
|
PointStruct(
|
|
id=idx,
|
|
vector=embedding,
|
|
payload={
|
|
"user_id": "admin",
|
|
"doc_id": "/Documents/admin-guide.pdf",
|
|
"doc_type": "file",
|
|
"mime_type": "application/pdf",
|
|
"excerpt": chunk_text[:200],
|
|
},
|
|
)
|
|
)
|
|
|
|
await qdrant_test_client.upsert(
|
|
collection_name=test_collection, points=points, wait=True
|
|
)
|
|
|
|
# Search with filter for PDFs only
|
|
query = "Nextcloud installation"
|
|
query_embedding = await simple_embedding_provider.embed(query)
|
|
|
|
response = await qdrant_test_client.query_points(
|
|
collection_name=test_collection,
|
|
query=query_embedding,
|
|
query_filter=Filter(
|
|
must=[FieldCondition(key="doc_type", match=MatchValue(value="file"))]
|
|
),
|
|
limit=3,
|
|
)
|
|
|
|
# All results should be from file documents
|
|
assert len(response.points) > 0
|
|
for result in response.points:
|
|
assert result.payload["doc_type"] == "file"
|
|
assert result.payload["mime_type"] == "application/pdf"
|
|
|
|
|
|
async def test_pymupdf_health_check(pymupdf_processor: PyMuPDFProcessor):
|
|
"""Test PyMuPDF processor health check."""
|
|
is_healthy = await pymupdf_processor.health_check()
|
|
assert is_healthy is True
|
|
|
|
|
|
async def test_pymupdf_supports_pdf_mime_type(pymupdf_processor: PyMuPDFProcessor):
|
|
"""Test PyMuPDF processor declares PDF support."""
|
|
assert "application/pdf" in pymupdf_processor.supported_mime_types
|
|
assert pymupdf_processor.name == "pymupdf"
|