fix: Add async/await, PDF metadata, and type safety fixes

This commit addresses multiple issues with async operations, PDF metadata
extraction, and type safety in document processing and search.

## Async/Await Fixes
- processor.py:259 - Added await for chunker.chunk_text(content)
- processor.py:270 - Added await for bm25_service.encode_batch(chunk_texts)
- tests/unit/test_document_chunker.py - Converted all 12 test methods to async

## PDF Metadata Enhancement
- pymupdf.py:143 - Added file_size metadata extraction
- pymupdf.py:145-206 - Refactored to extract text page-by-page
  - Manually loop through pages instead of using page_chunks=True
  - Generate page_boundaries metadata for precise page tracking
  - Works around pymupdf.layout.activate() breaking page_chunks=True
- processor.py:32-66 - Added assign_page_numbers() helper function
  - Assigns page numbers to chunks based on overlap with page boundaries
  - Handles chunks spanning multiple pages
- processor.py:298-300 - Call assign_page_numbers() for PDF files

## Type Safety Fixes
- bm25_hybrid.py:184 - Removed int() conversion of doc_id
- semantic.py:131 - Removed int() conversion of doc_id
- viz_routes.py:275 - Removed int() conversion of doc_id
- Added comments documenting that doc_id can be int (notes) or str (file paths)

## Testing
- All 18 tests passing (12 unit + 6 integration)
- No type errors in modified files
- Container logs show successful processing
- Vector viz searches working correctly

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
Chris Coutinho
2025-11-20 02:37:07 +01:00
parent 0f24bdb17a
commit b8010270c1
17 changed files with 1432 additions and 35 deletions
+85 -2
View File
@@ -29,6 +29,43 @@ from nextcloud_mcp_server.vector.scanner import DocumentTask
logger = logging.getLogger(__name__)
def assign_page_numbers(chunks, page_boundaries):
    """Tag each chunk with the page that covers most of its character span.

    For every chunk, the overlap between the chunk's
    ``[start_offset, end_offset)`` range and each page's range is measured,
    and the page with the largest positive overlap is written to
    ``chunk.page_number``. When several pages share the largest overlap, the
    one listed first in ``page_boundaries`` is kept. A chunk that overlaps no
    page at all is left untouched.

    Args:
        chunks: Iterable of objects exposing ``start_offset`` and
            ``end_offset`` attributes (ChunkWithPosition objects).
        page_boundaries: List of dicts with the keys ``page``,
            ``start_offset`` and ``end_offset``. An empty or missing list
            makes the call a no-op.

    Returns:
        None. Chunks are updated in place.
    """
    if page_boundaries:
        for piece in chunks:
            best_page = None
            best_len = 0

            for span in page_boundaries:
                # Intersection of the chunk range with this page's range;
                # a negative width means the two ranges are disjoint.
                lo = max(piece.start_offset, span["start_offset"])
                hi = min(piece.end_offset, span["end_offset"])
                shared = max(0, hi - lo)

                # Strict comparison: an earlier page wins ties, and pages
                # with zero overlap are never selected.
                if shared > best_len:
                    best_len = shared
                    best_page = span["page"]

            if best_page is None:
                continue
            piece.page_number = best_page
async def processor_task(
worker_id: int,
receive_stream: MemoryObjectReceiveStream[DocumentTask],
@@ -223,6 +260,32 @@ async def _index_document(
content = f"{document['title']}\n\n{document['content']}"
title = document["title"]
etag = document.get("etag", "")
file_metadata = {} # No file-specific metadata for notes
elif doc_task.doc_type == "file":
# For files, doc_id is the file path
file_path = doc_task.doc_id
# Read file content via WebDAV
content_bytes, content_type = await nc_client.webdav.read_file(file_path)
# Use document processor registry to extract text
from nextcloud_mcp_server.document_processors import get_registry
registry = get_registry()
try:
result = await registry.process(
content=content_bytes,
content_type=content_type,
filename=file_path,
)
content = result.text
file_metadata = result.metadata
title = file_metadata.get("title") or file_path.split("/")[-1]
etag = "" # WebDAV read_file doesn't return etag
except Exception as e:
logger.error(f"Failed to process file {file_path}: {e}")
raise
else:
raise ValueError(f"Unsupported doc_type: {doc_task.doc_type}")
@@ -231,7 +294,11 @@ async def _index_document(
chunk_size=settings.document_chunk_size,
overlap=settings.document_chunk_overlap,
)
chunks = chunker.chunk_text(content)
chunks = await chunker.chunk_text(content)
# Assign page numbers to chunks if page boundaries are available (PDFs)
if doc_task.doc_type == "file" and "page_boundaries" in file_metadata:
assign_page_numbers(chunks, file_metadata["page_boundaries"])
# Extract chunk texts for embedding
chunk_texts = [chunk.text for chunk in chunks]
@@ -242,7 +309,7 @@ async def _index_document(
# Generate sparse embeddings (BM25 for keyword matching)
bm25_service = get_bm25_service()
sparse_embeddings = bm25_service.encode_batch(chunk_texts)
sparse_embeddings = await bm25_service.encode_batch(chunk_texts)
# Prepare Qdrant points
indexed_at = int(time.time())
@@ -277,6 +344,22 @@ async def _index_document(
"chunk_start_offset": chunk.start_offset,
"chunk_end_offset": chunk.end_offset,
"metadata_version": 2, # v2 includes position metadata
# File-specific metadata (PDF, etc.)
**(
{
"file_path": doc_task.doc_id,
"mime_type": file_metadata.get("content_type", ""),
"file_size": file_metadata.get("file_size"),
"page_number": chunk.page_number,
"page_count": file_metadata.get("page_count"),
"author": file_metadata.get("author"),
"creation_date": file_metadata.get("creation_date"),
"has_images": file_metadata.get("has_images", False),
"image_count": file_metadata.get("image_count", 0),
}
if doc_task.doc_type == "file"
else {}
),
},
)
)