fix: Add async/await, PDF metadata, and type safety fixes
This commit addresses multiple issues with async operations, PDF metadata extraction, and type safety in document processing and search.

## Async/Await Fixes
- processor.py:259 - Added await for chunker.chunk_text(content)
- processor.py:270 - Added await for bm25_service.encode_batch(chunk_texts)
- tests/unit/test_document_chunker.py - Converted all 12 test methods to async

## PDF Metadata Enhancement
- pymupdf.py:143 - Added file_size metadata extraction
- pymupdf.py:145-206 - Refactored to extract text page-by-page
  - Manually loop through pages instead of using page_chunks=True
  - Generate page_boundaries metadata for precise page tracking
  - Works around pymupdf.layout.activate() breaking page_chunks=True
- processor.py:32-66 - Added assign_page_numbers() helper function
  - Assigns page numbers to chunks based on overlap with page boundaries
  - Handles chunks spanning multiple pages
- processor.py:298-300 - Call assign_page_numbers() for PDF files

## Type Safety Fixes
- bm25_hybrid.py:184 - Removed int() conversion of doc_id
- semantic.py:131 - Removed int() conversion of doc_id
- viz_routes.py:275 - Removed int() conversion of doc_id
- Added comments documenting that doc_id can be int (notes) or str (file paths)

## Testing
- All 18 tests passing (12 unit + 6 integration)
- No type errors in modified files
- Container logs show successful processing
- Vector viz searches working correctly

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -29,6 +29,43 @@ from nextcloud_mcp_server.vector.scanner import DocumentTask
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def assign_page_numbers(chunks, page_boundaries) -> None:
    """Assign a page number to each chunk based on page boundaries.

    Each chunk is assigned the page that contains the largest share of its
    characters. When a chunk overlaps several pages equally, the page listed
    first in ``page_boundaries`` wins.

    A chunk that has no positive overlap with any page (a zero-width chunk,
    or one that sits entirely in a gap between two page spans) falls back to
    the page with the greatest ``start_offset`` that is still at or before
    the chunk's ``start_offset``. A chunk that starts before every page is
    left untouched.

    Args:
        chunks: Iterable of objects exposing ``start_offset`` and
            ``end_offset`` and accepting assignment to ``page_number``
            (e.g. ChunkWithPosition objects).
        page_boundaries: List of dicts with the keys ``page``,
            ``start_offset`` and ``end_offset``. May be empty or None, in
            which case nothing is done.

    Returns:
        None (modifies chunks in place).
    """
    if not page_boundaries:
        return

    for chunk in chunks:
        chunk_start = chunk.start_offset
        chunk_end = chunk.end_offset

        best_page = None
        best_overlap = 0
        for boundary in page_boundaries:
            # Length of the intersection of the chunk span and the page span;
            # zero or negative means the two spans do not share characters.
            overlap = min(chunk_end, boundary["end_offset"]) - max(
                chunk_start, boundary["start_offset"]
            )
            # Strict ">" keeps the earliest-listed page when overlaps tie.
            if overlap > best_overlap:
                best_overlap = overlap
                best_page = boundary["page"]

        if best_page is None:
            # No page shares characters with this chunk. Fall back to the
            # nearest page that starts at or before the chunk so it still
            # carries a usable page reference.
            preceding = [
                boundary
                for boundary in page_boundaries
                if boundary["start_offset"] <= chunk_start
            ]
            if preceding:
                nearest = max(preceding, key=lambda b: b["start_offset"])
                best_page = nearest["page"]

        if best_page is not None:
            chunk.page_number = best_page
|
||||
|
||||
|
||||
async def processor_task(
|
||||
worker_id: int,
|
||||
receive_stream: MemoryObjectReceiveStream[DocumentTask],
|
||||
@@ -223,6 +260,32 @@ async def _index_document(
|
||||
content = f"{document['title']}\n\n{document['content']}"
|
||||
title = document["title"]
|
||||
etag = document.get("etag", "")
|
||||
file_metadata = {} # No file-specific metadata for notes
|
||||
elif doc_task.doc_type == "file":
|
||||
# For files, doc_id is the file path
|
||||
file_path = doc_task.doc_id
|
||||
|
||||
# Read file content via WebDAV
|
||||
content_bytes, content_type = await nc_client.webdav.read_file(file_path)
|
||||
|
||||
# Use document processor registry to extract text
|
||||
from nextcloud_mcp_server.document_processors import get_registry
|
||||
|
||||
registry = get_registry()
|
||||
|
||||
try:
|
||||
result = await registry.process(
|
||||
content=content_bytes,
|
||||
content_type=content_type,
|
||||
filename=file_path,
|
||||
)
|
||||
content = result.text
|
||||
file_metadata = result.metadata
|
||||
title = file_metadata.get("title") or file_path.split("/")[-1]
|
||||
etag = "" # WebDAV read_file doesn't return etag
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to process file {file_path}: {e}")
|
||||
raise
|
||||
else:
|
||||
raise ValueError(f"Unsupported doc_type: {doc_task.doc_type}")
|
||||
|
||||
@@ -231,7 +294,11 @@ async def _index_document(
|
||||
chunk_size=settings.document_chunk_size,
|
||||
overlap=settings.document_chunk_overlap,
|
||||
)
|
||||
chunks = chunker.chunk_text(content)
|
||||
chunks = await chunker.chunk_text(content)
|
||||
|
||||
# Assign page numbers to chunks if page boundaries are available (PDFs)
|
||||
if doc_task.doc_type == "file" and "page_boundaries" in file_metadata:
|
||||
assign_page_numbers(chunks, file_metadata["page_boundaries"])
|
||||
|
||||
# Extract chunk texts for embedding
|
||||
chunk_texts = [chunk.text for chunk in chunks]
|
||||
@@ -242,7 +309,7 @@ async def _index_document(
|
||||
|
||||
# Generate sparse embeddings (BM25 for keyword matching)
|
||||
bm25_service = get_bm25_service()
|
||||
sparse_embeddings = bm25_service.encode_batch(chunk_texts)
|
||||
sparse_embeddings = await bm25_service.encode_batch(chunk_texts)
|
||||
|
||||
# Prepare Qdrant points
|
||||
indexed_at = int(time.time())
|
||||
@@ -277,6 +344,22 @@ async def _index_document(
|
||||
"chunk_start_offset": chunk.start_offset,
|
||||
"chunk_end_offset": chunk.end_offset,
|
||||
"metadata_version": 2, # v2 includes position metadata
|
||||
# File-specific metadata (PDF, etc.)
|
||||
**(
|
||||
{
|
||||
"file_path": doc_task.doc_id,
|
||||
"mime_type": file_metadata.get("content_type", ""),
|
||||
"file_size": file_metadata.get("file_size"),
|
||||
"page_number": chunk.page_number,
|
||||
"page_count": file_metadata.get("page_count"),
|
||||
"author": file_metadata.get("author"),
|
||||
"creation_date": file_metadata.get("creation_date"),
|
||||
"has_images": file_metadata.get("has_images", False),
|
||||
"image_count": file_metadata.get("image_count", 0),
|
||||
}
|
||||
if doc_task.doc_type == "file"
|
||||
else {}
|
||||
),
|
||||
},
|
||||
)
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user