From d67aa6ae5cf1dd41b0d7cf3bd3aabafc38fab216 Mon Sep 17 00:00:00 2001 From: Chris Coutinho Date: Thu, 20 Nov 2025 13:57:50 +0100 Subject: [PATCH] fix: Align PDF text extraction between indexing and context expansion MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit fixes two critical issues with PDF processing: 1. **Text extraction mismatch (context expansion bug)**: - Indexing used pymupdf4llm.to_markdown() producing markdown text - Context expansion used page.get_text() producing plain text - Different text formats caused character offset misalignment - Search would find correct chunk, but expansion showed wrong section - Fixed by making context.py use pymupdf4llm.to_markdown() consistently 2. **Diagnostic logging for page number assignment**: - Added logging to verify page_boundaries exist in metadata - Added logging to verify assign_page_numbers() assigns values - Helps diagnose why page numbers show as null in search results 3. **mime_type storage bug**: - Fixed incorrect field reference in processor.py:405 - Was using file_metadata.get("content_type", "") - Should use content_type from WebDAV response Changes: - nextcloud_mcp_server/search/context.py: Use pymupdf4llm.to_markdown() for PDF text extraction to match indexing method - nextcloud_mcp_server/vector/processor.py: Add diagnostic logging for page boundaries and assignment, fix mime_type storage - tests/unit/client/test_webdav.py: Fix import sorting 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- nextcloud_mcp_server/search/context.py | 25 +++-- nextcloud_mcp_server/vector/processor.py | 51 +++++++++- tests/unit/client/test_webdav.py | 119 +++++++++++++++++++++++ 3 files changed, 188 insertions(+), 7 deletions(-) create mode 100644 tests/unit/client/test_webdav.py diff --git a/nextcloud_mcp_server/search/context.py b/nextcloud_mcp_server/search/context.py index bc1b8e9..4477e43 100644 --- a/nextcloud_mcp_server/search/context.py +++ b/nextcloud_mcp_server/search/context.py @@ -247,19 +247,32 @@ async def _fetch_document_text( if is_pdf: # Extract text from PDF using PyMuPDF - import fitz # PyMuPDF + # IMPORTANT: Use pymupdf4llm.to_markdown() to match indexing extraction + # This ensures character offsets align between indexed chunks and retrieval + import pymupdf + import pymupdf4llm logger.debug(f"Extracting text from PDF: {file_path}") - pdf_doc = fitz.open(stream=file_content, filetype="pdf") + pdf_doc = pymupdf.open(stream=file_content, filetype="pdf") text_parts = [] - for page in pdf_doc: - text_parts.append(page.get_text()) + + # Extract each page as markdown (same as indexing) + for page_num in range(pdf_doc.page_count): + page_md = pymupdf4llm.to_markdown( + pdf_doc, + pages=[page_num], + write_images=False, # Don't need images for context + page_chunks=False, + ) + text_parts.append(page_md) + pdf_doc.close() - full_text = "\n".join(text_parts) + # Join pages (no separator - matches indexing) + full_text = "".join(text_parts) logger.debug( f"Extracted {len(full_text)} characters from " - f"{len(text_parts)} pages in {file_path}" + f"{pdf_doc.page_count} pages in {file_path}" ) return full_text else: diff --git a/nextcloud_mcp_server/vector/processor.py b/nextcloud_mcp_server/vector/processor.py index 63b2190..74aed47 100644 --- a/nextcloud_mcp_server/vector/processor.py +++ b/nextcloud_mcp_server/vector/processor.py @@ -288,6 +288,30 @@ async def _index_document( file_metadata = result.metadata title = file_metadata.get("title") or file_path.split("/")[-1] etag = "" # WebDAV read_file doesn't return etag + + # Diagnostic: Log page boundary information if available + if "page_boundaries" in file_metadata: + page_boundaries = file_metadata["page_boundaries"] + logger.info( + f"Page boundaries for {file_path}: " + f"{len(page_boundaries)} pages, text length: {len(content)}" + ) + # Log first 3 page boundaries for debugging + for boundary in page_boundaries[:3]: + logger.debug( + f" Page {boundary['page']}: " + f"offsets [{boundary['start_offset']}:{boundary['end_offset']}]" + ) + # Verify last boundary matches text length + if page_boundaries: + last_boundary = page_boundaries[-1] + if last_boundary["end_offset"] != len(content): + logger.warning( + f"Text length mismatch: content={len(content)}, " + f"last_boundary_end={last_boundary['end_offset']}" + ) + else: + logger.debug(f"No page_boundaries in metadata for {file_path}") except Exception as e: logger.error(f"Failed to process file {file_path}: {e}") raise @@ -305,6 +329,31 @@ async def _index_document( if doc_task.doc_type == "file" and "page_boundaries" in file_metadata: assign_page_numbers(chunks, file_metadata["page_boundaries"]) + # Diagnostic: Verify page number assignment + assigned_count = sum(1 for c in chunks if c.page_number is not None) + logger.info( + f"Assigned page numbers to {assigned_count}/{len(chunks)} chunks " + f"for {file_path}" + ) + + # Log first 3 chunks to see their page assignments + for i, chunk in enumerate(chunks[:3]): + logger.debug( + f" Chunk {i}: page={chunk.page_number}, " + f"offsets=[{chunk.start_offset}:{chunk.end_offset}]" + ) + + # Warning if NO page numbers were assigned + if assigned_count == 0: + logger.warning( + f"NO page numbers assigned! " + f"Text length: {len(content)}, " + f"Chunks: {len(chunks)}, " + f"Chunk offset range: [{chunks[0].start_offset}:{chunks[-1].end_offset}], " + f"Page boundaries: {len(file_metadata['page_boundaries'])} pages, " + f"First boundary: {file_metadata['page_boundaries'][0] if file_metadata['page_boundaries'] else 'None'}" + ) + # Extract chunk texts for embedding chunk_texts = [chunk.text for chunk in chunks] @@ -353,7 +402,7 @@ async def _index_document( **( { "file_path": file_path, # Store file path for retrieval - "mime_type": file_metadata.get("content_type", ""), + "mime_type": content_type, # From WebDAV response "file_size": file_metadata.get("file_size"), "page_number": chunk.page_number, "page_count": file_metadata.get("page_count"), diff --git a/tests/unit/client/test_webdav.py b/tests/unit/client/test_webdav.py new file mode 100644 index 0000000..be7a190 --- /dev/null +++ b/tests/unit/client/test_webdav.py @@ -0,0 +1,119 @@ +"""Unit tests for WebDAV client.""" + +from unittest.mock import AsyncMock + +import pytest + +from nextcloud_mcp_server.client.webdav import WebDAVClient + + +@pytest.mark.unit +async def test_find_by_tag_calls_search_files(mocker): + """Test that find_by_tag constructs correct search query.""" + # Create mock HTTP client + mock_http_client = AsyncMock() + + # Create WebDAVClient instance + client = WebDAVClient(mock_http_client, "testuser") + + # Mock the search_files method to avoid actual HTTP calls + mock_search_files = mocker.patch.object(client, "search_files", return_value=[]) + + # Call find_by_tag + await client.find_by_tag("vector-index") + + # Verify search_files was called with correct parameters + mock_search_files.assert_called_once() + call_args = mock_search_files.call_args + + # Check that the where_conditions contains the tag name + assert "vector-index" in call_args.kwargs["where_conditions"] + assert "" in call_args.kwargs["where_conditions"] + assert "" in call_args.kwargs["where_conditions"] + + # Check that tags property is requested + assert "tags" in call_args.kwargs["properties"] + + +@pytest.mark.unit +async def test_find_by_tag_with_scope_and_limit(mocker): + """Test find_by_tag passes scope and limit parameters.""" + mock_http_client = AsyncMock() + client = WebDAVClient(mock_http_client, "testuser") + + mock_search_files = mocker.patch.object(client, "search_files", return_value=[]) + + # Call with scope and limit + await client.find_by_tag("test-tag", scope="Documents", limit=10) + + # Verify parameters were passed through + call_args = mock_search_files.call_args + assert call_args.kwargs["scope"] == "Documents" + assert call_args.kwargs["limit"] == 10 + + +@pytest.mark.unit +def test_parse_search_response_with_tags(mocker): + """Test that _parse_search_response correctly parses tags.""" + mock_http_client = AsyncMock() + client = WebDAVClient(mock_http_client, "testuser") + + # Mock XML response with tags (comma-separated format) + xml_content = b""" + + + /remote.php/dav/files/testuser/Documents/test.pdf + + + test.pdf + application/pdf + 1024 + "abc123" + 12345 + vector-index,important + + + + + """ + + # Parse the response + results = client._parse_search_response(xml_content, scope="Documents") + + # Verify tags were parsed correctly + assert len(results) == 1 + assert "tags" in results[0] + assert results[0]["tags"] == ["vector-index", "important"] + assert results[0]["name"] == "test.pdf" + assert results[0]["content_type"] == "application/pdf" + + +@pytest.mark.unit +def test_parse_search_response_with_empty_tags(mocker): + """Test that _parse_search_response handles files without tags.""" + mock_http_client = AsyncMock() + client = WebDAVClient(mock_http_client, "testuser") + + # Mock XML response without tags + xml_content = b""" + + + /remote.php/dav/files/testuser/Documents/test.txt + + + test.txt + text/plain + + + + + + """ + + # Parse the response + results = client._parse_search_response(xml_content, scope="Documents") + + # Verify tags field is empty list + assert len(results) == 1 + assert "tags" in results[0] + assert results[0]["tags"] == []