fix: Align PDF text extraction between indexing and context expansion

This commit fixes two critical issues with PDF processing: 1. **Text extraction mismatch (context expansion bug)**: - Indexing used pymupdf4llm.to_markdown() producing markdown text - Context expansion used page.get_text() producing plain text - Different text formats caused character offset misalignment - Search would find correct chunk, but expansion showed wrong section - Fixed by making context.py use pymupdf4llm.to_markdown() consistently 2. **Diagnostic logging for page number assignment**: - Added logging to verify page_boundaries exist in metadata - Added logging to verify assign_page_numbers() assigns values - Helps diagnose why page numbers show as null in search results 3. **mime_type storage bug**: - Fixed incorrect field reference in processor.py:405 - Was using file_metadata.get("content_type", "") - Should use content_type from WebDAV response Changes: - nextcloud_mcp_server/search/context.py: Use pymupdf4llm.to_markdown() for PDF text extraction to match indexing method - nextcloud_mcp_server/vector/processor.py: Add diagnostic logging for page boundaries and assignment, fix mime_type storage - tests/unit/client/test_webdav.py: Fix import sorting 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-20 13:57:50 +01:00
parent f1a5fac1b9
commit d67aa6ae5c
3 changed files with 188 additions and 7 deletions
@@ -247,19 +247,32 @@ async def _fetch_document_text(

                if is_pdf:
                    # Extract text from PDF using PyMuPDF
-                    import fitz  # PyMuPDF
+                    # IMPORTANT: Use pymupdf4llm.to_markdown() to match indexing extraction
+                    # This ensures character offsets align between indexed chunks and retrieval
+                    import pymupdf
+                    import pymupdf4llm

                    logger.debug(f"Extracting text from PDF: {file_path}")
-                    pdf_doc = fitz.open(stream=file_content, filetype="pdf")
+                    pdf_doc = pymupdf.open(stream=file_content, filetype="pdf")
                    text_parts = []
-                    for page in pdf_doc:
-                        text_parts.append(page.get_text())
+
+                    # Extract each page as markdown (same as indexing)
+                    for page_num in range(pdf_doc.page_count):
+                        page_md = pymupdf4llm.to_markdown(
+                            pdf_doc,
+                            pages=[page_num],
+                            write_images=False,  # Don't need images for context
+                            page_chunks=False,
+                        )
+                        text_parts.append(page_md)
+
                    pdf_doc.close()

-                    full_text = "\n".join(text_parts)
+                    # Join pages (no separator - matches indexing)
+                    full_text = "".join(text_parts)
                    logger.debug(
                        f"Extracted {len(full_text)} characters from "
-                        f"{len(text_parts)} pages in {file_path}"
+                        f"{pdf_doc.page_count} pages in {file_path}"
                    )
                    return full_text
                else:
@@ -288,6 +288,30 @@ async def _index_document(
            file_metadata = result.metadata
            title = file_metadata.get("title") or file_path.split("/")[-1]
            etag = ""  # WebDAV read_file doesn't return etag
+
+            # Diagnostic: Log page boundary information if available
+            if "page_boundaries" in file_metadata:
+                page_boundaries = file_metadata["page_boundaries"]
+                logger.info(
+                    f"Page boundaries for {file_path}: "
+                    f"{len(page_boundaries)} pages, text length: {len(content)}"
+                )
+                # Log first 3 page boundaries for debugging
+                for boundary in page_boundaries[:3]:
+                    logger.debug(
+                        f"  Page {boundary['page']}: "
+                        f"offsets [{boundary['start_offset']}:{boundary['end_offset']}]"
+                    )
+                # Verify last boundary matches text length
+                if page_boundaries:
+                    last_boundary = page_boundaries[-1]
+                    if last_boundary["end_offset"] != len(content):
+                        logger.warning(
+                            f"Text length mismatch: content={len(content)}, "
+                            f"last_boundary_end={last_boundary['end_offset']}"
+                        )
+            else:
+                logger.debug(f"No page_boundaries in metadata for {file_path}")
        except Exception as e:
            logger.error(f"Failed to process file {file_path}: {e}")
            raise
@@ -305,6 +329,31 @@ async def _index_document(
    if doc_task.doc_type == "file" and "page_boundaries" in file_metadata:
        assign_page_numbers(chunks, file_metadata["page_boundaries"])

+        # Diagnostic: Verify page number assignment
+        assigned_count = sum(1 for c in chunks if c.page_number is not None)
+        logger.info(
+            f"Assigned page numbers to {assigned_count}/{len(chunks)} chunks "
+            f"for {file_path}"
+        )
+
+        # Log first 3 chunks to see their page assignments
+        for i, chunk in enumerate(chunks[:3]):
+            logger.debug(
+                f"  Chunk {i}: page={chunk.page_number}, "
+                f"offsets=[{chunk.start_offset}:{chunk.end_offset}]"
+            )
+
+        # Warning if NO page numbers were assigned
+        if assigned_count == 0:
+            logger.warning(
+                f"NO page numbers assigned! "
+                f"Text length: {len(content)}, "
+                f"Chunks: {len(chunks)}, "
+                f"Chunk offset range: [{chunks[0].start_offset}:{chunks[-1].end_offset}], "
+                f"Page boundaries: {len(file_metadata['page_boundaries'])} pages, "
+                f"First boundary: {file_metadata['page_boundaries'][0] if file_metadata['page_boundaries'] else 'None'}"
+            )
+
    # Extract chunk texts for embedding
    chunk_texts = [chunk.text for chunk in chunks]

@@ -353,7 +402,7 @@ async def _index_document(
                    **(
                        {
                            "file_path": file_path,  # Store file path for retrieval
-                            "mime_type": file_metadata.get("content_type", ""),
+                            "mime_type": content_type,  # From WebDAV response
                            "file_size": file_metadata.get("file_size"),
                            "page_number": chunk.page_number,
                            "page_count": file_metadata.get("page_count"),
@@ -0,0 +1,119 @@
+"""Unit tests for WebDAV client."""
+
+from unittest.mock import AsyncMock
+
+import pytest
+
+from nextcloud_mcp_server.client.webdav import WebDAVClient
+
+
+@pytest.mark.unit
+async def test_find_by_tag_calls_search_files(mocker):
+    """Test that find_by_tag constructs correct search query."""
+    # Create mock HTTP client
+    mock_http_client = AsyncMock()
+
+    # Create WebDAVClient instance
+    client = WebDAVClient(mock_http_client, "testuser")
+
+    # Mock the search_files method to avoid actual HTTP calls
+    mock_search_files = mocker.patch.object(client, "search_files", return_value=[])
+
+    # Call find_by_tag
+    await client.find_by_tag("vector-index")
+
+    # Verify search_files was called with correct parameters
+    mock_search_files.assert_called_once()
+    call_args = mock_search_files.call_args
+
+    # Check that the where_conditions contains the tag name
+    assert "vector-index" in call_args.kwargs["where_conditions"]
+    assert "<oc:tags/>" in call_args.kwargs["where_conditions"]
+    assert "<d:like>" in call_args.kwargs["where_conditions"]
+
+    # Check that tags property is requested
+    assert "tags" in call_args.kwargs["properties"]
+
+
+@pytest.mark.unit
+async def test_find_by_tag_with_scope_and_limit(mocker):
+    """Test find_by_tag passes scope and limit parameters."""
+    mock_http_client = AsyncMock()
+    client = WebDAVClient(mock_http_client, "testuser")
+
+    mock_search_files = mocker.patch.object(client, "search_files", return_value=[])
+
+    # Call with scope and limit
+    await client.find_by_tag("test-tag", scope="Documents", limit=10)
+
+    # Verify parameters were passed through
+    call_args = mock_search_files.call_args
+    assert call_args.kwargs["scope"] == "Documents"
+    assert call_args.kwargs["limit"] == 10
+
+
+@pytest.mark.unit
+def test_parse_search_response_with_tags(mocker):
+    """Test that _parse_search_response correctly parses tags."""
+    mock_http_client = AsyncMock()
+    client = WebDAVClient(mock_http_client, "testuser")
+
+    # Mock XML response with tags (comma-separated format)
+    xml_content = b"""<?xml version="1.0"?>
+    <d:multistatus xmlns:d="DAV:" xmlns:oc="http://owncloud.org/ns">
+        <d:response>
+            <d:href>/remote.php/dav/files/testuser/Documents/test.pdf</d:href>
+            <d:propstat>
+                <d:prop>
+                    <d:displayname>test.pdf</d:displayname>
+                    <d:getcontenttype>application/pdf</d:getcontenttype>
+                    <d:getcontentlength>1024</d:getcontentlength>
+                    <d:getetag>"abc123"</d:getetag>
+                    <oc:fileid>12345</oc:fileid>
+                    <oc:tags>vector-index,important</oc:tags>
+                    <d:resourcetype/>
+                </d:prop>
+            </d:propstat>
+        </d:response>
+    </d:multistatus>"""
+
+    # Parse the response
+    results = client._parse_search_response(xml_content, scope="Documents")
+
+    # Verify tags were parsed correctly
+    assert len(results) == 1
+    assert "tags" in results[0]
+    assert results[0]["tags"] == ["vector-index", "important"]
+    assert results[0]["name"] == "test.pdf"
+    assert results[0]["content_type"] == "application/pdf"
+
+
+@pytest.mark.unit
+def test_parse_search_response_with_empty_tags(mocker):
+    """Test that _parse_search_response handles files without tags."""
+    mock_http_client = AsyncMock()
+    client = WebDAVClient(mock_http_client, "testuser")
+
+    # Mock XML response without tags
+    xml_content = b"""<?xml version="1.0"?>
+    <d:multistatus xmlns:d="DAV:" xmlns:oc="http://owncloud.org/ns">
+        <d:response>
+            <d:href>/remote.php/dav/files/testuser/Documents/test.txt</d:href>
+            <d:propstat>
+                <d:prop>
+                    <d:displayname>test.txt</d:displayname>
+                    <d:getcontenttype>text/plain</d:getcontenttype>
+                    <oc:tags/>
+                    <d:resourcetype/>
+                </d:prop>
+            </d:propstat>
+        </d:response>
+    </d:multistatus>"""
+
+    # Parse the response
+    results = client._parse_search_response(xml_content, scope="Documents")
+
+    # Verify tags field is empty list
+    assert len(results) == 1
+    assert "tags" in results[0]
+    assert results[0]["tags"] == []