fix: Align PDF text extraction between indexing and context expansion
This commit fixes two critical issues with PDF processing:
1. **Text extraction mismatch (context expansion bug)**:
- Indexing used pymupdf4llm.to_markdown() producing markdown text
- Context expansion used page.get_text() producing plain text
- Different text formats caused character offset misalignment
- Search would find correct chunk, but expansion showed wrong section
- Fixed by making context.py use pymupdf4llm.to_markdown() consistently
2. **Diagnostic logging for page number assignment**:
- Added logging to verify page_boundaries exist in metadata
- Added logging to verify assign_page_numbers() assigns values
- Helps diagnose why page numbers show as null in search results
3. **mime_type storage bug**:
- Fixed incorrect field reference in processor.py:405
- Was using file_metadata.get("content_type", "")
- Should use content_type from WebDAV response
Changes:
- nextcloud_mcp_server/search/context.py: Use pymupdf4llm.to_markdown()
for PDF text extraction to match indexing method
- nextcloud_mcp_server/vector/processor.py: Add diagnostic logging for
page boundaries and assignment, fix mime_type storage
- tests/unit/client/test_webdav.py: Fix import sorting
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -247,19 +247,32 @@ async def _fetch_document_text(
|
||||
|
||||
if is_pdf:
|
||||
# Extract text from PDF using PyMuPDF
|
||||
import fitz # PyMuPDF
|
||||
# IMPORTANT: Use pymupdf4llm.to_markdown() to match indexing extraction
|
||||
# This ensures character offsets align between indexed chunks and retrieval
|
||||
import pymupdf
|
||||
import pymupdf4llm
|
||||
|
||||
logger.debug(f"Extracting text from PDF: {file_path}")
|
||||
pdf_doc = fitz.open(stream=file_content, filetype="pdf")
|
||||
pdf_doc = pymupdf.open(stream=file_content, filetype="pdf")
|
||||
text_parts = []
|
||||
for page in pdf_doc:
|
||||
text_parts.append(page.get_text())
|
||||
|
||||
# Extract each page as markdown (same as indexing)
|
||||
for page_num in range(pdf_doc.page_count):
|
||||
page_md = pymupdf4llm.to_markdown(
|
||||
pdf_doc,
|
||||
pages=[page_num],
|
||||
write_images=False, # Don't need images for context
|
||||
page_chunks=False,
|
||||
)
|
||||
text_parts.append(page_md)
|
||||
|
||||
pdf_doc.close()
|
||||
|
||||
full_text = "\n".join(text_parts)
|
||||
# Join pages (no separator - matches indexing)
|
||||
full_text = "".join(text_parts)
|
||||
logger.debug(
|
||||
f"Extracted {len(full_text)} characters from "
|
||||
f"{len(text_parts)} pages in {file_path}"
|
||||
f"{pdf_doc.page_count} pages in {file_path}"
|
||||
)
|
||||
return full_text
|
||||
else:
|
||||
|
||||
@@ -288,6 +288,30 @@ async def _index_document(
|
||||
file_metadata = result.metadata
|
||||
title = file_metadata.get("title") or file_path.split("/")[-1]
|
||||
etag = "" # WebDAV read_file doesn't return etag
|
||||
|
||||
# Diagnostic: Log page boundary information if available
|
||||
if "page_boundaries" in file_metadata:
|
||||
page_boundaries = file_metadata["page_boundaries"]
|
||||
logger.info(
|
||||
f"Page boundaries for {file_path}: "
|
||||
f"{len(page_boundaries)} pages, text length: {len(content)}"
|
||||
)
|
||||
# Log first 3 page boundaries for debugging
|
||||
for boundary in page_boundaries[:3]:
|
||||
logger.debug(
|
||||
f" Page {boundary['page']}: "
|
||||
f"offsets [{boundary['start_offset']}:{boundary['end_offset']}]"
|
||||
)
|
||||
# Verify last boundary matches text length
|
||||
if page_boundaries:
|
||||
last_boundary = page_boundaries[-1]
|
||||
if last_boundary["end_offset"] != len(content):
|
||||
logger.warning(
|
||||
f"Text length mismatch: content={len(content)}, "
|
||||
f"last_boundary_end={last_boundary['end_offset']}"
|
||||
)
|
||||
else:
|
||||
logger.debug(f"No page_boundaries in metadata for {file_path}")
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to process file {file_path}: {e}")
|
||||
raise
|
||||
@@ -305,6 +329,31 @@ async def _index_document(
|
||||
if doc_task.doc_type == "file" and "page_boundaries" in file_metadata:
|
||||
assign_page_numbers(chunks, file_metadata["page_boundaries"])
|
||||
|
||||
# Diagnostic: Verify page number assignment
|
||||
assigned_count = sum(1 for c in chunks if c.page_number is not None)
|
||||
logger.info(
|
||||
f"Assigned page numbers to {assigned_count}/{len(chunks)} chunks "
|
||||
f"for {file_path}"
|
||||
)
|
||||
|
||||
# Log first 3 chunks to see their page assignments
|
||||
for i, chunk in enumerate(chunks[:3]):
|
||||
logger.debug(
|
||||
f" Chunk {i}: page={chunk.page_number}, "
|
||||
f"offsets=[{chunk.start_offset}:{chunk.end_offset}]"
|
||||
)
|
||||
|
||||
# Warning if NO page numbers were assigned
|
||||
if assigned_count == 0:
|
||||
logger.warning(
|
||||
f"NO page numbers assigned! "
|
||||
f"Text length: {len(content)}, "
|
||||
f"Chunks: {len(chunks)}, "
|
||||
f"Chunk offset range: [{chunks[0].start_offset}:{chunks[-1].end_offset}], "
|
||||
f"Page boundaries: {len(file_metadata['page_boundaries'])} pages, "
|
||||
f"First boundary: {file_metadata['page_boundaries'][0] if file_metadata['page_boundaries'] else 'None'}"
|
||||
)
|
||||
|
||||
# Extract chunk texts for embedding
|
||||
chunk_texts = [chunk.text for chunk in chunks]
|
||||
|
||||
@@ -353,7 +402,7 @@ async def _index_document(
|
||||
**(
|
||||
{
|
||||
"file_path": file_path, # Store file path for retrieval
|
||||
"mime_type": file_metadata.get("content_type", ""),
|
||||
"mime_type": content_type, # From WebDAV response
|
||||
"file_size": file_metadata.get("file_size"),
|
||||
"page_number": chunk.page_number,
|
||||
"page_count": file_metadata.get("page_count"),
|
||||
|
||||
@@ -0,0 +1,119 @@
|
||||
"""Unit tests for WebDAV client."""
|
||||
|
||||
from unittest.mock import AsyncMock
|
||||
|
||||
import pytest
|
||||
|
||||
from nextcloud_mcp_server.client.webdav import WebDAVClient
|
||||
|
||||
|
||||
@pytest.mark.unit
|
||||
async def test_find_by_tag_calls_search_files(mocker):
|
||||
"""Test that find_by_tag constructs correct search query."""
|
||||
# Create mock HTTP client
|
||||
mock_http_client = AsyncMock()
|
||||
|
||||
# Create WebDAVClient instance
|
||||
client = WebDAVClient(mock_http_client, "testuser")
|
||||
|
||||
# Mock the search_files method to avoid actual HTTP calls
|
||||
mock_search_files = mocker.patch.object(client, "search_files", return_value=[])
|
||||
|
||||
# Call find_by_tag
|
||||
await client.find_by_tag("vector-index")
|
||||
|
||||
# Verify search_files was called with correct parameters
|
||||
mock_search_files.assert_called_once()
|
||||
call_args = mock_search_files.call_args
|
||||
|
||||
# Check that the where_conditions contains the tag name
|
||||
assert "vector-index" in call_args.kwargs["where_conditions"]
|
||||
assert "<oc:tags/>" in call_args.kwargs["where_conditions"]
|
||||
assert "<d:like>" in call_args.kwargs["where_conditions"]
|
||||
|
||||
# Check that tags property is requested
|
||||
assert "tags" in call_args.kwargs["properties"]
|
||||
|
||||
|
||||
@pytest.mark.unit
|
||||
async def test_find_by_tag_with_scope_and_limit(mocker):
|
||||
"""Test find_by_tag passes scope and limit parameters."""
|
||||
mock_http_client = AsyncMock()
|
||||
client = WebDAVClient(mock_http_client, "testuser")
|
||||
|
||||
mock_search_files = mocker.patch.object(client, "search_files", return_value=[])
|
||||
|
||||
# Call with scope and limit
|
||||
await client.find_by_tag("test-tag", scope="Documents", limit=10)
|
||||
|
||||
# Verify parameters were passed through
|
||||
call_args = mock_search_files.call_args
|
||||
assert call_args.kwargs["scope"] == "Documents"
|
||||
assert call_args.kwargs["limit"] == 10
|
||||
|
||||
|
||||
@pytest.mark.unit
|
||||
def test_parse_search_response_with_tags(mocker):
|
||||
"""Test that _parse_search_response correctly parses tags."""
|
||||
mock_http_client = AsyncMock()
|
||||
client = WebDAVClient(mock_http_client, "testuser")
|
||||
|
||||
# Mock XML response with tags (comma-separated format)
|
||||
xml_content = b"""<?xml version="1.0"?>
|
||||
<d:multistatus xmlns:d="DAV:" xmlns:oc="http://owncloud.org/ns">
|
||||
<d:response>
|
||||
<d:href>/remote.php/dav/files/testuser/Documents/test.pdf</d:href>
|
||||
<d:propstat>
|
||||
<d:prop>
|
||||
<d:displayname>test.pdf</d:displayname>
|
||||
<d:getcontenttype>application/pdf</d:getcontenttype>
|
||||
<d:getcontentlength>1024</d:getcontentlength>
|
||||
<d:getetag>"abc123"</d:getetag>
|
||||
<oc:fileid>12345</oc:fileid>
|
||||
<oc:tags>vector-index,important</oc:tags>
|
||||
<d:resourcetype/>
|
||||
</d:prop>
|
||||
</d:propstat>
|
||||
</d:response>
|
||||
</d:multistatus>"""
|
||||
|
||||
# Parse the response
|
||||
results = client._parse_search_response(xml_content, scope="Documents")
|
||||
|
||||
# Verify tags were parsed correctly
|
||||
assert len(results) == 1
|
||||
assert "tags" in results[0]
|
||||
assert results[0]["tags"] == ["vector-index", "important"]
|
||||
assert results[0]["name"] == "test.pdf"
|
||||
assert results[0]["content_type"] == "application/pdf"
|
||||
|
||||
|
||||
@pytest.mark.unit
|
||||
def test_parse_search_response_with_empty_tags(mocker):
|
||||
"""Test that _parse_search_response handles files without tags."""
|
||||
mock_http_client = AsyncMock()
|
||||
client = WebDAVClient(mock_http_client, "testuser")
|
||||
|
||||
# Mock XML response without tags
|
||||
xml_content = b"""<?xml version="1.0"?>
|
||||
<d:multistatus xmlns:d="DAV:" xmlns:oc="http://owncloud.org/ns">
|
||||
<d:response>
|
||||
<d:href>/remote.php/dav/files/testuser/Documents/test.txt</d:href>
|
||||
<d:propstat>
|
||||
<d:prop>
|
||||
<d:displayname>test.txt</d:displayname>
|
||||
<d:getcontenttype>text/plain</d:getcontenttype>
|
||||
<oc:tags/>
|
||||
<d:resourcetype/>
|
||||
</d:prop>
|
||||
</d:propstat>
|
||||
</d:response>
|
||||
</d:multistatus>"""
|
||||
|
||||
# Parse the response
|
||||
results = client._parse_search_response(xml_content, scope="Documents")
|
||||
|
||||
# Verify tags field is empty list
|
||||
assert len(results) == 1
|
||||
assert "tags" in results[0]
|
||||
assert results[0]["tags"] == []
|
||||
Reference in New Issue
Block a user