diff --git a/nextcloud_mcp_server/search/context.py b/nextcloud_mcp_server/search/context.py
index bc1b8e9..4477e43 100644
--- a/nextcloud_mcp_server/search/context.py
+++ b/nextcloud_mcp_server/search/context.py
@@ -247,19 +247,32 @@ async def _fetch_document_text(
if is_pdf:
# Extract text from PDF using PyMuPDF
- import fitz # PyMuPDF
+ # IMPORTANT: Use pymupdf4llm.to_markdown() to match indexing extraction
+ # This ensures character offsets align between indexed chunks and retrieval
+ import pymupdf
+ import pymupdf4llm
logger.debug(f"Extracting text from PDF: {file_path}")
- pdf_doc = fitz.open(stream=file_content, filetype="pdf")
+ pdf_doc = pymupdf.open(stream=file_content, filetype="pdf")
text_parts = []
- for page in pdf_doc:
- text_parts.append(page.get_text())
+
+ # Extract each page as markdown (same as indexing)
+ for page_num in range(pdf_doc.page_count):
+ page_md = pymupdf4llm.to_markdown(
+ pdf_doc,
+ pages=[page_num],
+ write_images=False, # Don't need images for context
+ page_chunks=False,
+ )
+ text_parts.append(page_md)
+
pdf_doc.close()
- full_text = "\n".join(text_parts)
+ # Join pages (no separator - matches indexing)
+ full_text = "".join(text_parts)
logger.debug(
f"Extracted {len(full_text)} characters from "
- f"{len(text_parts)} pages in {file_path}"
+ f"{pdf_doc.page_count} pages in {file_path}"
)
return full_text
else:
diff --git a/nextcloud_mcp_server/vector/processor.py b/nextcloud_mcp_server/vector/processor.py
index 63b2190..74aed47 100644
--- a/nextcloud_mcp_server/vector/processor.py
+++ b/nextcloud_mcp_server/vector/processor.py
@@ -288,6 +288,30 @@ async def _index_document(
file_metadata = result.metadata
title = file_metadata.get("title") or file_path.split("/")[-1]
etag = "" # WebDAV read_file doesn't return etag
+
+ # Diagnostic: Log page boundary information if available
+ if "page_boundaries" in file_metadata:
+ page_boundaries = file_metadata["page_boundaries"]
+ logger.info(
+ f"Page boundaries for {file_path}: "
+ f"{len(page_boundaries)} pages, text length: {len(content)}"
+ )
+ # Log first 3 page boundaries for debugging
+ for boundary in page_boundaries[:3]:
+ logger.debug(
+ f" Page {boundary['page']}: "
+ f"offsets [{boundary['start_offset']}:{boundary['end_offset']}]"
+ )
+ # Verify last boundary matches text length
+ if page_boundaries:
+ last_boundary = page_boundaries[-1]
+ if last_boundary["end_offset"] != len(content):
+ logger.warning(
+ f"Text length mismatch: content={len(content)}, "
+ f"last_boundary_end={last_boundary['end_offset']}"
+ )
+ else:
+ logger.debug(f"No page_boundaries in metadata for {file_path}")
except Exception as e:
logger.error(f"Failed to process file {file_path}: {e}")
raise
@@ -305,6 +329,31 @@ async def _index_document(
if doc_task.doc_type == "file" and "page_boundaries" in file_metadata:
assign_page_numbers(chunks, file_metadata["page_boundaries"])
+ # Diagnostic: Verify page number assignment
+ assigned_count = sum(1 for c in chunks if c.page_number is not None)
+ logger.info(
+ f"Assigned page numbers to {assigned_count}/{len(chunks)} chunks "
+ f"for {file_path}"
+ )
+
+ # Log first 3 chunks to see their page assignments
+ for i, chunk in enumerate(chunks[:3]):
+ logger.debug(
+ f" Chunk {i}: page={chunk.page_number}, "
+ f"offsets=[{chunk.start_offset}:{chunk.end_offset}]"
+ )
+
+ # Warning if NO page numbers were assigned
+ if assigned_count == 0:
+ logger.warning(
+ f"NO page numbers assigned! "
+ f"Text length: {len(content)}, "
+ f"Chunks: {len(chunks)}, "
+ f"Chunk offset range: [{chunks[0].start_offset}:{chunks[-1].end_offset}], "
+ f"Page boundaries: {len(file_metadata['page_boundaries'])} pages, "
+ f"First boundary: {file_metadata['page_boundaries'][0] if file_metadata['page_boundaries'] else 'None'}"
+ )
+
# Extract chunk texts for embedding
chunk_texts = [chunk.text for chunk in chunks]
@@ -353,7 +402,7 @@ async def _index_document(
**(
{
"file_path": file_path, # Store file path for retrieval
- "mime_type": file_metadata.get("content_type", ""),
+ "mime_type": content_type, # From WebDAV response
"file_size": file_metadata.get("file_size"),
"page_number": chunk.page_number,
"page_count": file_metadata.get("page_count"),
diff --git a/tests/unit/client/test_webdav.py b/tests/unit/client/test_webdav.py
new file mode 100644
index 0000000..be7a190
--- /dev/null
+++ b/tests/unit/client/test_webdav.py
@@ -0,0 +1,119 @@
+"""Unit tests for WebDAV client."""
+
+from unittest.mock import AsyncMock
+
+import pytest
+
+from nextcloud_mcp_server.client.webdav import WebDAVClient
+
+
+@pytest.mark.unit
+async def test_find_by_tag_calls_search_files(mocker):
+ """Test that find_by_tag constructs correct search query."""
+ # Create mock HTTP client
+ mock_http_client = AsyncMock()
+
+ # Create WebDAVClient instance
+ client = WebDAVClient(mock_http_client, "testuser")
+
+ # Mock the search_files method to avoid actual HTTP calls
+ mock_search_files = mocker.patch.object(client, "search_files", return_value=[])
+
+ # Call find_by_tag
+ await client.find_by_tag("vector-index")
+
+ # Verify search_files was called with correct parameters
+ mock_search_files.assert_called_once()
+ call_args = mock_search_files.call_args
+
+ # Check that the where_conditions contains the tag name
+ assert "vector-index" in call_args.kwargs["where_conditions"]
+ assert "" in call_args.kwargs["where_conditions"]
+ assert "" in call_args.kwargs["where_conditions"]
+
+ # Check that tags property is requested
+ assert "tags" in call_args.kwargs["properties"]
+
+
+@pytest.mark.unit
+async def test_find_by_tag_with_scope_and_limit(mocker):
+ """Test find_by_tag passes scope and limit parameters."""
+ mock_http_client = AsyncMock()
+ client = WebDAVClient(mock_http_client, "testuser")
+
+ mock_search_files = mocker.patch.object(client, "search_files", return_value=[])
+
+ # Call with scope and limit
+ await client.find_by_tag("test-tag", scope="Documents", limit=10)
+
+ # Verify parameters were passed through
+ call_args = mock_search_files.call_args
+ assert call_args.kwargs["scope"] == "Documents"
+ assert call_args.kwargs["limit"] == 10
+
+
+@pytest.mark.unit
+def test_parse_search_response_with_tags(mocker):
+ """Test that _parse_search_response correctly parses tags."""
+ mock_http_client = AsyncMock()
+ client = WebDAVClient(mock_http_client, "testuser")
+
+ # Mock XML response with tags (comma-separated format)
+ xml_content = b"""
+
+
+ /remote.php/dav/files/testuser/Documents/test.pdf
+
+
+ test.pdf
+ application/pdf
+ 1024
+ "abc123"
+ 12345
+ vector-index,important
+
+
+
+
+ """
+
+ # Parse the response
+ results = client._parse_search_response(xml_content, scope="Documents")
+
+ # Verify tags were parsed correctly
+ assert len(results) == 1
+ assert "tags" in results[0]
+ assert results[0]["tags"] == ["vector-index", "important"]
+ assert results[0]["name"] == "test.pdf"
+ assert results[0]["content_type"] == "application/pdf"
+
+
+@pytest.mark.unit
+def test_parse_search_response_with_empty_tags(mocker):
+ """Test that _parse_search_response handles files without tags."""
+ mock_http_client = AsyncMock()
+ client = WebDAVClient(mock_http_client, "testuser")
+
+ # Mock XML response without tags
+ xml_content = b"""
+
+
+ /remote.php/dav/files/testuser/Documents/test.txt
+
+
+ test.txt
+ text/plain
+
+
+
+
+
+ """
+
+ # Parse the response
+ results = client._parse_search_response(xml_content, scope="Documents")
+
+ # Verify tags field is empty list
+ assert len(results) == 1
+ assert "tags" in results[0]
+ assert results[0]["tags"] == []