diff --git a/nextcloud_mcp_server/app.py b/nextcloud_mcp_server/app.py
index 56fd4f5..f532bd4 100644
--- a/nextcloud_mcp_server/app.py
+++ b/nextcloud_mcp_server/app.py
@@ -122,6 +122,26 @@ def initialize_document_processors():
except Exception as e:
logger.warning(f"Failed to register Tesseract processor: {e}")
+ # Register PyMuPDF processor (high priority, local, no API required)
+ if "pymupdf" in config["processors"]:
+ pymupdf_config = config["processors"]["pymupdf"]
+ try:
+ from nextcloud_mcp_server.document_processors.pymupdf import (
+ PyMuPDFProcessor,
+ )
+
+ processor = PyMuPDFProcessor(
+ extract_images=pymupdf_config.get("extract_images", True),
+ image_dir=pymupdf_config.get("image_dir"),
+ )
+ registry.register(processor, priority=15) # Higher than unstructured
+ logger.info(
+ f"Registered PyMuPDF processor: extract_images={pymupdf_config.get('extract_images', True)}"
+ )
+ registered_count += 1
+ except Exception as e:
+ logger.warning(f"Failed to register PyMuPDF processor: {e}")
+
# Register custom processor
if "custom" in config["processors"]:
custom_config = config["processors"]["custom"]
diff --git a/nextcloud_mcp_server/auth/viz_routes.py b/nextcloud_mcp_server/auth/viz_routes.py
index 3497084..e1f2e4e 100644
--- a/nextcloud_mcp_server/auth/viz_routes.py
+++ b/nextcloud_mcp_server/auth/viz_routes.py
@@ -272,7 +272,9 @@ async def vector_visualization_search(request: Request) -> JSONResponse:
doc_chunks = defaultdict(list)
for point in points:
if point.payload:
- doc_id = int(point.payload.get("doc_id", 0))
+ # doc_id can be int (for notes) or str (for files - file path)
+ # Keep original type instead of forcing to int
+ doc_id = point.payload.get("doc_id", 0)
vector = extract_dense_vector(point)
if vector is not None:
doc_chunks[doc_id].append(vector)
diff --git a/nextcloud_mcp_server/client/__init__.py b/nextcloud_mcp_server/client/__init__.py
index 29dfc36..d6e9f24 100644
--- a/nextcloud_mcp_server/client/__init__.py
+++ b/nextcloud_mcp_server/client/__init__.py
@@ -130,6 +130,62 @@ class NextcloudClient:
all_notes = self.notes.get_all_notes()
return await self._notes_search.search_notes(all_notes, query)
+ async def find_files_by_tag(
+     self, tag_name: str, mime_type_filter: str | None = None
+ ) -> list[dict]:
+     """Find files by system tag name, optionally filtered by MIME type.
+
+     This method coordinates tag lookup and file retrieval via WebDAV:
+     1. Look up the tag ID by name
+     2. Get all files with that tag (via REPORT with full metadata)
+     3. Optionally keep only files whose content_type starts with the filter
+
+     Args:
+         tag_name: Name of the system tag to search for (e.g., "vector-index")
+         mime_type_filter: Optional MIME type prefix (e.g., "application/pdf")
+
+     Returns:
+         List of file dictionaries with WebDAV properties (path, size, content_type, etc.)
+
+     Raises:
+         Exception: Errors from the WebDAV tag/file lookups propagate unchanged
+
+     Examples:
+         # Find all files with "vector-index" tag
+         files = await nc_client.find_files_by_tag("vector-index")
+
+         # Find only PDFs with the tag
+         pdfs = await nc_client.find_files_by_tag("vector-index", "application/pdf")
+     """
+     # Look up tag by name using WebDAV
+     tag = await self.webdav.get_tag_by_name(tag_name)
+     if not tag:
+         logger.debug(f"Tag '{tag_name}' not found, returning empty list")
+         return []
+
+     # Get files with this tag (returns full file info from REPORT)
+     files = await self.webdav.get_files_by_tag(tag["id"])
+     if not files:
+         logger.debug(f"No files found with tag '{tag_name}'")
+         return []
+
+     logger.debug(f"Found {len(files)} files with tag '{tag_name}'")
+
+     # Apply MIME type filter; `or ""` covers a content_type that is None
+     if mime_type_filter:
+         filtered_files = [
+             f
+             for f in files
+             if (f.get("content_type") or "").startswith(mime_type_filter)
+         ]
+         logger.info(
+             f"Returning {len(filtered_files)} files with tag '{tag_name}' (filtered by {mime_type_filter})"
+         )
+         return filtered_files
+
+     logger.info(f"Returning {len(files)} files with tag '{tag_name}'")
+     return files
+
def _get_webdav_base_path(self) -> str:
"""Helper to get the base WebDAV path for the authenticated user."""
return f"/remote.php/dav/files/{self.username}"
diff --git a/nextcloud_mcp_server/client/webdav.py b/nextcloud_mcp_server/client/webdav.py
index c877e38..05f27df 100644
--- a/nextcloud_mcp_server/client/webdav.py
+++ b/nextcloud_mcp_server/client/webdav.py
@@ -821,6 +821,20 @@ class WebDAVClient(BaseNextcloudClient):
item["file_id"] = int(value) if value else None
elif tag == "favorite":
item["is_favorite"] = value == "1"
+ elif tag == "tags":
+ # Tags can be comma-separated or have multiple child elements
+ if value:
+ # Handle comma-separated tags
+ item["tags"] = [
+ t.strip() for t in value.split(",") if t.strip()
+ ]
+ else:
+ # Check for child tag elements (alternative format)
+ tag_elements = child.findall(".//{http://owncloud.org/ns}tag")
+ if tag_elements:
+ item["tags"] = [t.text for t in tag_elements if t.text]
+ else:
+ item["tags"] = []
elif tag == "permissions":
item["permissions"] = value
elif tag == "size":
@@ -948,3 +962,336 @@ class WebDAVClient(BaseNextcloudClient):
properties=properties,
limit=limit,
)
+
+ async def find_by_tag(
+ self, tag_name: str, scope: str = "", limit: Optional[int] = None
+ ) -> List[Dict[str, Any]]:
+ """Find files by tag name.
+
+ DEPRECATED: Use NextcloudClient.find_files_by_tag() instead, which resolves
+ the system tag via WebDAV PROPFIND + REPORT rather than WebDAV SEARCH.
+
+ Args:
+ tag_name: Tag to filter by (e.g., "vector-index")
+ scope: Directory path to search in (empty string for user root)
+ limit: Maximum number of results to return
+
+ Returns:
+ List of files/directories with the specified tag
+
+ Examples:
+ # Find all files tagged with "vector-index"
+ results = await find_by_tag("vector-index")
+
+ # Find tagged files in a specific folder
+ results = await find_by_tag("vector-index", scope="Documents")
+ """
+ # Use LIKE for tag matching since tags can be comma-separated
+ where_conditions = f"""
+
+
+
+
+ %{tag_name}%
+
+ """
+
+ # Request tag property along with standard properties
+ properties = [
+ "displayname",
+ "getcontentlength",
+ "getcontenttype",
+ "getlastmodified",
+ "resourcetype",
+ "getetag",
+ "fileid",
+ "tags",
+ ]
+
+ return await self.search_files(
+ scope=scope,
+ where_conditions=where_conditions,
+ properties=properties,
+ limit=limit,
+ )
+
+ async def _get_file_info_by_id(self, file_id: int) -> Dict[str, Any]:
+ """Get file information by Nextcloud file ID using WebDAV.
+
+ Args:
+ file_id: Nextcloud internal file ID
+
+ Returns:
+ File information dictionary with path, size, content_type, etc.
+
+ Raises:
+ HTTPStatusError: If file not found or request fails
+ """
+ # Nextcloud allows accessing files by ID via special meta endpoint
+ meta_path = f"/remote.php/dav/meta/{file_id}/"
+
+ propfind_body = """
+
+
+
+
+
+
+
+
+
+
+ """
+
+ headers = {"Depth": "0", "Content-Type": "text/xml", "OCS-APIRequest": "true"}
+
+ response = await self._make_request(
+ "PROPFIND", meta_path, content=propfind_body, headers=headers
+ )
+ response.raise_for_status()
+
+ # Parse the XML response
+ root = ET.fromstring(response.content)
+ responses = root.findall(".//{DAV:}response")
+
+ if not responses:
+ raise RuntimeError(f"File ID {file_id} not found")
+
+ response_elem = responses[0]
+ href = response_elem.find(".//{DAV:}href")
+ if href is None:
+ raise RuntimeError(f"No href in response for file ID {file_id}")
+
+ propstat = response_elem.find(".//{DAV:}propstat")
+ if propstat is None:
+ raise RuntimeError(f"No propstat for file ID {file_id}")
+
+ prop = propstat.find(".//{DAV:}prop")
+ if prop is None:
+ raise RuntimeError(f"No prop for file ID {file_id}")
+
+ # Extract file path from displayname or construct from file ID
+ displayname_elem = prop.find(".//{DAV:}displayname")
+ name = (
+ displayname_elem.text if displayname_elem is not None else f"file_{file_id}"
+ )
+
+ # Get file properties
+ size_elem = prop.find(".//{DAV:}getcontentlength")
+ size = int(size_elem.text) if size_elem is not None and size_elem.text else 0
+
+ content_type_elem = prop.find(".//{DAV:}getcontenttype")
+ content_type = content_type_elem.text if content_type_elem is not None else None
+
+ modified_elem = prop.find(".//{DAV:}getlastmodified")
+ modified = modified_elem.text if modified_elem is not None else None
+
+ etag_elem = prop.find(".//{DAV:}getetag")
+ etag = (
+ etag_elem.text.strip('"')
+ if etag_elem is not None and etag_elem.text
+ else None
+ )
+
+ # Check if it's a directory
+ resourcetype = prop.find(".//{DAV:}resourcetype")
+ is_directory = (
+ resourcetype is not None
+ and resourcetype.find(".//{DAV:}collection") is not None
+ )
+
+ # Try to get actual file path - meta endpoint doesn't give us the real path
+ # so we'll construct a reasonable path from the name
+ # The calling code in NextcloudClient will have the context to determine the actual path
+ file_info = {
+ "name": name,
+ "path": f"/{name}", # Placeholder - caller should use WebDAV to get real path if needed
+ "size": size,
+ "content_type": content_type,
+ "last_modified": modified,
+ "etag": etag,
+ "is_directory": is_directory,
+ "file_id": file_id,
+ }
+
+ logger.debug(f"Retrieved file info for ID {file_id}: {name}")
+ return file_info
+
+ async def get_tag_by_name(self, tag_name: str) -> dict[str, Any] | None:
+ """Get a system tag by its name via WebDAV.
+
+ Args:
+ tag_name: Name of the tag to find (case-sensitive)
+
+ Returns:
+ Tag dictionary if found, None otherwise
+ """
+ # Use WebDAV PROPFIND to list all systemtags
+ propfind_body = """
+
+
+
+
+
+
+
+"""
+
+ response = await self._client.request(
+ "PROPFIND",
+ "/remote.php/dav/systemtags/",
+ headers={"Depth": "1"},
+ content=propfind_body,
+ )
+ response.raise_for_status()
+
+ # Parse XML response
+ root = ET.fromstring(response.content)
+ ns = {
+ "d": "DAV:",
+ "oc": "http://owncloud.org/ns",
+ }
+
+ for response_elem in root.findall("d:response", ns):
+ href = response_elem.find("d:href", ns)
+ if href is None or href.text == "/remote.php/dav/systemtags/":
+ # Skip the collection itself
+ continue
+
+ propstat = response_elem.find("d:propstat", ns)
+ if propstat is None:
+ continue
+
+ prop = propstat.find("d:prop", ns)
+ if prop is None:
+ continue
+
+ # Extract tag properties
+ tag_id_elem = prop.find("oc:id", ns)
+ display_name_elem = prop.find("oc:display-name", ns)
+ user_visible_elem = prop.find("oc:user-visible", ns)
+ user_assignable_elem = prop.find("oc:user-assignable", ns)
+
+ if display_name_elem is not None and display_name_elem.text == tag_name:
+ tag_info = {
+ "id": int(tag_id_elem.text) if tag_id_elem is not None else None,
+ "name": display_name_elem.text,
+ "userVisible": user_visible_elem.text.lower() == "true"
+ if user_visible_elem is not None
+ else True,
+ "userAssignable": user_assignable_elem.text.lower() == "true"
+ if user_assignable_elem is not None
+ else True,
+ }
+ logger.debug(f"Found tag '{tag_name}' with ID {tag_info['id']}")
+ return tag_info
+
+ logger.debug(f"Tag '{tag_name}' not found")
+ return None
+
+ async def get_files_by_tag(self, tag_id: int) -> list[dict[str, Any]]:
+     """Get all files tagged with a specific system tag via WebDAV REPORT.
+
+     Args:
+         tag_id: Numeric ID of the tag
+
+     Returns:
+         List of file info dictionaries with path, size, content_type, etc.
+     """
+     # Use WebDAV REPORT method with systemtag filter, requesting all properties
+     report_body = f"""
+
+
+
+
+
+
+
+
+
+
+ {tag_id}
+
+"""
+
+     response = await self._client.request(
+         "REPORT",
+         f"{self._get_webdav_base_path()}/",
+         content=report_body,
+     )
+     response.raise_for_status()
+
+     # Hoisted: previously re-imported on every loop iteration
+     from email.utils import parsedate_to_datetime
+     from urllib.parse import unquote
+
+     # Parse XML response
+     root = ET.fromstring(response.content)
+     ns = {"d": "DAV:", "oc": "http://owncloud.org/ns"}
+     webdav_prefix = f"/remote.php/dav/files/{self.username}/"
+
+     files = []
+     for response_elem in root.findall("d:response", ns):
+         # Extract href (file path)
+         href_elem = response_elem.find("d:href", ns)
+         if href_elem is None or not href_elem.text:
+             continue
+
+         propstat = response_elem.find("d:propstat", ns)
+         if propstat is None:
+             continue
+
+         prop = propstat.find("d:prop", ns)
+         if prop is None:
+             continue
+
+         # Extract all properties
+         fileid_elem = prop.find("oc:fileid", ns)
+         displayname_elem = prop.find("d:displayname", ns)
+         contentlength_elem = prop.find("d:getcontentlength", ns)
+         contenttype_elem = prop.find("d:getcontenttype", ns)
+         lastmodified_elem = prop.find("d:getlastmodified", ns)
+         etag_elem = prop.find("d:getetag", ns)
+
+         if fileid_elem is None or not fileid_elem.text:
+             continue
+
+         # Decode the href and drop everything through the FIRST occurrence of
+         # the prefix; later repeats of that text inside the path are kept.
+         href_path = unquote(href_elem.text)
+         _, found, relative = href_path.partition(webdav_prefix)
+         file_path = f"/{relative}" if found else href_path
+
+         # Parse last modified timestamp (stays None if missing or unparseable)
+         last_modified = (
+             lastmodified_elem.text if lastmodified_elem is not None else None
+         )
+         last_modified_timestamp = None
+         if last_modified:
+             try:
+                 dt = parsedate_to_datetime(last_modified)
+                 last_modified_timestamp = int(dt.timestamp())
+             except (TypeError, ValueError, OverflowError):
+                 logger.debug(f"Unparseable getlastmodified: {last_modified!r}")
+
+         # Element .text is None for an empty element, so fall back explicitly
+         name = displayname_elem.text if displayname_elem is not None else None
+         content_type = (
+             contenttype_elem.text if contenttype_elem is not None else None
+         )
+         file_info = {
+             "id": int(fileid_elem.text),
+             "path": file_path,
+             "name": name or file_path.split("/")[-1],
+             "size": int(contentlength_elem.text)
+             if contentlength_elem is not None and contentlength_elem.text
+             else 0,
+             "content_type": content_type or "",
+             "last_modified": last_modified,
+             "last_modified_timestamp": last_modified_timestamp,
+             "etag": etag_elem.text if etag_elem is not None else None,
+         }
+         files.append(file_info)
+
+     logger.debug(f"Found {len(files)} files with tag ID {tag_id}")
+     return files
diff --git a/nextcloud_mcp_server/config.py b/nextcloud_mcp_server/config.py
index b81d86c..d64e6a2 100644
--- a/nextcloud_mcp_server/config.py
+++ b/nextcloud_mcp_server/config.py
@@ -102,6 +102,14 @@ def get_document_processor_config() -> dict[str, Any]:
"lang": os.getenv("TESSERACT_LANG", "eng"),
}
+ # PyMuPDF configuration (local PDF processing)
+ if os.getenv("ENABLE_PYMUPDF", "true").lower() == "true": # Enabled by default
+ config["processors"]["pymupdf"] = {
+ "extract_images": os.getenv("PYMUPDF_EXTRACT_IMAGES", "true").lower()
+ == "true",
+ "image_dir": os.getenv("PYMUPDF_IMAGE_DIR"), # None = use temp directory
+ }
+
# Custom processor (via HTTP API)
if os.getenv("ENABLE_CUSTOM_PROCESSOR", "false").lower() == "true":
custom_url = os.getenv("CUSTOM_PROCESSOR_URL")
diff --git a/nextcloud_mcp_server/document_processors/__init__.py b/nextcloud_mcp_server/document_processors/__init__.py
index 9d5636b..1997883 100644
--- a/nextcloud_mcp_server/document_processors/__init__.py
+++ b/nextcloud_mcp_server/document_processors/__init__.py
@@ -1,12 +1,18 @@
"""Document processing plugins for extracting text from various file formats."""
from .base import DocumentProcessor, ProcessingResult, ProcessorError
+from .pymupdf import PyMuPDFProcessor
from .registry import ProcessorRegistry, get_registry
+# Register a default-configured PyMuPDFProcessor as an import-time side effect
+_registry = get_registry()
+_registry.register(PyMuPDFProcessor(), priority=10)
+
__all__ = [
"DocumentProcessor",
"ProcessingResult",
"ProcessorError",
"ProcessorRegistry",
"get_registry",
+ "PyMuPDFProcessor",
]
diff --git a/nextcloud_mcp_server/document_processors/pymupdf.py b/nextcloud_mcp_server/document_processors/pymupdf.py
new file mode 100644
index 0000000..8d5c7e6
--- /dev/null
+++ b/nextcloud_mcp_server/document_processors/pymupdf.py
@@ -0,0 +1,285 @@
+"""Document processor using PyMuPDF (fitz) library."""
+
+import logging
+import pathlib
+import tempfile
+from collections.abc import Awaitable, Callable
+from typing import Any, Optional
+
+import pymupdf
+import pymupdf.layout
+import pymupdf4llm
+
+from .base import DocumentProcessor, ProcessingResult, ProcessorError
+
+# Activate layout analysis for better text extraction
+pymupdf.layout.activate()
+
+logger = logging.getLogger(__name__)
+
+
+class PyMuPDFProcessor(DocumentProcessor):
+ """Document processor using PyMuPDF library for PDF processing.
+
+ PyMuPDF (fitz) is a fast, local PDF processing library that extracts text,
+ metadata, and images without requiring external API calls.
+
+ Features:
+ - Fast text extraction with layout preservation
+ - PDF metadata extraction (title, author, creation date, page count)
+ - Image extraction for future multimodal support
+ - Page number tracking for precise citations
+ """
+
+ SUPPORTED_TYPES = {
+ "application/pdf",
+ }
+
+ def __init__(
+ self,
+ extract_images: bool = True,
+ image_dir: Optional[str | pathlib.Path] = None,
+ ):
+ """Initialize PyMuPDF processor.
+
+ Args:
+ extract_images: Whether to extract embedded images from PDFs
+ image_dir: Directory to store extracted images (defaults to temp directory)
+ """
+ self.extract_images = extract_images
+
+ if image_dir is None:
+ self.image_dir = pathlib.Path(tempfile.gettempdir()) / "pdf-images"
+ else:
+ self.image_dir = pathlib.Path(image_dir)
+
+ # Create image directory if it doesn't exist
+ if self.extract_images:
+ self.image_dir.mkdir(exist_ok=True, parents=True)
+ logger.info(
+ f"Initialized PyMuPDFProcessor with image extraction to {self.image_dir}"
+ )
+ else:
+ logger.info("Initialized PyMuPDFProcessor without image extraction")
+
+ @property
+ def name(self) -> str:
+ return "pymupdf"
+
+ @property
+ def supported_mime_types(self) -> set[str]:
+ return self.SUPPORTED_TYPES
+
+ async def process(
+ self,
+ content: bytes,
+ content_type: str,
+ filename: Optional[str] = None,
+ options: Optional[dict[str, Any]] = None,
+ progress_callback: Optional[
+ Callable[[float, Optional[float], Optional[str]], Awaitable[None]]
+ ] = None,
+ ) -> ProcessingResult:
+ """Process a PDF document and extract text, metadata, and images.
+
+ Args:
+ content: PDF document bytes
+ content_type: MIME type (should be application/pdf)
+ filename: Optional filename for better error messages
+ options: Processing options (currently unused)
+ progress_callback: Optional callback for progress updates
+
+ Returns:
+ ProcessingResult with extracted text and metadata
+
+ Raises:
+ ProcessorError: If PDF processing fails
+ """
+ import anyio
+
+ try:
+ if progress_callback:
+ await progress_callback(0, 100, "Processing PDF in background thread")
+
+ # Run CPU-bound PDF processing in thread pool to avoid blocking event loop
+ result = await anyio.to_thread.run_sync(
+ self._process_sync,
+ content,
+ filename,
+ )
+
+ if progress_callback:
+ await progress_callback(100, 100, "Processing complete")
+
+ return result
+
+ except Exception as e:
+ error_msg = f"Failed to process PDF {filename or ''}: {e}"
+ logger.error(error_msg, exc_info=True)
+ raise ProcessorError(error_msg) from e
+
+ def _process_sync(
+     self,
+     content: bytes,
+     filename: Optional[str] = None,
+ ) -> ProcessingResult:
+     """Synchronous PDF processing (runs in thread pool).
+
+     Args:
+         content: PDF document bytes
+         filename: Optional filename for better error messages
+
+     Returns:
+         ProcessingResult with extracted text and metadata
+
+     Raises:
+         Exception: If PDF processing fails
+     """
+     # Open PDF from bytes; the finally below closes it even if extraction fails
+     doc = pymupdf.open("pdf", content)
+     try:
+         # Extract metadata from PDF and add the file size
+         metadata = self._extract_metadata(doc, filename)
+         metadata["file_size"] = len(content)
+
+         # One image directory per PDF, prepared once instead of once per page
+         pdf_image_dir = None
+         if self.extract_images:
+             pdf_id = filename.replace("/", "_") if filename else "unknown"
+             pdf_image_dir = self.image_dir / pdf_id
+             pdf_image_dir.mkdir(exist_ok=True, parents=True)
+
+         # Extract text page-by-page to preserve page boundaries
+         # pymupdf.layout.activate() causes page_chunks=True to return a string,
+         # so we manually extract text per page instead.
+         page_boundaries = []
+         current_offset = 0
+         full_text_parts = []
+
+         for page_num in range(doc.page_count):
+             if pdf_image_dir is not None:
+                 # Extract page as markdown with images
+                 page_md = pymupdf4llm.to_markdown(
+                     doc,
+                     pages=[page_num],  # Extract single page
+                     write_images=True,
+                     image_path=pdf_image_dir,
+                     page_chunks=False,  # Single page, no chunking needed
+                 )
+             else:
+                 # Extract page as markdown without images
+                 page_md = pymupdf4llm.to_markdown(
+                     doc,
+                     pages=[page_num],  # Extract single page
+                     write_images=False,
+                     page_chunks=False,  # Single page, no chunking needed
+                 )
+
+             # Store page text
+             full_text_parts.append(page_md)
+
+             # Store boundary info: {page (1-indexed), start, end}
+             page_boundaries.append(
+                 {
+                     "page": page_num + 1,  # Convert to 1-indexed
+                     "start_offset": current_offset,
+                     "end_offset": current_offset + len(page_md),
+                 }
+             )
+
+             current_offset += len(page_md)
+
+         # Collect images once, after all pages. Listing the directory after
+         # every page (as before) re-added earlier pages' files each time and
+         # inflated image_count.
+         image_paths = []
+         if pdf_image_dir is not None and pdf_image_dir.exists():
+             image_paths = sorted(str(p) for p in pdf_image_dir.glob("*"))
+     finally:
+         doc.close()
+
+     # Join all page texts
+     md_text = "".join(full_text_parts)
+
+     # Store image metadata
+     metadata["has_images"] = len(image_paths) > 0
+     if image_paths:
+         metadata["image_count"] = len(image_paths)
+         metadata["image_paths"] = image_paths
+
+     # Add page boundaries to metadata for chunker to use
+     metadata["page_boundaries"] = page_boundaries
+
+     logger.info(
+         f"Successfully processed PDF {filename or ''}: "
+         f"{metadata['page_count']} pages, {len(md_text)} chars, "
+         f"{metadata.get('image_count', 0)} images"
+     )
+
+     return ProcessingResult(
+         text=md_text,
+         metadata=metadata,
+         processor=self.name,
+         success=True,
+     )
+
+ def _extract_metadata(
+     self, doc: pymupdf.Document, filename: Optional[str]
+ ) -> dict[str, Any]:
+     """Extract metadata from PDF document.
+
+     Args:
+         doc: Opened PyMuPDF document
+         filename: Optional filename
+
+     Returns:
+         Dictionary with "page_count" and "format", plus "filename" and any
+         non-empty standard PDF info fields (title, author, subject, keywords,
+         creator, producer, creation_date, modification_date)
+     """
+     # PDF info key -> key used in the returned dictionary
+     field_names = {
+         "title": "title",
+         "author": "author",
+         "subject": "subject",
+         "keywords": "keywords",
+         "creator": "creator",
+         "producer": "producer",
+         "creationDate": "creation_date",
+         "modDate": "modification_date",
+     }
+     pdf_metadata = doc.metadata or {}
+
+     metadata: dict[str, Any] = {"page_count": doc.page_count}
+
+     # Prefer the "format" entry of the document metadata when it is set
+     # (NOTE(review): PyMuPDF documents it as e.g. "PDF 1.7" - confirm); the
+     # earlier pdf_version probe stays as the fallback so the key always exists.
+     metadata["format"] = pdf_metadata.get("format") or "PDF 1." + str(
+         doc.pdf_version() if hasattr(doc, "pdf_version") else "?"
+     )
+
+     if filename:
+         metadata["filename"] = filename
+
+     # Copy only the fields that are actually set
+     for pdf_key, key in field_names.items():
+         if pdf_metadata.get(pdf_key):
+             metadata[key] = pdf_metadata[pdf_key]
+
+     return metadata
+
+ async def health_check(self) -> bool:
+ """Check if PyMuPDF is available and working.
+
+ Returns:
+ True if processor is ready to use
+ """
+ try:
+ # Try to create a simple PDF in memory
+ test_doc = pymupdf.open()
+ test_doc.close()
+ return True
+ except Exception as e:
+ logger.error(f"PyMuPDF health check failed: {e}")
+ return False
diff --git a/nextcloud_mcp_server/embedding/bm25_provider.py b/nextcloud_mcp_server/embedding/bm25_provider.py
index d0f088a..df20e07 100644
--- a/nextcloud_mcp_server/embedding/bm25_provider.py
+++ b/nextcloud_mcp_server/embedding/bm25_provider.py
@@ -53,7 +53,7 @@ class BM25SparseEmbeddingProvider:
"values": sparse_embedding.values.tolist(),
}
- def encode_batch(self, texts: list[str]) -> list[dict[str, Any]]:
+ async def encode_batch(self, texts: list[str]) -> list[dict[str, Any]]:
"""
Generate BM25 sparse embeddings for multiple texts (batched).
@@ -63,7 +63,12 @@ class BM25SparseEmbeddingProvider:
Returns:
List of dictionaries with 'indices' and 'values' for each text
"""
- sparse_embeddings = list(self.model.embed(texts))
+ import anyio
+
+ # Run CPU-bound BM25 encoding in thread pool to avoid blocking event loop
+ sparse_embeddings = await anyio.to_thread.run_sync(
+ lambda: list(self.model.embed(texts))
+ )
return [
{
diff --git a/nextcloud_mcp_server/search/bm25_hybrid.py b/nextcloud_mcp_server/search/bm25_hybrid.py
index bdd3446..5acb861 100644
--- a/nextcloud_mcp_server/search/bm25_hybrid.py
+++ b/nextcloud_mcp_server/search/bm25_hybrid.py
@@ -181,7 +181,8 @@ class BM25HybridSearchAlgorithm(SearchAlgorithm):
results = []
for result in search_response.points:
- doc_id = int(result.payload["doc_id"])
+ # doc_id can be int (notes) or str (files - file paths)
+ doc_id = result.payload["doc_id"]
doc_type = result.payload.get("doc_type", "note")
doc_key = (doc_id, doc_type)
diff --git a/nextcloud_mcp_server/search/semantic.py b/nextcloud_mcp_server/search/semantic.py
index 89e9921..133a0cd 100644
--- a/nextcloud_mcp_server/search/semantic.py
+++ b/nextcloud_mcp_server/search/semantic.py
@@ -128,7 +128,8 @@ class SemanticSearchAlgorithm(SearchAlgorithm):
results = []
for result in search_response.points:
- doc_id = int(result.payload["doc_id"])
+ # doc_id can be int (notes) or str (files - file paths)
+ doc_id = result.payload["doc_id"]
doc_type = result.payload.get("doc_type", "note")
doc_key = (doc_id, doc_type)
diff --git a/nextcloud_mcp_server/vector/document_chunker.py b/nextcloud_mcp_server/vector/document_chunker.py
index b2c1c3d..56a5605 100644
--- a/nextcloud_mcp_server/vector/document_chunker.py
+++ b/nextcloud_mcp_server/vector/document_chunker.py
@@ -15,6 +15,8 @@ class ChunkWithPosition:
text: str
start_offset: int # Character position where chunk starts
end_offset: int # Character position where chunk ends (exclusive)
+ page_number: int | None = None # Page number for PDF chunks (optional)
+ metadata: dict | None = None # Additional processor-specific metadata (optional)
class DocumentChunker:
@@ -50,7 +52,7 @@ class DocumentChunker:
strip_whitespace=True,
)
- def chunk_text(self, content: str) -> list[ChunkWithPosition]:
+ async def chunk_text(self, content: str) -> list[ChunkWithPosition]:
"""
Split text into overlapping chunks with position tracking.
@@ -66,12 +68,17 @@ class DocumentChunker:
Returns:
List of chunks with their character positions in the original content
"""
+ import anyio
+
# Handle empty content - return single empty chunk for backward compatibility
if not content:
return [ChunkWithPosition(text="", start_offset=0, end_offset=0)]
- # Use LangChain to create documents with position tracking
- docs = self.splitter.create_documents([content])
+ # Run CPU-bound text splitting in thread pool to avoid blocking event loop
+ docs = await anyio.to_thread.run_sync(
+ self.splitter.create_documents,
+ [content],
+ )
# Convert LangChain Documents to ChunkWithPosition objects
chunks = [
diff --git a/nextcloud_mcp_server/vector/processor.py b/nextcloud_mcp_server/vector/processor.py
index ba32135..a4fe100 100644
--- a/nextcloud_mcp_server/vector/processor.py
+++ b/nextcloud_mcp_server/vector/processor.py
@@ -29,6 +29,43 @@ from nextcloud_mcp_server.vector.scanner import DocumentTask
logger = logging.getLogger(__name__)
+def assign_page_numbers(chunks, page_boundaries):
+    """Tag each chunk with the page that holds most of its characters.
+
+    A chunk's page is the boundary entry whose start_offset..end_offset
+    range shares the most characters with the chunk; on a tie the earliest
+    entry in page_boundaries wins. Chunks that overlap no page at all are
+    left untouched.
+
+    Args:
+        chunks: List of ChunkWithPosition objects
+        page_boundaries: List of dicts with {page, start_offset, end_offset}
+
+    Returns:
+        None (modifies chunks in place)
+    """
+    if not page_boundaries:
+        return
+
+    def shared_chars(chunk, boundary):
+        # Length of the intersection of the two ranges, never negative
+        first = max(chunk.start_offset, boundary["start_offset"])
+        last = min(chunk.end_offset, boundary["end_offset"])
+        return max(0, last - first)
+
+    for chunk in chunks:
+        # max() returns the first of several equally good candidates, which
+        # matches scanning the pages in order and replacing only on a strictly
+        # larger overlap.
+        best = max(
+            page_boundaries,
+            key=lambda boundary: shared_chars(chunk, boundary),
+        )
+        # A zero overlap means the chunk lies outside every page range
+        if shared_chars(chunk, best) > 0:
+            chunk.page_number = best["page"]
+
+
async def processor_task(
worker_id: int,
receive_stream: MemoryObjectReceiveStream[DocumentTask],
@@ -223,6 +260,32 @@ async def _index_document(
content = f"{document['title']}\n\n{document['content']}"
title = document["title"]
etag = document.get("etag", "")
+ file_metadata = {} # No file-specific metadata for notes
+ elif doc_task.doc_type == "file":
+ # For files, doc_id is the file path
+ file_path = doc_task.doc_id
+
+ # Read file content via WebDAV
+ content_bytes, content_type = await nc_client.webdav.read_file(file_path)
+
+ # Use document processor registry to extract text
+ from nextcloud_mcp_server.document_processors import get_registry
+
+ registry = get_registry()
+
+ try:
+ result = await registry.process(
+ content=content_bytes,
+ content_type=content_type,
+ filename=file_path,
+ )
+ content = result.text
+ file_metadata = result.metadata
+ title = file_metadata.get("title") or file_path.split("/")[-1]
+ etag = "" # WebDAV read_file doesn't return etag
+ except Exception as e:
+ logger.error(f"Failed to process file {file_path}: {e}")
+ raise
else:
raise ValueError(f"Unsupported doc_type: {doc_task.doc_type}")
@@ -231,7 +294,11 @@ async def _index_document(
chunk_size=settings.document_chunk_size,
overlap=settings.document_chunk_overlap,
)
- chunks = chunker.chunk_text(content)
+ chunks = await chunker.chunk_text(content)
+
+ # Assign page numbers to chunks if page boundaries are available (PDFs)
+ if doc_task.doc_type == "file" and "page_boundaries" in file_metadata:
+ assign_page_numbers(chunks, file_metadata["page_boundaries"])
# Extract chunk texts for embedding
chunk_texts = [chunk.text for chunk in chunks]
@@ -242,7 +309,7 @@ async def _index_document(
# Generate sparse embeddings (BM25 for keyword matching)
bm25_service = get_bm25_service()
- sparse_embeddings = bm25_service.encode_batch(chunk_texts)
+ sparse_embeddings = await bm25_service.encode_batch(chunk_texts)
# Prepare Qdrant points
indexed_at = int(time.time())
@@ -277,6 +344,22 @@ async def _index_document(
"chunk_start_offset": chunk.start_offset,
"chunk_end_offset": chunk.end_offset,
"metadata_version": 2, # v2 includes position metadata
+ # File-specific metadata (PDF, etc.)
+ **(
+ {
+ "file_path": doc_task.doc_id,
+ "mime_type": file_metadata.get("content_type", ""),
+ "file_size": file_metadata.get("file_size"),
+ "page_number": chunk.page_number,
+ "page_count": file_metadata.get("page_count"),
+ "author": file_metadata.get("author"),
+ "creation_date": file_metadata.get("creation_date"),
+ "has_images": file_metadata.get("has_images", False),
+ "image_count": file_metadata.get("image_count", 0),
+ }
+ if doc_task.doc_type == "file"
+ else {}
+ ),
},
)
)
diff --git a/nextcloud_mcp_server/vector/scanner.py b/nextcloud_mcp_server/vector/scanner.py
index 9d45a8f..3a1db66 100644
--- a/nextcloud_mcp_server/vector/scanner.py
+++ b/nextcloud_mcp_server/vector/scanner.py
@@ -4,6 +4,7 @@ Periodically scans enabled users' content and queues changed documents for proce
"""
import logging
+import os
import time
from dataclasses import dataclass
@@ -309,7 +310,148 @@ async def scan_user_documents(
)
_potentially_deleted[doc_key] = current_time
+ # Scan tagged PDF files (after notes)
+ # Map doc_id (the file path) -> indexed_at for this user's doc_type="file" points; left empty on initial sync
+ indexed_files = {}
+ if not initial_sync:
+ file_scroll_result = await qdrant_client.scroll(
+ collection_name=settings.get_collection_name(),
+ scroll_filter=Filter(
+ must=[
+ FieldCondition(key="user_id", match=MatchValue(value=user_id)),
+ FieldCondition(key="doc_type", match=MatchValue(value="file")),
+ ]
+ ),
+ limit=10000, # Caps points returned, not files; only this first page is read. NOTE(review): points look per-chunk, so large libraries may be truncated - confirm
+ with_payload=["doc_id", "indexed_at"],
+ with_vectors=False,
+ )
+
+ indexed_files = {
+ point.payload["doc_id"]: point.payload["indexed_at"]
+ for point in file_scroll_result[0]
+ }
+
+ logger.debug(f"Found {len(indexed_files)} indexed files in Qdrant")
+
+ # Scan for tagged PDF files: queue index tasks for new/changed files and delete tasks for vanished ones
+ file_count = 0
+ file_queued = 0
+ nextcloud_file_paths = set()
+
+ try:
+ # Find PDFs carrying the sync tag (VECTOR_SYNC_PDF_TAG env var, default "vector-index")
+ settings = get_settings()  # NOTE(review): not referenced again in the visible part of this hunk - confirm it is needed
+ tag_name = os.getenv("VECTOR_SYNC_PDF_TAG", "vector-index")
+ # Use NextcloudClient.find_files_by_tag() which uses proper OCS API
+ # and filters by PDF MIME type
+ tagged_files = await nc_client.find_files_by_tag(
+ tag_name, mime_type_filter="application/pdf"
+ )
+
+ for file_info in tagged_files:
+ # Files are already filtered by MIME type in find_files_by_tag()
+ file_count += 1
+ file_path = file_info["path"]
+ nextcloud_file_paths.add(file_path)
+
+ # Prefer "last_modified_timestamp"; NOTE(review): the current-time fallback compares newer than any earlier indexed_at, so such files look modified on every incremental scan - confirm intent
+ modified_at = file_info.get("last_modified_timestamp", int(time.time()))
+ if isinstance(file_info.get("last_modified"), str):
+ # A string "last_modified" (RFC 2822 date) replaces the value above when it parses
+ from email.utils import parsedate_to_datetime
+
+ try:
+ dt = parsedate_to_datetime(file_info["last_modified"])
+ modified_at = int(dt.timestamp())
+ except (ValueError, KeyError):
+ pass  # unparseable date: keep the modified_at chosen above
+
+ if initial_sync:
+ # Send everything on first sync
+ await send_stream.send(
+ DocumentTask(
+ user_id=user_id,
+ doc_id=file_path,
+ doc_type="file",
+ operation="index",
+ modified_at=modified_at,
+ )
+ )
+ file_queued += 1
+ else:
+ # Incremental sync: compare with indexed state
+ indexed_at = indexed_files.get(file_path)
+
+ # If file reappeared, remove from potentially_deleted
+ file_key = (user_id, file_path)
+ if file_key in _potentially_deleted:
+ logger.debug(
+ f"File {file_path} reappeared, removing from deletion grace period"
+ )
+ del _potentially_deleted[file_key]
+
+ # Send if never indexed or modified since last index
+ if indexed_at is None or modified_at > indexed_at:
+ await send_stream.send(
+ DocumentTask(
+ user_id=user_id,
+ doc_id=file_path,
+ doc_type="file",
+ operation="index",
+ modified_at=modified_at,
+ )
+ )
+ file_queued += 1
+
+ logger.info(
+ f"[SCAN-{scan_id}] Found {file_count} tagged PDFs for {user_id}"
+ )
+ record_vector_sync_scan(file_count)
+
+ # Deletion detection (incremental only): indexed files missing from the tagged listing get a grace period before a delete task is sent
+ if not initial_sync:
+ for file_path in indexed_files:
+ if file_path not in nextcloud_file_paths:
+ file_key = (user_id, file_path)
+
+ if file_key in _potentially_deleted:
+ # Check if grace period elapsed
+ first_missing_time = _potentially_deleted[file_key]
+ time_missing = current_time - first_missing_time
+
+ if time_missing >= grace_period:
+ # Grace period elapsed, send for deletion
+ logger.info(
+ f"File {file_path} missing for {time_missing:.1f}s "
+ f"(>{grace_period:.1f}s grace period), sending deletion"
+ )
+ await send_stream.send(
+ DocumentTask(
+ user_id=user_id,
+ doc_id=file_path,
+ doc_type="file",
+ operation="delete",
+ modified_at=0,
+ )
+ )
+ file_queued += 1
+ del _potentially_deleted[file_key]
+ else:
+ # First time missing, add to grace period tracking
+ logger.debug(
+ f"File {file_path} missing for first time, starting grace period"
+ )
+ _potentially_deleted[file_key] = current_time
+
+ except Exception as e:  # best-effort: a failed file scan is logged and the code after this block still runs
+ logger.warning(f"Failed to scan tagged files for {user_id}: {e}")
+
+ queued += file_queued
+
if queued > 0:
- logger.info(f"Sent {queued} documents for incremental sync: {user_id}")
+ logger.info(
+ f"Sent {queued} documents ({file_queued} files) for incremental sync: {user_id}"
+ )
else:
logger.debug(f"No changes detected for {user_id}")
diff --git a/pyproject.toml b/pyproject.toml
index f4bffd1..5481b0e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -36,6 +36,9 @@ dependencies = [
"python-json-logger>=3.2.0", # Structured JSON logging
"jinja2>=3.1.6",
"langchain-text-splitters>=1.0.0",
+ "pymupdf>=1.26.6",
+ "pymupdf4llm>=0.2.2",
+ "pymupdf-layout>=1.26.6",
]
classifiers = [
"Development Status :: 4 - Beta",
diff --git a/tests/integration/test_pdf_indexing.py b/tests/integration/test_pdf_indexing.py
new file mode 100644
index 0000000..821ae37
--- /dev/null
+++ b/tests/integration/test_pdf_indexing.py
@@ -0,0 +1,361 @@
+"""Integration tests for PDF document indexing and semantic search.
+
+These tests validate the complete PDF processing flow:
+1. Process PDF with PyMuPDFProcessor
+2. Chunk extracted text with page numbers
+3. Index chunks into Qdrant with metadata
+4. Perform semantic search on PDF content
+5. Verify page numbers and metadata are preserved
+"""
+
+import pymupdf
+import pytest
+from qdrant_client import AsyncQdrantClient
+from qdrant_client.models import Distance, PointStruct, VectorParams
+
+from nextcloud_mcp_server.document_processors.pymupdf import PyMuPDFProcessor
+from nextcloud_mcp_server.embedding import SimpleEmbeddingProvider
+from nextcloud_mcp_server.vector.document_chunker import (
+ ChunkWithPosition,
+ RecursiveCharacterTextSplitter,
+)
+
+pytestmark = pytest.mark.integration
+
+
+def create_test_pdf() -> bytes:
+ """Build a three-page PDF in memory with PyMuPDF and return its serialized bytes."""
+ doc = pymupdf.open()
+
+ # Page 1: Introduction
+ page1 = doc.new_page(width=595, height=842) # A4 size
+ page1.insert_text(
+ (50, 50),
+ "Nextcloud Administration Guide\n\n"
+ "Chapter 1: Introduction\n\n"
+ "Nextcloud is a self-hosted file sharing and collaboration platform. "
+ "It provides secure file storage, sharing, and synchronization across devices. "
+ "This guide covers installation, configuration, and maintenance of Nextcloud.",
+ )
+
+ # Page 2: Installation
+ page2 = doc.new_page(width=595, height=842)
+ page2.insert_text(
+ (50, 50),
+ "Chapter 2: Installation\n\n"
+ "System Requirements:\n"
+ "- PHP 8.0 or higher\n"
+ "- MySQL 8.0 or MariaDB 10.5\n"
+ "- Apache or Nginx web server\n\n"
+ "Installation steps:\n"
+ "1. Download Nextcloud package\n"
+ "2. Extract to web server directory\n"
+ "3. Configure database connection\n"
+ "4. Run installation wizard",
+ )
+
+ # Page 3: Configuration
+ page3 = doc.new_page(width=595, height=842)
+ page3.insert_text(
+ (50, 50),
+ "Chapter 3: Configuration\n\n"
+ "Database Configuration:\n"
+ "Edit config/config.php to set database parameters. "
+ "Configure database host, username, password, and database name. "
+ "For optimal performance, use MySQL or MariaDB.\n\n"
+ "Security Settings:\n"
+ "Enable HTTPS, configure trusted domains, and set up firewall rules.",
+ )
+
+ # Serialize to bytes before closing the document
+ pdf_bytes = doc.tobytes()
+ doc.close()
+
+ return pdf_bytes
+
+
+@pytest.fixture
+async def simple_embedding_provider():
+ """Provide a SimpleEmbeddingProvider with dimension=384, the vector size used by the test collection."""
+ return SimpleEmbeddingProvider(dimension=384)
+
+
+@pytest.fixture
+async def qdrant_test_client():
+ """Yield an in-memory AsyncQdrantClient and close it once the test finishes."""
+ client = AsyncQdrantClient(":memory:")
+ yield client
+ await client.close()
+
+
+@pytest.fixture
+async def test_collection(qdrant_test_client: AsyncQdrantClient):
+ """Create a 384-dimension cosine-distance collection, yield its name, then delete it."""
+ collection_name = "test_pdf_indexing"
+
+ # Create collection
+ await qdrant_test_client.create_collection(
+ collection_name=collection_name,
+ vectors_config=VectorParams(size=384, distance=Distance.COSINE),
+ )
+
+ yield collection_name
+
+ # Cleanup is best-effort: any error while deleting the collection is ignored
+ try:
+ await qdrant_test_client.delete_collection(collection_name)
+ except Exception:
+ pass
+
+
+@pytest.fixture
+def pymupdf_processor():
+ """Provide a PyMuPDFProcessor constructed with extract_images=False."""
+ return PyMuPDFProcessor(extract_images=False)
+
+
+async def test_pymupdf_processor_extracts_text_and_metadata(pymupdf_processor):
+ """Process the generated PDF and check the extracted text plus page_count, filename and format metadata."""
+ pdf_bytes = create_test_pdf()
+
+ result = await pymupdf_processor.process(
+ content=pdf_bytes,
+ content_type="application/pdf",
+ filename="test-admin-guide.pdf",
+ )
+
+ # Verify result structure
+ assert result.success is True
+ assert result.processor == "pymupdf"
+ assert result.text is not None
+ assert len(result.text) > 0
+
+ # Verify extracted text contains strings written to each of the three pages
+ assert "Nextcloud Administration Guide" in result.text
+ assert "Chapter 1: Introduction" in result.text
+ assert "Chapter 2: Installation" in result.text
+ assert "Chapter 3: Configuration" in result.text
+ assert "PHP 8.0 or higher" in result.text
+ assert "MySQL" in result.text
+
+ # Verify metadata
+ assert result.metadata is not None
+ assert result.metadata["page_count"] == 3
+ assert result.metadata["filename"] == "test-admin-guide.pdf"
+ assert "format" in result.metadata
+
+
+async def test_document_chunker_preserves_page_numbers():
+ """Check that ChunkWithPosition keeps the page_number it was constructed with (no chunker is invoked here)."""
+ # Create chunks with page numbers
+ chunks = [
+ ChunkWithPosition(
+ text="Chapter 1 content on page 1",
+ start_offset=0,
+ end_offset=28,
+ page_number=1,
+ ),
+ ChunkWithPosition(
+ text="Chapter 2 content on page 2",
+ start_offset=29,
+ end_offset=57,
+ page_number=2,
+ ),
+ ChunkWithPosition(
+ text="Chapter 3 content on page 3",
+ start_offset=58,
+ end_offset=86,
+ page_number=3,
+ ),
+ ]
+
+ # Verify each chunk reports the page number passed to its constructor
+ assert chunks[0].page_number == 1
+ assert chunks[1].page_number == 2
+ assert chunks[2].page_number == 3
+
+
+async def test_pdf_indexing_and_search_flow(
+ pymupdf_processor: PyMuPDFProcessor,
+ qdrant_test_client: AsyncQdrantClient,
+ test_collection: str,
+ simple_embedding_provider: SimpleEmbeddingProvider,
+):
+ """Process the test PDF, split and embed its text, upsert the chunks into Qdrant, then query them twice."""
+
+ # Step 1: Process PDF with PyMuPDF
+ pdf_bytes = create_test_pdf()
+ result = await pymupdf_processor.process(
+ content=pdf_bytes,
+ content_type="application/pdf",
+ filename="/Documents/admin-guide.pdf",
+ )
+
+ assert result.success is True
+ assert result.metadata["page_count"] == 3
+
+ # Step 2: Chunk the extracted text
+ # Note: the splitter returns plain strings with no page information,
+ # so page numbers are approximated from chunk content in Step 3
+ splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
+ chunks = splitter.split_text(result.text)
+
+ assert len(chunks) > 0
+
+ # Step 3: Index chunks into Qdrant with PDF metadata
+ points = []
+ for idx, chunk_text in enumerate(chunks):
+ embedding = await simple_embedding_provider.embed(chunk_text)
+
+ # Simulate page number assignment (in real implementation, this would be tracked)
+ # Keyword heuristic: default to page 1, chapter 2/3 markers select pages 2/3
+ page_number = 1
+ if "Chapter 2" in chunk_text or "Installation" in chunk_text:
+ page_number = 2
+ elif "Chapter 3" in chunk_text or "Configuration" in chunk_text:
+ page_number = 3
+
+ points.append(
+ PointStruct(
+ id=idx,
+ vector=embedding,
+ payload={
+ "user_id": "admin",
+ "doc_id": "/Documents/admin-guide.pdf",
+ "doc_type": "file",
+ "title": "Nextcloud Administration Guide",
+ "file_path": "/Documents/admin-guide.pdf",
+ "mime_type": "application/pdf",
+ "page_number": page_number,
+ "page_count": result.metadata["page_count"],
+ "chunk_index": idx,
+ "excerpt": chunk_text[:200],
+ },
+ )
+ )
+
+ await qdrant_test_client.upsert(
+ collection_name=test_collection, points=points, wait=True
+ )
+
+ # Step 4: Perform semantic search for installation instructions
+ query = "how to install Nextcloud system requirements"
+ query_embedding = await simple_embedding_provider.embed(query)
+
+ response = await qdrant_test_client.query_points(
+ collection_name=test_collection,
+ query=query_embedding,
+ limit=3,
+ score_threshold=0.0,
+ )
+
+ # Verify search results
+ assert len(response.points) > 0
+
+ # Top result must mention "Installation" in its excerpt or be tagged as page 2
+ top_result = response.points[0]
+ assert top_result.payload["doc_type"] == "file"
+ assert top_result.payload["file_path"] == "/Documents/admin-guide.pdf"
+ assert (
+ "Installation" in top_result.payload["excerpt"]
+ or top_result.payload["page_number"] == 2
+ )
+
+ # Verify page number is preserved
+ assert top_result.payload["page_number"] in [1, 2, 3]
+ assert top_result.payload["page_count"] == 3
+
+ # Step 5: Search for configuration
+ query = "database configuration settings MySQL"
+ query_embedding = await simple_embedding_provider.embed(query)
+
+ response = await qdrant_test_client.query_points(
+ collection_name=test_collection,
+ query=query_embedding,
+ limit=3,
+ score_threshold=0.0,
+ )
+
+ assert len(response.points) > 0
+
+ # One of the top two hits must mention "Configuration" or be tagged as page 3
+ found_config = any(
+ "Configuration" in r.payload["excerpt"] or r.payload["page_number"] == 3
+ for r in response.points[:2]
+ )
+ assert found_config
+
+
+async def test_pdf_search_with_filters(
+ pymupdf_processor: PyMuPDFProcessor,
+ qdrant_test_client: AsyncQdrantClient,
+ test_collection: str,
+ simple_embedding_provider: SimpleEmbeddingProvider,
+):
+ """Index the PDF chunks, query with a doc_type == "file" filter, and check every hit's doc_type and mime_type."""
+ from qdrant_client.models import FieldCondition, Filter, MatchValue
+
+ # Process and index PDF
+ pdf_bytes = create_test_pdf()
+ result = await pymupdf_processor.process(
+ content=pdf_bytes,
+ content_type="application/pdf",
+ filename="/Documents/admin-guide.pdf",
+ )
+
+ splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
+ chunks = splitter.split_text(result.text)
+
+ # Index with metadata
+ points = []
+ for idx, chunk_text in enumerate(chunks):
+ embedding = await simple_embedding_provider.embed(chunk_text)
+
+ points.append(
+ PointStruct(
+ id=idx,
+ vector=embedding,
+ payload={
+ "user_id": "admin",
+ "doc_id": "/Documents/admin-guide.pdf",
+ "doc_type": "file",
+ "mime_type": "application/pdf",
+ "excerpt": chunk_text[:200],
+ },
+ )
+ )
+
+ await qdrant_test_client.upsert(
+ collection_name=test_collection, points=points, wait=True
+ )
+
+ # Search restricted to points whose doc_type is "file"
+ query = "Nextcloud installation"
+ query_embedding = await simple_embedding_provider.embed(query)
+
+ response = await qdrant_test_client.query_points(
+ collection_name=test_collection,
+ query=query_embedding,
+ query_filter=Filter(
+ must=[FieldCondition(key="doc_type", match=MatchValue(value="file"))]
+ ),
+ limit=3,
+ )
+
+ # All results should be from file documents (note: the loop rebinds the earlier `result` name)
+ assert len(response.points) > 0
+ for result in response.points:
+ assert result.payload["doc_type"] == "file"
+ assert result.payload["mime_type"] == "application/pdf"
+
+
+async def test_pymupdf_health_check(pymupdf_processor: PyMuPDFProcessor):
+ """Check that the processor's health_check() coroutine returns True."""
+ is_healthy = await pymupdf_processor.health_check()
+ assert is_healthy is True
+
+
+async def test_pymupdf_supports_pdf_mime_type(pymupdf_processor: PyMuPDFProcessor):
+ """Check that supported_mime_types contains application/pdf and that the processor name is "pymupdf"."""
+ assert "application/pdf" in pymupdf_processor.supported_mime_types
+ assert pymupdf_processor.name == "pymupdf"
diff --git a/tests/unit/test_document_chunker.py b/tests/unit/test_document_chunker.py
index 8a1d90f..66102a7 100644
--- a/tests/unit/test_document_chunker.py
+++ b/tests/unit/test_document_chunker.py
@@ -9,12 +9,12 @@ from nextcloud_mcp_server.vector.document_chunker import (
class TestDocumentChunkerPositions:
"""Test suite for DocumentChunker position tracking functionality."""
- def test_single_chunk_simple_text(self):
+ async def test_single_chunk_simple_text(self):
"""Test that single-chunk documents return correct positions."""
chunker = DocumentChunker(chunk_size=2048, overlap=200)
content = "This is a short document."
- chunks = chunker.chunk_text(content)
+ chunks = await chunker.chunk_text(content)
assert len(chunks) == 1
assert isinstance(chunks[0], ChunkWithPosition)
@@ -22,7 +22,7 @@ class TestDocumentChunkerPositions:
assert chunks[0].start_offset == 0
assert chunks[0].end_offset == len(content)
- def test_multiple_chunks_positions(self):
+ async def test_multiple_chunks_positions(self):
"""Test that multi-chunk documents have correct positions."""
# Use small chunk size to force multiple chunks
chunker = DocumentChunker(chunk_size=50, overlap=10)
@@ -34,7 +34,7 @@ class TestDocumentChunkerPositions:
"This is the fourth sentence adding more context."
)
- chunks = chunker.chunk_text(content)
+ chunks = await chunker.chunk_text(content)
# Verify we got multiple chunks
assert len(chunks) > 1
@@ -61,12 +61,12 @@ class TestDocumentChunkerPositions:
extracted = content[chunk.start_offset : chunk.end_offset]
assert extracted == chunk.text
- def test_chunk_positions_with_whitespace(self):
+ async def test_chunk_positions_with_whitespace(self):
"""Test position tracking with various whitespace."""
chunker = DocumentChunker(chunk_size=30, overlap=5)
content = "First sentence here. Second sentence.\n\nThird sentence.\tFourth sentence."
- chunks = chunker.chunk_text(content)
+ chunks = await chunker.chunk_text(content)
# Verify positions correctly handle whitespace
for chunk in chunks:
@@ -75,19 +75,19 @@ class TestDocumentChunkerPositions:
# LangChain strips whitespace by default
assert len(chunk.text.strip()) > 0
- def test_empty_content(self):
+ async def test_empty_content(self):
"""Test that empty content returns empty chunk."""
chunker = DocumentChunker(chunk_size=2048, overlap=200)
content = ""
- chunks = chunker.chunk_text(content)
+ chunks = await chunker.chunk_text(content)
assert len(chunks) == 1
assert chunks[0].text == ""
assert chunks[0].start_offset == 0
assert chunks[0].end_offset == 0
- def test_chunk_overlap_positions(self):
+ async def test_chunk_overlap_positions(self):
"""Test that overlapping chunks have correct positions."""
chunker = DocumentChunker(chunk_size=50, overlap=15)
content = (
@@ -97,7 +97,7 @@ class TestDocumentChunkerPositions:
"This is sentence four adding details."
)
- chunks = chunker.chunk_text(content)
+ chunks = await chunker.chunk_text(content)
# Verify overlap exists if we have multiple chunks
if len(chunks) > 1:
@@ -112,14 +112,14 @@ class TestDocumentChunkerPositions:
# With overlap, next chunk may start before current ends
assert next_chunk.start_offset <= current_chunk.end_offset
- def test_unicode_content_positions(self):
+ async def test_unicode_content_positions(self):
"""Test position tracking with Unicode characters."""
chunker = DocumentChunker(chunk_size=50, overlap=10)
content = (
"Hello 世界. こんにちは there. мир Привет world. שלום مرحبا 你好 friend."
)
- chunks = chunker.chunk_text(content)
+ chunks = await chunker.chunk_text(content)
# Verify all chunks extract correctly
for chunk in chunks:
@@ -131,7 +131,7 @@ class TestDocumentChunkerPositions:
assert chunks[0].start_offset == 0
assert chunks[0].end_offset == len(content)
- def test_realistic_note_content(self):
+ async def test_realistic_note_content(self):
"""Test with realistic note content similar to Nextcloud Notes."""
chunker = DocumentChunker(chunk_size=200, overlap=50)
content = """My Project Notes
@@ -152,7 +152,7 @@ position tracking for each chunk.
This allows us to highlight the exact chunk that matched a search query,
which builds trust in the RAG system."""
- chunks = chunker.chunk_text(content)
+ chunks = await chunker.chunk_text(content)
# Should have multiple chunks
assert len(chunks) > 1
@@ -168,7 +168,7 @@ which builds trust in the RAG system."""
assert chunk.end_offset <= len(content)
assert chunk.start_offset < chunk.end_offset
- def test_semantic_boundary_preservation(self):
+ async def test_semantic_boundary_preservation(self):
"""Test that LangChain creates semantically coherent chunks."""
chunker = DocumentChunker(chunk_size=100, overlap=20)
content = (
@@ -178,7 +178,7 @@ which builds trust in the RAG system."""
"Fourth sentence ends."
)
- chunks = chunker.chunk_text(content)
+ chunks = await chunker.chunk_text(content)
# Verify all chunks are extractable using their positions
for chunk in chunks:
@@ -193,7 +193,7 @@ which builds trust in the RAG system."""
assert chunk.end_offset <= len(content)
assert chunk.start_offset < chunk.end_offset
- def test_paragraph_boundary_preservation(self):
+ async def test_paragraph_boundary_preservation(self):
"""Test that LangChain preserves paragraph boundaries."""
chunker = DocumentChunker(chunk_size=80, overlap=15)
content = """First paragraph here.
@@ -204,7 +204,7 @@ Third paragraph here.
Fourth paragraph here."""
- chunks = chunker.chunk_text(content)
+ chunks = await chunker.chunk_text(content)
# LangChain should prefer splitting at paragraph boundaries (\n\n)
# Verify we got multiple chunks
@@ -215,7 +215,7 @@ Fourth paragraph here."""
extracted = content[chunk.start_offset : chunk.end_offset]
assert extracted == chunk.text
- def test_default_parameters(self):
+ async def test_default_parameters(self):
"""Test that default parameters work correctly."""
chunker = DocumentChunker() # Use defaults: 2048 chars, 200 overlap
@@ -224,14 +224,14 @@ Fourth paragraph here."""
"This is a short note with a few sentences. It should fit in one chunk."
)
- chunks = chunker.chunk_text(content)
+ chunks = await chunker.chunk_text(content)
assert len(chunks) == 1
assert chunks[0].text == content
assert chunks[0].start_offset == 0
assert chunks[0].end_offset == len(content)
- def test_large_document_chunking(self):
+ async def test_large_document_chunking(self):
"""Test chunking of a large document."""
chunker = DocumentChunker(chunk_size=100, overlap=20)
@@ -244,7 +244,7 @@ Fourth paragraph here."""
]
content = "\n\n".join(paragraphs)
- chunks = chunker.chunk_text(content)
+ chunks = await chunker.chunk_text(content)
# Should create multiple chunks
assert len(chunks) > 1
@@ -261,12 +261,12 @@ Fourth paragraph here."""
assert chunks[0].start_offset == 0
assert chunks[-1].end_offset == len(content)
- def test_position_tracking_with_overlap(self):
+ async def test_position_tracking_with_overlap(self):
"""Test that position tracking works correctly with overlap."""
chunker = DocumentChunker(chunk_size=50, overlap=15)
content = "A" * 25 + ". " + "B" * 25 + ". " + "C" * 25 + ". " + "D" * 25 + "."
- chunks = chunker.chunk_text(content)
+ chunks = await chunker.chunk_text(content)
if len(chunks) > 1:
# Verify overlap creates correct positions
diff --git a/uv.lock b/uv.lock
index ef66995..75307d2 100644
--- a/uv.lock
+++ b/uv.lock
@@ -1925,6 +1925,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/6c/28/dd72947e59a6a8c856448a5e74da6201cb5502ddff644fbc790e4bd40b9a/multiprocess-0.70.18-py39-none-any.whl", hash = "sha256:e78ca805a72b1b810c690b6b4cc32579eba34f403094bbbae962b7b5bf9dfcb8", size = 133478, upload-time = "2025-04-17T03:11:26.253Z" },
]
+[[package]]
+name = "networkx"
+version = "3.5"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/6c/4f/ccdb8ad3a38e583f214547fd2f7ff1fc160c43a75af88e6aec213404b96a/networkx-3.5.tar.gz", hash = "sha256:d4c6f9cf81f52d69230866796b82afbccdec3db7ae4fbd1b65ea750feed50037", size = 2471065, upload-time = "2025-05-29T11:35:07.804Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/eb/8d/776adee7bbf76365fdd7f2552710282c79a4ead5d2a46408c9043a2b70ba/networkx-3.5-py3-none-any.whl", hash = "sha256:0030d386a9a06dee3565298b4a734b68589749a544acbb6c412dc9e2489ec6ec", size = 2034406, upload-time = "2025-05-29T11:35:04.961Z" },
+]
+
[[package]]
name = "nextcloud-mcp-server"
version = "0.44.0"
@@ -1952,6 +1961,9 @@ dependencies = [
{ name = "prometheus-client" },
{ name = "pydantic" },
{ name = "pyjwt", extra = ["crypto"] },
+ { name = "pymupdf" },
+ { name = "pymupdf-layout" },
+ { name = "pymupdf4llm" },
{ name = "python-json-logger" },
{ name = "pythonvcard4" },
{ name = "qdrant-client" },
@@ -1997,6 +2009,9 @@ requires-dist = [
{ name = "prometheus-client", specifier = ">=0.21.0" },
{ name = "pydantic", specifier = ">=2.11.4" },
{ name = "pyjwt", extras = ["crypto"], specifier = ">=2.8.0" },
+ { name = "pymupdf", specifier = ">=1.26.6" },
+ { name = "pymupdf-layout", specifier = ">=1.26.6" },
+ { name = "pymupdf4llm", specifier = ">=0.2.2" },
{ name = "python-json-logger", specifier = ">=3.2.0" },
{ name = "pythonvcard4", specifier = ">=0.2.0" },
{ name = "qdrant-client", specifier = ">=1.7.0" },
@@ -2969,6 +2984,52 @@ crypto = [
{ name = "cryptography" },
]
+[[package]]
+name = "pymupdf"
+version = "1.26.6"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/ec/d7/a6f0e03a117fa2ad79c4b898203bb212b17804f92558a6a339298faca7bb/pymupdf-1.26.6.tar.gz", hash = "sha256:a2b4531cd4ab36d6f1f794bb6d3c33b49bda22f36d58bb1f3e81cbc10183bd2b", size = 84322494, upload-time = "2025-11-05T15:20:46.786Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/9e/5c/dec354eee5fe4966c715f33818ed4193e0e6c986cf8484de35b6c167fb8e/pymupdf-1.26.6-cp310-abi3-macosx_10_9_x86_64.whl", hash = "sha256:e46f320a136ad55e5219e8f0f4061bdf3e4c12b126d2740d5a49f73fae7ea176", size = 23178988, upload-time = "2025-11-05T14:31:19.834Z" },
+ { url = "https://files.pythonhosted.org/packages/ec/a0/11adb742d18142bd623556cd3b5d64649816decc5eafd30efc9498657e76/pymupdf-1.26.6-cp310-abi3-macosx_11_0_arm64.whl", hash = "sha256:6844cd2396553c0fa06de4869d5d5ecb1260e6fc3b9d85abe8fa35f14dd9d688", size = 22469764, upload-time = "2025-11-05T14:32:34.654Z" },
+ { url = "https://files.pythonhosted.org/packages/e4/c8/377cf20e31f58d4c243bfcf2d3cb7466d5b97003b10b9f1161f11eb4a994/pymupdf-1.26.6-cp310-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:617ba69e02c44f0da1c0e039ea4a26cf630849fd570e169c71daeb8ac52a81d6", size = 23502227, upload-time = "2025-11-06T11:03:56.934Z" },
+ { url = "https://files.pythonhosted.org/packages/4f/bf/6e02e3d84b32c137c71a0a3dcdba8f2f6e9950619a3bc272245c7c06a051/pymupdf-1.26.6-cp310-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:7777d0b7124c2ebc94849536b6a1fb85d158df3b9d873935e63036559391534c", size = 24115381, upload-time = "2025-11-05T14:33:54.338Z" },
+ { url = "https://files.pythonhosted.org/packages/ab/9d/30f7fcb3776bfedde66c06297960debe4883b1667294a1ee9426c942e94d/pymupdf-1.26.6-cp310-abi3-win32.whl", hash = "sha256:8f3ef05befc90ca6bb0f12983200a7048d5bff3e1c1edef1bb3de60b32cb5274", size = 17203613, upload-time = "2025-11-05T17:19:47.494Z" },
+ { url = "https://files.pythonhosted.org/packages/f9/e8/989f4eaa369c7166dc24f0eaa3023f13788c40ff1b96701f7047421554a8/pymupdf-1.26.6-cp310-abi3-win_amd64.whl", hash = "sha256:ce02ca96ed0d1acfd00331a4d41a34c98584d034155b06fd4ec0f051718de7ba", size = 18405680, upload-time = "2025-11-05T14:34:48.672Z" },
+]
+
+[[package]]
+name = "pymupdf-layout"
+version = "1.26.6"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "networkx" },
+ { name = "numpy" },
+ { name = "onnxruntime" },
+ { name = "pymupdf" },
+ { name = "pyyaml" },
+]
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/70/86/31f8d05b36ebf43cca88d5c6415de46eb748e487b618a589671a610be8c8/pymupdf_layout-1.26.6-cp310-abi3-macosx_10_9_x86_64.whl", hash = "sha256:d632f83208db8b24600eb8ac54d3135fab6ab1f251a38fa6061e7470e81b9481", size = 12727222, upload-time = "2025-11-05T14:35:44.367Z" },
+ { url = "https://files.pythonhosted.org/packages/ff/d3/0e52d7d1e2f975843f5354ac3b210a98471b690105efc332d3c285bd794b/pymupdf_layout-1.26.6-cp310-abi3-macosx_11_0_arm64.whl", hash = "sha256:f1d45f72ec08ef7f644928487e7a067df6df63172d682d0bb05158896d0d9c71", size = 12725266, upload-time = "2025-11-05T14:36:50.727Z" },
+ { url = "https://files.pythonhosted.org/packages/ae/49/ad1a5edccc45477493d6a53a41df7620d6147febb897c3dd8354f413e154/pymupdf_layout-1.26.6-cp310-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:0561b9485a6ac1a40bb1e2ec7a1648aa64e4be56dab2f39182b11a69e3e43024", size = 12732580, upload-time = "2025-11-06T11:04:09.065Z" },
+ { url = "https://files.pythonhosted.org/packages/a7/bd/3e049b359dd0c3a101ae915484b87ff73bfdedfb24a924e0a8e6783b33f3/pymupdf_layout-1.26.6-cp310-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:ee8e2bfed12d4b6421b27a1f89837ac09d8bc3f783f79670db397ec24614bf3d", size = 12732539, upload-time = "2025-11-05T14:38:01.244Z" },
+ { url = "https://files.pythonhosted.org/packages/f8/7a/69078bf16669f8361360321ea6bede4cbfede35bf3f4ca5842a7c2387825/pymupdf_layout-1.26.6-cp310-abi3-win_amd64.whl", hash = "sha256:2305aac24fd6e12217afaaea8ec95be297be9b250b6077a3f4e92f7f9beeaf92", size = 12734904, upload-time = "2025-11-05T14:39:05.83Z" },
+]
+
+[[package]]
+name = "pymupdf4llm"
+version = "0.2.2"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "pymupdf" },
+ { name = "tabulate" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/ec/26/e1226c5329d0c901cd42649e4e8d7544636524c31e95a84f4dcf7c25731d/pymupdf4llm-0.2.2.tar.gz", hash = "sha256:d8dee8451e31ec39daf691687403bf2a98ac7e7b8709400a4e13a582eab835c6", size = 59501, upload-time = "2025-11-17T11:10:20.204Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/26/23/08be1528f3ccb8c245e9a7b247255d6853a8e162b1451f4888f2006c52f0/pymupdf4llm-0.2.2-py3-none-any.whl", hash = "sha256:e7777d083f5f7c7daa804c3423804c309a7e096d682773c01e9dd4bb060f4a56", size = 62063, upload-time = "2025-11-17T11:10:22.452Z" },
+]
+
[[package]]
name = "pyreadline3"
version = "3.5.4"
@@ -3553,6 +3614,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/a2/09/77d55d46fd61b4a135c444fc97158ef34a095e5681d0a6c10b75bf356191/sympy-1.14.0-py3-none-any.whl", hash = "sha256:e091cc3e99d2141a0ba2847328f5479b05d94a6635cb96148ccb3f34671bd8f5", size = 6299353, upload-time = "2025-04-27T18:04:59.103Z" },
]
+[[package]]
+name = "tabulate"
+version = "0.9.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/ec/fe/802052aecb21e3797b8f7902564ab6ea0d60ff8ca23952079064155d1ae1/tabulate-0.9.0.tar.gz", hash = "sha256:0095b12bf5966de529c0feb1fa08671671b3368eec77d7ef7ab114be2c068b3c", size = 81090, upload-time = "2022-10-06T17:21:48.54Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/40/44/4a5f08c96eb108af5cb50b41f76142f0afa346dfa99d5296fe7202a11854/tabulate-0.9.0-py3-none-any.whl", hash = "sha256:024ca478df22e9340661486f85298cff5f6dcdba14f3813e8830015b9ed1948f", size = 35252, upload-time = "2022-10-06T17:21:44.262Z" },
+]
+
[[package]]
name = "tenacity"
version = "9.1.2"