From b8010270c1f6ce20e09f36644b90ba882d046254 Mon Sep 17 00:00:00 2001 From: Chris Coutinho Date: Thu, 20 Nov 2025 02:37:07 +0100 Subject: [PATCH 01/19] fix: Add async/await, PDF metadata, and type safety fixes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit addresses multiple issues with async operations, PDF metadata extraction, and type safety in document processing and search. ## Async/Await Fixes - processor.py:259 - Added await for chunker.chunk_text(content) - processor.py:270 - Added await for bm25_service.encode_batch(chunk_texts) - tests/unit/test_document_chunker.py - Converted all 12 test methods to async ## PDF Metadata Enhancement - pymupdf.py:143 - Added file_size metadata extraction - pymupdf.py:145-206 - Refactored to extract text page-by-page - Manually loop through pages instead of using page_chunks=True - Generate page_boundaries metadata for precise page tracking - Works around pymupdf.layout.activate() breaking page_chunks=True - processor.py:32-66 - Added assign_page_numbers() helper function - Assigns page numbers to chunks based on overlap with page boundaries - Handles chunks spanning multiple pages - processor.py:298-300 - Call assign_page_numbers() for PDF files ## Type Safety Fixes - bm25_hybrid.py:184 - Removed int() conversion of doc_id - semantic.py:131 - Removed int() conversion of doc_id - viz_routes.py:275 - Removed int() conversion of doc_id - Added comments documenting that doc_id can be int (notes) or str (file paths) ## Testing - All 18 tests passing (12 unit + 6 integration) - No type errors in modified files - Container logs show successful processing - Vector viz searches working correctly 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- nextcloud_mcp_server/app.py | 20 + nextcloud_mcp_server/auth/viz_routes.py | 4 +- nextcloud_mcp_server/client/__init__.py | 56 +++ nextcloud_mcp_server/client/webdav.py | 347 +++++++++++++++++ 
nextcloud_mcp_server/config.py | 8 + .../document_processors/__init__.py | 6 + .../document_processors/pymupdf.py | 285 ++++++++++++++ .../embedding/bm25_provider.py | 9 +- nextcloud_mcp_server/search/bm25_hybrid.py | 3 +- nextcloud_mcp_server/search/semantic.py | 3 +- .../vector/document_chunker.py | 13 +- nextcloud_mcp_server/vector/processor.py | 87 ++++- nextcloud_mcp_server/vector/scanner.py | 144 ++++++- pyproject.toml | 3 + tests/integration/test_pdf_indexing.py | 361 ++++++++++++++++++ tests/unit/test_document_chunker.py | 48 +-- uv.lock | 70 ++++ 17 files changed, 1432 insertions(+), 35 deletions(-) create mode 100644 nextcloud_mcp_server/document_processors/pymupdf.py create mode 100644 tests/integration/test_pdf_indexing.py diff --git a/nextcloud_mcp_server/app.py b/nextcloud_mcp_server/app.py index 56fd4f5..f532bd4 100644 --- a/nextcloud_mcp_server/app.py +++ b/nextcloud_mcp_server/app.py @@ -122,6 +122,26 @@ def initialize_document_processors(): except Exception as e: logger.warning(f"Failed to register Tesseract processor: {e}") + # Register PyMuPDF processor (high priority, local, no API required) + if "pymupdf" in config["processors"]: + pymupdf_config = config["processors"]["pymupdf"] + try: + from nextcloud_mcp_server.document_processors.pymupdf import ( + PyMuPDFProcessor, + ) + + processor = PyMuPDFProcessor( + extract_images=pymupdf_config.get("extract_images", True), + image_dir=pymupdf_config.get("image_dir"), + ) + registry.register(processor, priority=15) # Higher than unstructured + logger.info( + f"Registered PyMuPDF processor: extract_images={pymupdf_config.get('extract_images', True)}" + ) + registered_count += 1 + except Exception as e: + logger.warning(f"Failed to register PyMuPDF processor: {e}") + # Register custom processor if "custom" in config["processors"]: custom_config = config["processors"]["custom"] diff --git a/nextcloud_mcp_server/auth/viz_routes.py b/nextcloud_mcp_server/auth/viz_routes.py index 3497084..e1f2e4e 100644 
--- a/nextcloud_mcp_server/auth/viz_routes.py +++ b/nextcloud_mcp_server/auth/viz_routes.py @@ -272,7 +272,9 @@ async def vector_visualization_search(request: Request) -> JSONResponse: doc_chunks = defaultdict(list) for point in points: if point.payload: - doc_id = int(point.payload.get("doc_id", 0)) + # doc_id can be int (for notes) or str (for files - file path) + # Keep original type instead of forcing to int + doc_id = point.payload.get("doc_id", 0) vector = extract_dense_vector(point) if vector is not None: doc_chunks[doc_id].append(vector) diff --git a/nextcloud_mcp_server/client/__init__.py b/nextcloud_mcp_server/client/__init__.py index 29dfc36..d6e9f24 100644 --- a/nextcloud_mcp_server/client/__init__.py +++ b/nextcloud_mcp_server/client/__init__.py @@ -130,6 +130,62 @@ class NextcloudClient: all_notes = self.notes.get_all_notes() return await self._notes_search.search_notes(all_notes, query) + async def find_files_by_tag( + self, tag_name: str, mime_type_filter: str | None = None + ) -> list[dict]: + """Find files by system tag name, optionally filtered by MIME type. + + This method coordinates tag lookup and file retrieval via WebDAV: + 1. Look up the tag ID by name + 2. Get all files with that tag (via REPORT with full metadata) + 3. Optionally filter by MIME type + + Args: + tag_name: Name of the system tag to search for (e.g., "vector-index") + mime_type_filter: Optional MIME type filter (e.g., "application/pdf") + + Returns: + List of file dictionaries with WebDAV properties (path, size, content_type, etc.) 
+ + Raises: + RuntimeError: If tag lookup or file query fails + + Examples: + # Find all files with "vector-index" tag + files = await nc_client.find_files_by_tag("vector-index") + + # Find only PDFs with the tag + pdfs = await nc_client.find_files_by_tag("vector-index", "application/pdf") + """ + # Look up tag by name using WebDAV + tag = await self.webdav.get_tag_by_name(tag_name) + if not tag: + logger.debug(f"Tag '{tag_name}' not found, returning empty list") + return [] + + # Get files with this tag (returns full file info from REPORT) + files = await self.webdav.get_files_by_tag(tag["id"]) + if not files: + logger.debug(f"No files found with tag '{tag_name}'") + return [] + + logger.debug(f"Found {len(files)} files with tag '{tag_name}'") + + # Apply MIME type filter if specified + if mime_type_filter: + filtered_files = [ + f + for f in files + if f.get("content_type", "").startswith(mime_type_filter) + ] + logger.info( + f"Returning {len(filtered_files)} files with tag '{tag_name}' (filtered by {mime_type_filter})" + ) + return filtered_files + + logger.info(f"Returning {len(files)} files with tag '{tag_name}'") + return files + def _get_webdav_base_path(self) -> str: """Helper to get the base WebDAV path for the authenticated user.""" return f"/remote.php/dav/files/{self.username}" diff --git a/nextcloud_mcp_server/client/webdav.py b/nextcloud_mcp_server/client/webdav.py index c877e38..05f27df 100644 --- a/nextcloud_mcp_server/client/webdav.py +++ b/nextcloud_mcp_server/client/webdav.py @@ -821,6 +821,20 @@ class WebDAVClient(BaseNextcloudClient): item["file_id"] = int(value) if value else None elif tag == "favorite": item["is_favorite"] = value == "1" + elif tag == "tags": + # Tags can be comma-separated or have multiple child elements + if value: + # Handle comma-separated tags + item["tags"] = [ + t.strip() for t in value.split(",") if t.strip() + ] + else: + # Check for child tag elements (alternative format) + tag_elements = 
child.findall(".//{http://owncloud.org/ns}tag") + if tag_elements: + item["tags"] = [t.text for t in tag_elements if t.text] + else: + item["tags"] = [] elif tag == "permissions": item["permissions"] = value elif tag == "size": @@ -948,3 +962,336 @@ class WebDAVClient(BaseNextcloudClient): properties=properties, limit=limit, ) + + async def find_by_tag( + self, tag_name: str, scope: str = "", limit: Optional[int] = None + ) -> List[Dict[str, Any]]: + """Find files by tag name. + + DEPRECATED: Use NextcloudClient.find_files_by_tag() instead, which uses + the WebDAV systemtags PROPFIND/REPORT endpoints rather than WebDAV SEARCH. + + Args: + tag_name: Tag to filter by (e.g., "vector-index") + scope: Directory path to search in (empty string for user root) + limit: Maximum number of results to return + + Returns: + List of files/directories with the specified tag + + Examples: + # Find all files tagged with "vector-index" + results = await find_by_tag("vector-index") + + # Find tagged files in a specific folder + results = await find_by_tag("vector-index", scope="Documents") + """ + # Use LIKE for tag matching since tags can be comma-separated + where_conditions = f""" + + + + + %{tag_name}% + + """ + + # Request tag property along with standard properties + properties = [ + "displayname", + "getcontentlength", + "getcontenttype", + "getlastmodified", + "resourcetype", + "getetag", + "fileid", + "tags", + ] + + return await self.search_files( + scope=scope, + where_conditions=where_conditions, + properties=properties, + limit=limit, + ) + + async def _get_file_info_by_id(self, file_id: int) -> Dict[str, Any]: + """Get file information by Nextcloud file ID using WebDAV. + + Args: + file_id: Nextcloud internal file ID + + Returns: + File information dictionary with path, size, content_type, etc. 
+ + Raises: + HTTPStatusError: If file not found or request fails + """ + # Nextcloud allows accessing files by ID via special meta endpoint + meta_path = f"/remote.php/dav/meta/{file_id}/" + + propfind_body = """ + + + + + + + + + + + """ + + headers = {"Depth": "0", "Content-Type": "text/xml", "OCS-APIRequest": "true"} + + response = await self._make_request( + "PROPFIND", meta_path, content=propfind_body, headers=headers + ) + response.raise_for_status() + + # Parse the XML response + root = ET.fromstring(response.content) + responses = root.findall(".//{DAV:}response") + + if not responses: + raise RuntimeError(f"File ID {file_id} not found") + + response_elem = responses[0] + href = response_elem.find(".//{DAV:}href") + if href is None: + raise RuntimeError(f"No href in response for file ID {file_id}") + + propstat = response_elem.find(".//{DAV:}propstat") + if propstat is None: + raise RuntimeError(f"No propstat for file ID {file_id}") + + prop = propstat.find(".//{DAV:}prop") + if prop is None: + raise RuntimeError(f"No prop for file ID {file_id}") + + # Extract file path from displayname or construct from file ID + displayname_elem = prop.find(".//{DAV:}displayname") + name = ( + displayname_elem.text if displayname_elem is not None else f"file_{file_id}" + ) + + # Get file properties + size_elem = prop.find(".//{DAV:}getcontentlength") + size = int(size_elem.text) if size_elem is not None and size_elem.text else 0 + + content_type_elem = prop.find(".//{DAV:}getcontenttype") + content_type = content_type_elem.text if content_type_elem is not None else None + + modified_elem = prop.find(".//{DAV:}getlastmodified") + modified = modified_elem.text if modified_elem is not None else None + + etag_elem = prop.find(".//{DAV:}getetag") + etag = ( + etag_elem.text.strip('"') + if etag_elem is not None and etag_elem.text + else None + ) + + # Check if it's a directory + resourcetype = prop.find(".//{DAV:}resourcetype") + is_directory = ( + resourcetype is not None + 
and resourcetype.find(".//{DAV:}collection") is not None + ) + + # Try to get actual file path - meta endpoint doesn't give us the real path + # so we'll construct a reasonable path from the name + # The calling code in NextcloudClient will have the context to determine the actual path + file_info = { + "name": name, + "path": f"/{name}", # Placeholder - caller should use WebDAV to get real path if needed + "size": size, + "content_type": content_type, + "last_modified": modified, + "etag": etag, + "is_directory": is_directory, + "file_id": file_id, + } + + logger.debug(f"Retrieved file info for ID {file_id}: {name}") + return file_info + + async def get_tag_by_name(self, tag_name: str) -> dict[str, Any] | None: + """Get a system tag by its name via WebDAV. + + Args: + tag_name: Name of the tag to find (case-sensitive) + + Returns: + Tag dictionary if found, None otherwise + """ + # Use WebDAV PROPFIND to list all systemtags + propfind_body = """ + + + + + + + +""" + + response = await self._client.request( + "PROPFIND", + "/remote.php/dav/systemtags/", + headers={"Depth": "1"}, + content=propfind_body, + ) + response.raise_for_status() + + # Parse XML response + root = ET.fromstring(response.content) + ns = { + "d": "DAV:", + "oc": "http://owncloud.org/ns", + } + + for response_elem in root.findall("d:response", ns): + href = response_elem.find("d:href", ns) + if href is None or href.text == "/remote.php/dav/systemtags/": + # Skip the collection itself + continue + + propstat = response_elem.find("d:propstat", ns) + if propstat is None: + continue + + prop = propstat.find("d:prop", ns) + if prop is None: + continue + + # Extract tag properties + tag_id_elem = prop.find("oc:id", ns) + display_name_elem = prop.find("oc:display-name", ns) + user_visible_elem = prop.find("oc:user-visible", ns) + user_assignable_elem = prop.find("oc:user-assignable", ns) + + if display_name_elem is not None and display_name_elem.text == tag_name: + tag_info = { + "id": 
int(tag_id_elem.text) if tag_id_elem is not None else None, + "name": display_name_elem.text, + "userVisible": user_visible_elem.text.lower() == "true" + if user_visible_elem is not None + else True, + "userAssignable": user_assignable_elem.text.lower() == "true" + if user_assignable_elem is not None + else True, + } + logger.debug(f"Found tag '{tag_name}' with ID {tag_info['id']}") + return tag_info + + logger.debug(f"Tag '{tag_name}' not found") + return None + + async def get_files_by_tag(self, tag_id: int) -> list[dict[str, Any]]: + """Get all files tagged with a specific system tag via WebDAV REPORT. + + Args: + tag_id: Numeric ID of the tag + + Returns: + List of file info dictionaries with path, size, content_type, etc. + """ + # Use WebDAV REPORT method with systemtag filter, requesting all properties + report_body = f""" + + + + + + + + + + + {tag_id} + +""" + + response = await self._client.request( + "REPORT", + f"{self._get_webdav_base_path()}/", + content=report_body, + ) + response.raise_for_status() + + # Parse XML response + root = ET.fromstring(response.content) + ns = { + "d": "DAV:", + "oc": "http://owncloud.org/ns", + } + + files = [] + for response_elem in root.findall("d:response", ns): + # Extract href (file path) + href_elem = response_elem.find("d:href", ns) + if href_elem is None or not href_elem.text: + continue + + propstat = response_elem.find("d:propstat", ns) + if propstat is None: + continue + + prop = propstat.find("d:prop", ns) + if prop is None: + continue + + # Extract all properties + fileid_elem = prop.find("oc:fileid", ns) + displayname_elem = prop.find("d:displayname", ns) + contentlength_elem = prop.find("d:getcontentlength", ns) + contenttype_elem = prop.find("d:getcontenttype", ns) + lastmodified_elem = prop.find("d:getlastmodified", ns) + etag_elem = prop.find("d:getetag", ns) + + if fileid_elem is None or not fileid_elem.text: + continue + + # Decode href path and extract the file path + from urllib.parse import unquote 
+ + href_path = unquote(href_elem.text) + # Remove WebDAV prefix to get user-relative path + webdav_prefix = f"/remote.php/dav/files/{self.username}/" + file_path = href_path.replace(webdav_prefix, "/") + + # Parse last modified timestamp + last_modified_timestamp = None + if lastmodified_elem is not None and lastmodified_elem.text: + from email.utils import parsedate_to_datetime + + try: + dt = parsedate_to_datetime(lastmodified_elem.text) + last_modified_timestamp = int(dt.timestamp()) + except Exception: + pass + + file_info = { + "id": int(fileid_elem.text), + "path": file_path, + "name": displayname_elem.text + if displayname_elem is not None + else file_path.split("/")[-1], + "size": int(contentlength_elem.text) + if contentlength_elem is not None and contentlength_elem.text + else 0, + "content_type": contenttype_elem.text + if contenttype_elem is not None + else "", + "last_modified": lastmodified_elem.text + if lastmodified_elem is not None + else None, + "last_modified_timestamp": last_modified_timestamp, + "etag": etag_elem.text if etag_elem is not None else None, + } + files.append(file_info) + + logger.debug(f"Found {len(files)} files with tag ID {tag_id}") + return files diff --git a/nextcloud_mcp_server/config.py b/nextcloud_mcp_server/config.py index b81d86c..d64e6a2 100644 --- a/nextcloud_mcp_server/config.py +++ b/nextcloud_mcp_server/config.py @@ -102,6 +102,14 @@ def get_document_processor_config() -> dict[str, Any]: "lang": os.getenv("TESSERACT_LANG", "eng"), } + # PyMuPDF configuration (local PDF processing) + if os.getenv("ENABLE_PYMUPDF", "true").lower() == "true": # Enabled by default + config["processors"]["pymupdf"] = { + "extract_images": os.getenv("PYMUPDF_EXTRACT_IMAGES", "true").lower() + == "true", + "image_dir": os.getenv("PYMUPDF_IMAGE_DIR"), # None = use temp directory + } + # Custom processor (via HTTP API) if os.getenv("ENABLE_CUSTOM_PROCESSOR", "false").lower() == "true": custom_url = os.getenv("CUSTOM_PROCESSOR_URL") diff 
--git a/nextcloud_mcp_server/document_processors/__init__.py b/nextcloud_mcp_server/document_processors/__init__.py index 9d5636b..1997883 100644 --- a/nextcloud_mcp_server/document_processors/__init__.py +++ b/nextcloud_mcp_server/document_processors/__init__.py @@ -1,12 +1,18 @@ """Document processing plugins for extracting text from various file formats.""" from .base import DocumentProcessor, ProcessingResult, ProcessorError +from .pymupdf import PyMuPDFProcessor from .registry import ProcessorRegistry, get_registry +# Register processors at module initialization +_registry = get_registry() +_registry.register(PyMuPDFProcessor(), priority=10) + __all__ = [ "DocumentProcessor", "ProcessingResult", "ProcessorError", "ProcessorRegistry", "get_registry", + "PyMuPDFProcessor", ] diff --git a/nextcloud_mcp_server/document_processors/pymupdf.py b/nextcloud_mcp_server/document_processors/pymupdf.py new file mode 100644 index 0000000..8d5c7e6 --- /dev/null +++ b/nextcloud_mcp_server/document_processors/pymupdf.py @@ -0,0 +1,285 @@ +"""Document processor using PyMuPDF (fitz) library.""" + +import logging +import pathlib +import tempfile +from collections.abc import Awaitable, Callable +from typing import Any, Optional + +import pymupdf +import pymupdf.layout +import pymupdf4llm + +from .base import DocumentProcessor, ProcessingResult, ProcessorError + +# Activate layout analysis for better text extraction +pymupdf.layout.activate() + +logger = logging.getLogger(__name__) + + +class PyMuPDFProcessor(DocumentProcessor): + """Document processor using PyMuPDF library for PDF processing. + + PyMuPDF (fitz) is a fast, local PDF processing library that extracts text, + metadata, and images without requiring external API calls. 
+ + Features: + - Fast text extraction with layout preservation + - PDF metadata extraction (title, author, creation date, page count) + - Image extraction for future multimodal support + - Page number tracking for precise citations + """ + + SUPPORTED_TYPES = { + "application/pdf", + } + + def __init__( + self, + extract_images: bool = True, + image_dir: Optional[str | pathlib.Path] = None, + ): + """Initialize PyMuPDF processor. + + Args: + extract_images: Whether to extract embedded images from PDFs + image_dir: Directory to store extracted images (defaults to temp directory) + """ + self.extract_images = extract_images + + if image_dir is None: + self.image_dir = pathlib.Path(tempfile.gettempdir()) / "pdf-images" + else: + self.image_dir = pathlib.Path(image_dir) + + # Create image directory if it doesn't exist + if self.extract_images: + self.image_dir.mkdir(exist_ok=True, parents=True) + logger.info( + f"Initialized PyMuPDFProcessor with image extraction to {self.image_dir}" + ) + else: + logger.info("Initialized PyMuPDFProcessor without image extraction") + + @property + def name(self) -> str: + return "pymupdf" + + @property + def supported_mime_types(self) -> set[str]: + return self.SUPPORTED_TYPES + + async def process( + self, + content: bytes, + content_type: str, + filename: Optional[str] = None, + options: Optional[dict[str, Any]] = None, + progress_callback: Optional[ + Callable[[float, Optional[float], Optional[str]], Awaitable[None]] + ] = None, + ) -> ProcessingResult: + """Process a PDF document and extract text, metadata, and images. 
+ + Args: + content: PDF document bytes + content_type: MIME type (should be application/pdf) + filename: Optional filename for better error messages + options: Processing options (currently unused) + progress_callback: Optional callback for progress updates + + Returns: + ProcessingResult with extracted text and metadata + + Raises: + ProcessorError: If PDF processing fails + """ + import anyio + + try: + if progress_callback: + await progress_callback(0, 100, "Processing PDF in background thread") + + # Run CPU-bound PDF processing in thread pool to avoid blocking event loop + result = await anyio.to_thread.run_sync( + self._process_sync, + content, + filename, + ) + + if progress_callback: + await progress_callback(100, 100, "Processing complete") + + return result + + except Exception as e: + error_msg = f"Failed to process PDF {filename or ''}: {e}" + logger.error(error_msg, exc_info=True) + raise ProcessorError(error_msg) from e + + def _process_sync( + self, + content: bytes, + filename: Optional[str] = None, + ) -> ProcessingResult: + """Synchronous PDF processing (runs in thread pool). + + Args: + content: PDF document bytes + filename: Optional filename for better error messages + + Returns: + ProcessingResult with extracted text and metadata + + Raises: + Exception: If PDF processing fails + """ + # Open PDF from bytes + doc = pymupdf.open("pdf", content) + + # Extract metadata from PDF + metadata = self._extract_metadata(doc, filename) + + # Add file size to metadata + metadata["file_size"] = len(content) + + # Extract text page-by-page to preserve page boundaries + # pymupdf.layout.activate() causes page_chunks=True to return a string, + # so we manually extract text per page instead. 
+ page_boundaries = [] + current_offset = 0 + full_text_parts = [] + image_paths = [] + + for page_num in range(doc.page_count): + if self.extract_images: + # Generate unique directory for this PDF's images + pdf_id = filename.replace("/", "_") if filename else "unknown" + pdf_image_dir = self.image_dir / pdf_id + pdf_image_dir.mkdir(exist_ok=True, parents=True) + + # Extract page as markdown with images + page_md = pymupdf4llm.to_markdown( + doc, + pages=[page_num], # Extract single page + write_images=True, + image_path=pdf_image_dir, + page_chunks=False, # Single page, no chunking needed + ) + + # Collect image paths + if pdf_image_dir.exists(): + page_images = [str(p) for p in pdf_image_dir.glob("*")] + image_paths.extend(page_images) + else: + # Extract page as markdown without images + page_md = pymupdf4llm.to_markdown( + doc, + pages=[page_num], # Extract single page + write_images=False, + page_chunks=False, # Single page, no chunking needed + ) + + # Store page text + full_text_parts.append(page_md) + + # Store boundary info: {page (1-indexed), start, end} + page_boundaries.append( + { + "page": page_num + 1, # Convert to 1-indexed + "start_offset": current_offset, + "end_offset": current_offset + len(page_md), + } + ) + + current_offset += len(page_md) + + # Join all page texts + md_text = "".join(full_text_parts) + + # Store image metadata + metadata["has_images"] = len(image_paths) > 0 + if image_paths: + metadata["image_count"] = len(image_paths) + metadata["image_paths"] = image_paths + + # Add page boundaries to metadata for chunker to use + metadata["page_boundaries"] = page_boundaries + + # Close the document + doc.close() + + logger.info( + f"Successfully processed PDF {filename or ''}: " + f"{metadata['page_count']} pages, {len(md_text)} chars, " + f"{metadata.get('image_count', 0)} images" + ) + + return ProcessingResult( + text=md_text, + metadata=metadata, + processor=self.name, + success=True, + ) + + def _extract_metadata( + self, doc: 
pymupdf.Document, filename: Optional[str] + ) -> dict[str, Any]: + """Extract metadata from PDF document. + + Args: + doc: Opened PyMuPDF document + filename: Optional filename + + Returns: + Dictionary with PDF metadata + """ + metadata: dict[str, Any] = {} + + # Basic document info + metadata["page_count"] = doc.page_count + metadata["format"] = "PDF 1." + str( + doc.pdf_version() if hasattr(doc, "pdf_version") else "?" + ) + + if filename: + metadata["filename"] = filename + + # Extract PDF metadata dictionary + pdf_metadata = doc.metadata + if pdf_metadata: + # Standard PDF metadata fields + if pdf_metadata.get("title"): + metadata["title"] = pdf_metadata["title"] + if pdf_metadata.get("author"): + metadata["author"] = pdf_metadata["author"] + if pdf_metadata.get("subject"): + metadata["subject"] = pdf_metadata["subject"] + if pdf_metadata.get("keywords"): + metadata["keywords"] = pdf_metadata["keywords"] + if pdf_metadata.get("creator"): + metadata["creator"] = pdf_metadata["creator"] + if pdf_metadata.get("producer"): + metadata["producer"] = pdf_metadata["producer"] + if pdf_metadata.get("creationDate"): + metadata["creation_date"] = pdf_metadata["creationDate"] + if pdf_metadata.get("modDate"): + metadata["modification_date"] = pdf_metadata["modDate"] + + return metadata + + async def health_check(self) -> bool: + """Check if PyMuPDF is available and working. 
+ + Returns: + True if processor is ready to use + """ + try: + # Try to create a simple PDF in memory + test_doc = pymupdf.open() + test_doc.close() + return True + except Exception as e: + logger.error(f"PyMuPDF health check failed: {e}") + return False diff --git a/nextcloud_mcp_server/embedding/bm25_provider.py b/nextcloud_mcp_server/embedding/bm25_provider.py index d0f088a..df20e07 100644 --- a/nextcloud_mcp_server/embedding/bm25_provider.py +++ b/nextcloud_mcp_server/embedding/bm25_provider.py @@ -53,7 +53,7 @@ class BM25SparseEmbeddingProvider: "values": sparse_embedding.values.tolist(), } - def encode_batch(self, texts: list[str]) -> list[dict[str, Any]]: + async def encode_batch(self, texts: list[str]) -> list[dict[str, Any]]: """ Generate BM25 sparse embeddings for multiple texts (batched). @@ -63,7 +63,12 @@ class BM25SparseEmbeddingProvider: Returns: List of dictionaries with 'indices' and 'values' for each text """ - sparse_embeddings = list(self.model.embed(texts)) + import anyio + + # Run CPU-bound BM25 encoding in thread pool to avoid blocking event loop + sparse_embeddings = await anyio.to_thread.run_sync( + lambda: list(self.model.embed(texts)) + ) return [ { diff --git a/nextcloud_mcp_server/search/bm25_hybrid.py b/nextcloud_mcp_server/search/bm25_hybrid.py index bdd3446..5acb861 100644 --- a/nextcloud_mcp_server/search/bm25_hybrid.py +++ b/nextcloud_mcp_server/search/bm25_hybrid.py @@ -181,7 +181,8 @@ class BM25HybridSearchAlgorithm(SearchAlgorithm): results = [] for result in search_response.points: - doc_id = int(result.payload["doc_id"]) + # doc_id can be int (notes) or str (files - file paths) + doc_id = result.payload["doc_id"] doc_type = result.payload.get("doc_type", "note") doc_key = (doc_id, doc_type) diff --git a/nextcloud_mcp_server/search/semantic.py b/nextcloud_mcp_server/search/semantic.py index 89e9921..133a0cd 100644 --- a/nextcloud_mcp_server/search/semantic.py +++ b/nextcloud_mcp_server/search/semantic.py @@ -128,7 +128,8 @@ 
class SemanticSearchAlgorithm(SearchAlgorithm): results = [] for result in search_response.points: - doc_id = int(result.payload["doc_id"]) + # doc_id can be int (notes) or str (files - file paths) + doc_id = result.payload["doc_id"] doc_type = result.payload.get("doc_type", "note") doc_key = (doc_id, doc_type) diff --git a/nextcloud_mcp_server/vector/document_chunker.py b/nextcloud_mcp_server/vector/document_chunker.py index b2c1c3d..56a5605 100644 --- a/nextcloud_mcp_server/vector/document_chunker.py +++ b/nextcloud_mcp_server/vector/document_chunker.py @@ -15,6 +15,8 @@ class ChunkWithPosition: text: str start_offset: int # Character position where chunk starts end_offset: int # Character position where chunk ends (exclusive) + page_number: int | None = None # Page number for PDF chunks (optional) + metadata: dict | None = None # Additional processor-specific metadata (optional) class DocumentChunker: @@ -50,7 +52,7 @@ class DocumentChunker: strip_whitespace=True, ) - def chunk_text(self, content: str) -> list[ChunkWithPosition]: + async def chunk_text(self, content: str) -> list[ChunkWithPosition]: """ Split text into overlapping chunks with position tracking. 
@@ -66,12 +68,17 @@ class DocumentChunker: Returns: List of chunks with their character positions in the original content """ + import anyio + # Handle empty content - return single empty chunk for backward compatibility if not content: return [ChunkWithPosition(text="", start_offset=0, end_offset=0)] - # Use LangChain to create documents with position tracking - docs = self.splitter.create_documents([content]) + # Run CPU-bound text splitting in thread pool to avoid blocking event loop + docs = await anyio.to_thread.run_sync( + self.splitter.create_documents, + [content], + ) # Convert LangChain Documents to ChunkWithPosition objects chunks = [ diff --git a/nextcloud_mcp_server/vector/processor.py b/nextcloud_mcp_server/vector/processor.py index ba32135..a4fe100 100644 --- a/nextcloud_mcp_server/vector/processor.py +++ b/nextcloud_mcp_server/vector/processor.py @@ -29,6 +29,43 @@ from nextcloud_mcp_server.vector.scanner import DocumentTask logger = logging.getLogger(__name__) +def assign_page_numbers(chunks, page_boundaries): + """Assign page numbers to chunks based on page boundaries. + + Each chunk gets the page number where most of its content appears. + For chunks spanning multiple pages, assigns the page containing the + majority of the chunk's characters. 
+ + Args: + chunks: List of ChunkWithPosition objects + page_boundaries: List of dicts with {page, start_offset, end_offset} + + Returns: + None (modifies chunks in place) + """ + if not page_boundaries: + return + + for chunk in chunks: + # Find which page(s) this chunk overlaps with + max_overlap = 0 + assigned_page = None + + for boundary in page_boundaries: + # Calculate overlap between chunk and page + overlap_start = max(chunk.start_offset, boundary["start_offset"]) + overlap_end = min(chunk.end_offset, boundary["end_offset"]) + overlap = max(0, overlap_end - overlap_start) + + # Assign to page with maximum overlap + if overlap > max_overlap: + max_overlap = overlap + assigned_page = boundary["page"] + + if assigned_page is not None: + chunk.page_number = assigned_page + + async def processor_task( worker_id: int, receive_stream: MemoryObjectReceiveStream[DocumentTask], @@ -223,6 +260,32 @@ async def _index_document( content = f"{document['title']}\n\n{document['content']}" title = document["title"] etag = document.get("etag", "") + file_metadata = {} # No file-specific metadata for notes + elif doc_task.doc_type == "file": + # For files, doc_id is the file path + file_path = doc_task.doc_id + + # Read file content via WebDAV + content_bytes, content_type = await nc_client.webdav.read_file(file_path) + + # Use document processor registry to extract text + from nextcloud_mcp_server.document_processors import get_registry + + registry = get_registry() + + try: + result = await registry.process( + content=content_bytes, + content_type=content_type, + filename=file_path, + ) + content = result.text + file_metadata = result.metadata + title = file_metadata.get("title") or file_path.split("/")[-1] + etag = "" # WebDAV read_file doesn't return etag + except Exception as e: + logger.error(f"Failed to process file {file_path}: {e}") + raise else: raise ValueError(f"Unsupported doc_type: {doc_task.doc_type}") @@ -231,7 +294,11 @@ async def _index_document( 
chunk_size=settings.document_chunk_size, overlap=settings.document_chunk_overlap, ) - chunks = chunker.chunk_text(content) + chunks = await chunker.chunk_text(content) + + # Assign page numbers to chunks if page boundaries are available (PDFs) + if doc_task.doc_type == "file" and "page_boundaries" in file_metadata: + assign_page_numbers(chunks, file_metadata["page_boundaries"]) # Extract chunk texts for embedding chunk_texts = [chunk.text for chunk in chunks] @@ -242,7 +309,7 @@ async def _index_document( # Generate sparse embeddings (BM25 for keyword matching) bm25_service = get_bm25_service() - sparse_embeddings = bm25_service.encode_batch(chunk_texts) + sparse_embeddings = await bm25_service.encode_batch(chunk_texts) # Prepare Qdrant points indexed_at = int(time.time()) @@ -277,6 +344,22 @@ async def _index_document( "chunk_start_offset": chunk.start_offset, "chunk_end_offset": chunk.end_offset, "metadata_version": 2, # v2 includes position metadata + # File-specific metadata (PDF, etc.) 
+ **( + { + "file_path": doc_task.doc_id, + "mime_type": file_metadata.get("content_type", ""), + "file_size": file_metadata.get("file_size"), + "page_number": chunk.page_number, + "page_count": file_metadata.get("page_count"), + "author": file_metadata.get("author"), + "creation_date": file_metadata.get("creation_date"), + "has_images": file_metadata.get("has_images", False), + "image_count": file_metadata.get("image_count", 0), + } + if doc_task.doc_type == "file" + else {} + ), }, ) ) diff --git a/nextcloud_mcp_server/vector/scanner.py b/nextcloud_mcp_server/vector/scanner.py index 9d45a8f..3a1db66 100644 --- a/nextcloud_mcp_server/vector/scanner.py +++ b/nextcloud_mcp_server/vector/scanner.py @@ -4,6 +4,7 @@ Periodically scans enabled users' content and queues changed documents for proce """ import logging +import os import time from dataclasses import dataclass @@ -309,7 +310,148 @@ async def scan_user_documents( ) _potentially_deleted[doc_key] = current_time + # Scan tagged PDF files (after notes) + # Get indexed files from Qdrant (separate query for doc_type="file") + indexed_files = {} + if not initial_sync: + file_scroll_result = await qdrant_client.scroll( + collection_name=settings.get_collection_name(), + scroll_filter=Filter( + must=[ + FieldCondition(key="user_id", match=MatchValue(value=user_id)), + FieldCondition(key="doc_type", match=MatchValue(value="file")), + ] + ), + limit=10000, # Reasonable limit for file count + with_payload=["doc_id", "indexed_at"], + with_vectors=False, + ) + + indexed_files = { + point.payload["doc_id"]: point.payload["indexed_at"] + for point in file_scroll_result[0] + } + + logger.debug(f"Found {len(indexed_files)} indexed files in Qdrant") + + # Scan for tagged PDF files + file_count = 0 + file_queued = 0 + nextcloud_file_paths = set() + + try: + # Find files with vector-index tag using OCS Tags API + settings = get_settings() + tag_name = os.getenv("VECTOR_SYNC_PDF_TAG", "vector-index") + # Use 
NextcloudClient.find_files_by_tag() which uses proper OCS API + # and filters by PDF MIME type + tagged_files = await nc_client.find_files_by_tag( + tag_name, mime_type_filter="application/pdf" + ) + + for file_info in tagged_files: + # Files are already filtered by MIME type in find_files_by_tag() + file_count += 1 + file_path = file_info["path"] + nextcloud_file_paths.add(file_path) + + # Use last_modified timestamp if available, otherwise use current time + modified_at = file_info.get("last_modified_timestamp", int(time.time())) + if isinstance(file_info.get("last_modified"), str): + # Parse RFC 2822 date format if needed + from email.utils import parsedate_to_datetime + + try: + dt = parsedate_to_datetime(file_info["last_modified"]) + modified_at = int(dt.timestamp()) + except (ValueError, KeyError): + pass + + if initial_sync: + # Send everything on first sync + await send_stream.send( + DocumentTask( + user_id=user_id, + doc_id=file_path, + doc_type="file", + operation="index", + modified_at=modified_at, + ) + ) + file_queued += 1 + else: + # Incremental sync: compare with indexed state + indexed_at = indexed_files.get(file_path) + + # If file reappeared, remove from potentially_deleted + file_key = (user_id, file_path) + if file_key in _potentially_deleted: + logger.debug( + f"File {file_path} reappeared, removing from deletion grace period" + ) + del _potentially_deleted[file_key] + + # Send if never indexed or modified since last index + if indexed_at is None or modified_at > indexed_at: + await send_stream.send( + DocumentTask( + user_id=user_id, + doc_id=file_path, + doc_type="file", + operation="index", + modified_at=modified_at, + ) + ) + file_queued += 1 + + logger.info( + f"[SCAN-{scan_id}] Found {file_count} tagged PDFs for {user_id}" + ) + record_vector_sync_scan(file_count) + + # Check for deleted files (not initial sync) + if not initial_sync: + for file_path in indexed_files: + if file_path not in nextcloud_file_paths: + file_key = (user_id, 
file_path) + + if file_key in _potentially_deleted: + # Check if grace period elapsed + first_missing_time = _potentially_deleted[file_key] + time_missing = current_time - first_missing_time + + if time_missing >= grace_period: + # Grace period elapsed, send for deletion + logger.info( + f"File {file_path} missing for {time_missing:.1f}s " + f"(>{grace_period:.1f}s grace period), sending deletion" + ) + await send_stream.send( + DocumentTask( + user_id=user_id, + doc_id=file_path, + doc_type="file", + operation="delete", + modified_at=0, + ) + ) + file_queued += 1 + del _potentially_deleted[file_key] + else: + # First time missing, add to grace period tracking + logger.debug( + f"File {file_path} missing for first time, starting grace period" + ) + _potentially_deleted[file_key] = current_time + + except Exception as e: + logger.warning(f"Failed to scan tagged files for {user_id}: {e}") + + queued += file_queued + if queued > 0: - logger.info(f"Sent {queued} documents for incremental sync: {user_id}") + logger.info( + f"Sent {queued} documents ({file_queued} files) for incremental sync: {user_id}" + ) else: logger.debug(f"No changes detected for {user_id}") diff --git a/pyproject.toml b/pyproject.toml index f4bffd1..5481b0e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,6 +36,9 @@ dependencies = [ "python-json-logger>=3.2.0", # Structured JSON logging "jinja2>=3.1.6", "langchain-text-splitters>=1.0.0", + "pymupdf>=1.26.6", + "pymupdf4llm>=0.2.2", + "pymupdf-layout>=1.26.6", ] classifiers = [ "Development Status :: 4 - Beta", diff --git a/tests/integration/test_pdf_indexing.py b/tests/integration/test_pdf_indexing.py new file mode 100644 index 0000000..821ae37 --- /dev/null +++ b/tests/integration/test_pdf_indexing.py @@ -0,0 +1,361 @@ +"""Integration tests for PDF document indexing and semantic search. + +These tests validate the complete PDF processing flow: +1. Process PDF with PyMuPDFProcessor +2. Chunk extracted text with page numbers +3. 
def create_test_pdf() -> bytes:
    """Create a small three-page test PDF and return it as bytes.

    The pages hold an introduction, an installation chapter and a
    configuration chapter, so tests can search for page-specific content.

    Returns:
        The serialised PDF document.
    """
    page_texts = (
        # Page 1: Introduction
        "Nextcloud Administration Guide\n\n"
        "Chapter 1: Introduction\n\n"
        "Nextcloud is a self-hosted file sharing and collaboration platform. "
        "It provides secure file storage, sharing, and synchronization across devices. "
        "This guide covers installation, configuration, and maintenance of Nextcloud.",
        # Page 2: Installation
        "Chapter 2: Installation\n\n"
        "System Requirements:\n"
        "- PHP 8.0 or higher\n"
        "- MySQL 8.0 or MariaDB 10.5\n"
        "- Apache or Nginx web server\n\n"
        "Installation steps:\n"
        "1. Download Nextcloud package\n"
        "2. Extract to web server directory\n"
        "3. Configure database connection\n"
        "4. Run installation wizard",
        # Page 3: Configuration
        "Chapter 3: Configuration\n\n"
        "Database Configuration:\n"
        "Edit config/config.php to set database parameters. "
        "Configure database host, username, password, and database name. "
        "For optimal performance, use MySQL or MariaDB.\n\n"
        "Security Settings:\n"
        "Enable HTTPS, configure trusted domains, and set up firewall rules.",
    )

    doc = pymupdf.open()
    try:
        for text in page_texts:
            page = doc.new_page(width=595, height=842)  # A4 size
            page.insert_text((50, 50), text)

        # Convert to bytes
        return doc.tobytes()
    finally:
        # Release the document even when building or serialising it fails.
        doc.close()
async def test_document_chunker_preserves_page_numbers():
    """Test that ChunkWithPosition keeps the page number it is built with.

    The chunks are constructed by hand (the chunker itself is not invoked).
    Offsets are derived from the text length so that ``end_offset`` is a
    correct exclusive end position for every chunk.
    """
    texts = [
        "Chapter 1 content on page 1",
        "Chapter 2 content on page 2",
        "Chapter 3 content on page 3",
    ]

    # Create chunks with page numbers
    chunks = []
    offset = 0
    for page_number, text in enumerate(texts, start=1):
        chunks.append(
            ChunkWithPosition(
                text=text,
                start_offset=offset,
                end_offset=offset + len(text),
                page_number=page_number,
            )
        )
        # Leave one separator character between consecutive chunks.
        offset += len(text) + 1

    # Verify page numbers are preserved
    assert [chunk.page_number for chunk in chunks] == [1, 2, 3]

    # Verify the offsets describe exactly the chunk text (end is exclusive)
    for chunk in chunks:
        assert chunk.end_offset - chunk.start_offset == len(chunk.text)
simple_embedding_provider.embed(chunk_text) + + # Simulate page number assignment (in real implementation, this would be tracked) + # For simplicity, assign page based on content + page_number = 1 + if "Chapter 2" in chunk_text or "Installation" in chunk_text: + page_number = 2 + elif "Chapter 3" in chunk_text or "Configuration" in chunk_text: + page_number = 3 + + points.append( + PointStruct( + id=idx, + vector=embedding, + payload={ + "user_id": "admin", + "doc_id": "/Documents/admin-guide.pdf", + "doc_type": "file", + "title": "Nextcloud Administration Guide", + "file_path": "/Documents/admin-guide.pdf", + "mime_type": "application/pdf", + "page_number": page_number, + "page_count": result.metadata["page_count"], + "chunk_index": idx, + "excerpt": chunk_text[:200], + }, + ) + ) + + await qdrant_test_client.upsert( + collection_name=test_collection, points=points, wait=True + ) + + # Step 4: Perform semantic search for installation instructions + query = "how to install Nextcloud system requirements" + query_embedding = await simple_embedding_provider.embed(query) + + response = await qdrant_test_client.query_points( + collection_name=test_collection, + query=query_embedding, + limit=3, + score_threshold=0.0, + ) + + # Verify search results + assert len(response.points) > 0 + + # Top result should be from installation chapter (page 2) + top_result = response.points[0] + assert top_result.payload["doc_type"] == "file" + assert top_result.payload["file_path"] == "/Documents/admin-guide.pdf" + assert ( + "Installation" in top_result.payload["excerpt"] + or top_result.payload["page_number"] == 2 + ) + + # Verify page number is preserved + assert top_result.payload["page_number"] in [1, 2, 3] + assert top_result.payload["page_count"] == 3 + + # Step 5: Search for configuration + query = "database configuration settings MySQL" + query_embedding = await simple_embedding_provider.embed(query) + + response = await qdrant_test_client.query_points( + 
async def test_pdf_search_with_filters(
    pymupdf_processor: PyMuPDFProcessor,
    qdrant_test_client: AsyncQdrantClient,
    test_collection: str,
    simple_embedding_provider: SimpleEmbeddingProvider,
):
    """Test PDF search with metadata filters.

    Indexes the chunks of the test PDF as ``doc_type="file"`` points plus one
    ``doc_type="note"`` decoy whose text equals the query, then checks that a
    ``doc_type`` filter only returns the file points.
    """
    from qdrant_client.models import FieldCondition, Filter, MatchValue

    query = "Nextcloud installation"

    # Process and index PDF
    pdf_bytes = create_test_pdf()
    result = await pymupdf_processor.process(
        content=pdf_bytes,
        content_type="application/pdf",
        filename="/Documents/admin-guide.pdf",
    )
    assert result.success is True

    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    chunks = splitter.split_text(result.text)

    # Index with metadata
    points = []
    for idx, chunk_text in enumerate(chunks):
        embedding = await simple_embedding_provider.embed(chunk_text)

        points.append(
            PointStruct(
                id=idx,
                vector=embedding,
                payload={
                    "user_id": "admin",
                    "doc_id": "/Documents/admin-guide.pdf",
                    "doc_type": "file",
                    "mime_type": "application/pdf",
                    "excerpt": chunk_text[:200],
                },
            )
        )

    # Decoy note that matches the query text exactly: without a working
    # filter it would be returned, so the assertions below become meaningful.
    decoy_id = len(chunks)
    points.append(
        PointStruct(
            id=decoy_id,
            vector=await simple_embedding_provider.embed(query),
            payload={
                "user_id": "admin",
                "doc_id": 1,
                "doc_type": "note",
                "excerpt": query,
            },
        )
    )

    await qdrant_test_client.upsert(
        collection_name=test_collection, points=points, wait=True
    )

    # Search with filter for PDFs only
    query_embedding = await simple_embedding_provider.embed(query)

    response = await qdrant_test_client.query_points(
        collection_name=test_collection,
        query=query_embedding,
        query_filter=Filter(
            must=[FieldCondition(key="doc_type", match=MatchValue(value="file"))]
        ),
        limit=3,
    )

    # All results should be from file documents
    assert len(response.points) > 0
    for point in response.points:
        assert point.id != decoy_id
        assert point.payload["doc_type"] == "file"
        assert point.payload["mime_type"] == "application/pdf"
) - chunks = chunker.chunk_text(content) + chunks = await chunker.chunk_text(content) # Verify we got multiple chunks assert len(chunks) > 1 @@ -61,12 +61,12 @@ class TestDocumentChunkerPositions: extracted = content[chunk.start_offset : chunk.end_offset] assert extracted == chunk.text - def test_chunk_positions_with_whitespace(self): + async def test_chunk_positions_with_whitespace(self): """Test position tracking with various whitespace.""" chunker = DocumentChunker(chunk_size=30, overlap=5) content = "First sentence here. Second sentence.\n\nThird sentence.\tFourth sentence." - chunks = chunker.chunk_text(content) + chunks = await chunker.chunk_text(content) # Verify positions correctly handle whitespace for chunk in chunks: @@ -75,19 +75,19 @@ class TestDocumentChunkerPositions: # LangChain strips whitespace by default assert len(chunk.text.strip()) > 0 - def test_empty_content(self): + async def test_empty_content(self): """Test that empty content returns empty chunk.""" chunker = DocumentChunker(chunk_size=2048, overlap=200) content = "" - chunks = chunker.chunk_text(content) + chunks = await chunker.chunk_text(content) assert len(chunks) == 1 assert chunks[0].text == "" assert chunks[0].start_offset == 0 assert chunks[0].end_offset == 0 - def test_chunk_overlap_positions(self): + async def test_chunk_overlap_positions(self): """Test that overlapping chunks have correct positions.""" chunker = DocumentChunker(chunk_size=50, overlap=15) content = ( @@ -97,7 +97,7 @@ class TestDocumentChunkerPositions: "This is sentence four adding details." 
) - chunks = chunker.chunk_text(content) + chunks = await chunker.chunk_text(content) # Verify overlap exists if we have multiple chunks if len(chunks) > 1: @@ -112,14 +112,14 @@ class TestDocumentChunkerPositions: # With overlap, next chunk may start before current ends assert next_chunk.start_offset <= current_chunk.end_offset - def test_unicode_content_positions(self): + async def test_unicode_content_positions(self): """Test position tracking with Unicode characters.""" chunker = DocumentChunker(chunk_size=50, overlap=10) content = ( "Hello 世界. こんにちは there. мир Привет world. שלום مرحبا 你好 friend." ) - chunks = chunker.chunk_text(content) + chunks = await chunker.chunk_text(content) # Verify all chunks extract correctly for chunk in chunks: @@ -131,7 +131,7 @@ class TestDocumentChunkerPositions: assert chunks[0].start_offset == 0 assert chunks[0].end_offset == len(content) - def test_realistic_note_content(self): + async def test_realistic_note_content(self): """Test with realistic note content similar to Nextcloud Notes.""" chunker = DocumentChunker(chunk_size=200, overlap=50) content = """My Project Notes @@ -152,7 +152,7 @@ position tracking for each chunk. This allows us to highlight the exact chunk that matched a search query, which builds trust in the RAG system.""" - chunks = chunker.chunk_text(content) + chunks = await chunker.chunk_text(content) # Should have multiple chunks assert len(chunks) > 1 @@ -168,7 +168,7 @@ which builds trust in the RAG system.""" assert chunk.end_offset <= len(content) assert chunk.start_offset < chunk.end_offset - def test_semantic_boundary_preservation(self): + async def test_semantic_boundary_preservation(self): """Test that LangChain creates semantically coherent chunks.""" chunker = DocumentChunker(chunk_size=100, overlap=20) content = ( @@ -178,7 +178,7 @@ which builds trust in the RAG system.""" "Fourth sentence ends." 
) - chunks = chunker.chunk_text(content) + chunks = await chunker.chunk_text(content) # Verify all chunks are extractable using their positions for chunk in chunks: @@ -193,7 +193,7 @@ which builds trust in the RAG system.""" assert chunk.end_offset <= len(content) assert chunk.start_offset < chunk.end_offset - def test_paragraph_boundary_preservation(self): + async def test_paragraph_boundary_preservation(self): """Test that LangChain preserves paragraph boundaries.""" chunker = DocumentChunker(chunk_size=80, overlap=15) content = """First paragraph here. @@ -204,7 +204,7 @@ Third paragraph here. Fourth paragraph here.""" - chunks = chunker.chunk_text(content) + chunks = await chunker.chunk_text(content) # LangChain should prefer splitting at paragraph boundaries (\n\n) # Verify we got multiple chunks @@ -215,7 +215,7 @@ Fourth paragraph here.""" extracted = content[chunk.start_offset : chunk.end_offset] assert extracted == chunk.text - def test_default_parameters(self): + async def test_default_parameters(self): """Test that default parameters work correctly.""" chunker = DocumentChunker() # Use defaults: 2048 chars, 200 overlap @@ -224,14 +224,14 @@ Fourth paragraph here.""" "This is a short note with a few sentences. It should fit in one chunk." 
) - chunks = chunker.chunk_text(content) + chunks = await chunker.chunk_text(content) assert len(chunks) == 1 assert chunks[0].text == content assert chunks[0].start_offset == 0 assert chunks[0].end_offset == len(content) - def test_large_document_chunking(self): + async def test_large_document_chunking(self): """Test chunking of a large document.""" chunker = DocumentChunker(chunk_size=100, overlap=20) @@ -244,7 +244,7 @@ Fourth paragraph here.""" ] content = "\n\n".join(paragraphs) - chunks = chunker.chunk_text(content) + chunks = await chunker.chunk_text(content) # Should create multiple chunks assert len(chunks) > 1 @@ -261,12 +261,12 @@ Fourth paragraph here.""" assert chunks[0].start_offset == 0 assert chunks[-1].end_offset == len(content) - def test_position_tracking_with_overlap(self): + async def test_position_tracking_with_overlap(self): """Test that position tracking works correctly with overlap.""" chunker = DocumentChunker(chunk_size=50, overlap=15) content = "A" * 25 + ". " + "B" * 25 + ". " + "C" * 25 + ". " + "D" * 25 + "." 
- chunks = chunker.chunk_text(content) + chunks = await chunker.chunk_text(content) if len(chunks) > 1: # Verify overlap creates correct positions diff --git a/uv.lock b/uv.lock index ef66995..75307d2 100644 --- a/uv.lock +++ b/uv.lock @@ -1925,6 +1925,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/6c/28/dd72947e59a6a8c856448a5e74da6201cb5502ddff644fbc790e4bd40b9a/multiprocess-0.70.18-py39-none-any.whl", hash = "sha256:e78ca805a72b1b810c690b6b4cc32579eba34f403094bbbae962b7b5bf9dfcb8", size = 133478, upload-time = "2025-04-17T03:11:26.253Z" }, ] +[[package]] +name = "networkx" +version = "3.5" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/6c/4f/ccdb8ad3a38e583f214547fd2f7ff1fc160c43a75af88e6aec213404b96a/networkx-3.5.tar.gz", hash = "sha256:d4c6f9cf81f52d69230866796b82afbccdec3db7ae4fbd1b65ea750feed50037", size = 2471065, upload-time = "2025-05-29T11:35:07.804Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/eb/8d/776adee7bbf76365fdd7f2552710282c79a4ead5d2a46408c9043a2b70ba/networkx-3.5-py3-none-any.whl", hash = "sha256:0030d386a9a06dee3565298b4a734b68589749a544acbb6c412dc9e2489ec6ec", size = 2034406, upload-time = "2025-05-29T11:35:04.961Z" }, +] + [[package]] name = "nextcloud-mcp-server" version = "0.44.0" @@ -1952,6 +1961,9 @@ dependencies = [ { name = "prometheus-client" }, { name = "pydantic" }, { name = "pyjwt", extra = ["crypto"] }, + { name = "pymupdf" }, + { name = "pymupdf-layout" }, + { name = "pymupdf4llm" }, { name = "python-json-logger" }, { name = "pythonvcard4" }, { name = "qdrant-client" }, @@ -1997,6 +2009,9 @@ requires-dist = [ { name = "prometheus-client", specifier = ">=0.21.0" }, { name = "pydantic", specifier = ">=2.11.4" }, { name = "pyjwt", extras = ["crypto"], specifier = ">=2.8.0" }, + { name = "pymupdf", specifier = ">=1.26.6" }, + { name = "pymupdf-layout", specifier = ">=1.26.6" }, + { name = "pymupdf4llm", specifier = ">=0.2.2" }, { 
name = "python-json-logger", specifier = ">=3.2.0" }, { name = "pythonvcard4", specifier = ">=0.2.0" }, { name = "qdrant-client", specifier = ">=1.7.0" }, @@ -2969,6 +2984,52 @@ crypto = [ { name = "cryptography" }, ] +[[package]] +name = "pymupdf" +version = "1.26.6" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ec/d7/a6f0e03a117fa2ad79c4b898203bb212b17804f92558a6a339298faca7bb/pymupdf-1.26.6.tar.gz", hash = "sha256:a2b4531cd4ab36d6f1f794bb6d3c33b49bda22f36d58bb1f3e81cbc10183bd2b", size = 84322494, upload-time = "2025-11-05T15:20:46.786Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/9e/5c/dec354eee5fe4966c715f33818ed4193e0e6c986cf8484de35b6c167fb8e/pymupdf-1.26.6-cp310-abi3-macosx_10_9_x86_64.whl", hash = "sha256:e46f320a136ad55e5219e8f0f4061bdf3e4c12b126d2740d5a49f73fae7ea176", size = 23178988, upload-time = "2025-11-05T14:31:19.834Z" }, + { url = "https://files.pythonhosted.org/packages/ec/a0/11adb742d18142bd623556cd3b5d64649816decc5eafd30efc9498657e76/pymupdf-1.26.6-cp310-abi3-macosx_11_0_arm64.whl", hash = "sha256:6844cd2396553c0fa06de4869d5d5ecb1260e6fc3b9d85abe8fa35f14dd9d688", size = 22469764, upload-time = "2025-11-05T14:32:34.654Z" }, + { url = "https://files.pythonhosted.org/packages/e4/c8/377cf20e31f58d4c243bfcf2d3cb7466d5b97003b10b9f1161f11eb4a994/pymupdf-1.26.6-cp310-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:617ba69e02c44f0da1c0e039ea4a26cf630849fd570e169c71daeb8ac52a81d6", size = 23502227, upload-time = "2025-11-06T11:03:56.934Z" }, + { url = "https://files.pythonhosted.org/packages/4f/bf/6e02e3d84b32c137c71a0a3dcdba8f2f6e9950619a3bc272245c7c06a051/pymupdf-1.26.6-cp310-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:7777d0b7124c2ebc94849536b6a1fb85d158df3b9d873935e63036559391534c", size = 24115381, upload-time = "2025-11-05T14:33:54.338Z" }, + { url = 
"https://files.pythonhosted.org/packages/ab/9d/30f7fcb3776bfedde66c06297960debe4883b1667294a1ee9426c942e94d/pymupdf-1.26.6-cp310-abi3-win32.whl", hash = "sha256:8f3ef05befc90ca6bb0f12983200a7048d5bff3e1c1edef1bb3de60b32cb5274", size = 17203613, upload-time = "2025-11-05T17:19:47.494Z" }, + { url = "https://files.pythonhosted.org/packages/f9/e8/989f4eaa369c7166dc24f0eaa3023f13788c40ff1b96701f7047421554a8/pymupdf-1.26.6-cp310-abi3-win_amd64.whl", hash = "sha256:ce02ca96ed0d1acfd00331a4d41a34c98584d034155b06fd4ec0f051718de7ba", size = 18405680, upload-time = "2025-11-05T14:34:48.672Z" }, +] + +[[package]] +name = "pymupdf-layout" +version = "1.26.6" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "networkx" }, + { name = "numpy" }, + { name = "onnxruntime" }, + { name = "pymupdf" }, + { name = "pyyaml" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/70/86/31f8d05b36ebf43cca88d5c6415de46eb748e487b618a589671a610be8c8/pymupdf_layout-1.26.6-cp310-abi3-macosx_10_9_x86_64.whl", hash = "sha256:d632f83208db8b24600eb8ac54d3135fab6ab1f251a38fa6061e7470e81b9481", size = 12727222, upload-time = "2025-11-05T14:35:44.367Z" }, + { url = "https://files.pythonhosted.org/packages/ff/d3/0e52d7d1e2f975843f5354ac3b210a98471b690105efc332d3c285bd794b/pymupdf_layout-1.26.6-cp310-abi3-macosx_11_0_arm64.whl", hash = "sha256:f1d45f72ec08ef7f644928487e7a067df6df63172d682d0bb05158896d0d9c71", size = 12725266, upload-time = "2025-11-05T14:36:50.727Z" }, + { url = "https://files.pythonhosted.org/packages/ae/49/ad1a5edccc45477493d6a53a41df7620d6147febb897c3dd8354f413e154/pymupdf_layout-1.26.6-cp310-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:0561b9485a6ac1a40bb1e2ec7a1648aa64e4be56dab2f39182b11a69e3e43024", size = 12732580, upload-time = "2025-11-06T11:04:09.065Z" }, + { url = "https://files.pythonhosted.org/packages/a7/bd/3e049b359dd0c3a101ae915484b87ff73bfdedfb24a924e0a8e6783b33f3/pymupdf_layout-1.26.6-cp310-abi3-manylinux_2_28_x86_64.whl", 
hash = "sha256:ee8e2bfed12d4b6421b27a1f89837ac09d8bc3f783f79670db397ec24614bf3d", size = 12732539, upload-time = "2025-11-05T14:38:01.244Z" }, + { url = "https://files.pythonhosted.org/packages/f8/7a/69078bf16669f8361360321ea6bede4cbfede35bf3f4ca5842a7c2387825/pymupdf_layout-1.26.6-cp310-abi3-win_amd64.whl", hash = "sha256:2305aac24fd6e12217afaaea8ec95be297be9b250b6077a3f4e92f7f9beeaf92", size = 12734904, upload-time = "2025-11-05T14:39:05.83Z" }, +] + +[[package]] +name = "pymupdf4llm" +version = "0.2.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pymupdf" }, + { name = "tabulate" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/ec/26/e1226c5329d0c901cd42649e4e8d7544636524c31e95a84f4dcf7c25731d/pymupdf4llm-0.2.2.tar.gz", hash = "sha256:d8dee8451e31ec39daf691687403bf2a98ac7e7b8709400a4e13a582eab835c6", size = 59501, upload-time = "2025-11-17T11:10:20.204Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/26/23/08be1528f3ccb8c245e9a7b247255d6853a8e162b1451f4888f2006c52f0/pymupdf4llm-0.2.2-py3-none-any.whl", hash = "sha256:e7777d083f5f7c7daa804c3423804c309a7e096d682773c01e9dd4bb060f4a56", size = 62063, upload-time = "2025-11-17T11:10:22.452Z" }, +] + [[package]] name = "pyreadline3" version = "3.5.4" @@ -3553,6 +3614,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/a2/09/77d55d46fd61b4a135c444fc97158ef34a095e5681d0a6c10b75bf356191/sympy-1.14.0-py3-none-any.whl", hash = "sha256:e091cc3e99d2141a0ba2847328f5479b05d94a6635cb96148ccb3f34671bd8f5", size = 6299353, upload-time = "2025-04-27T18:04:59.103Z" }, ] +[[package]] +name = "tabulate" +version = "0.9.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ec/fe/802052aecb21e3797b8f7902564ab6ea0d60ff8ca23952079064155d1ae1/tabulate-0.9.0.tar.gz", hash = "sha256:0095b12bf5966de529c0feb1fa08671671b3368eec77d7ef7ab114be2c068b3c", size = 81090, upload-time = "2022-10-06T17:21:48.54Z" 
} +wheels = [ + { url = "https://files.pythonhosted.org/packages/40/44/4a5f08c96eb108af5cb50b41f76142f0afa346dfa99d5296fe7202a11854/tabulate-0.9.0-py3-none-any.whl", hash = "sha256:024ca478df22e9340661486f85298cff5f6dcdba14f3813e8830015b9ed1948f", size = 35252, upload-time = "2022-10-06T17:21:44.262Z" }, +] + [[package]] name = "tenacity" version = "9.1.2" From 327d843f643dfad9416f334a983d04ff67a31929 Mon Sep 17 00:00:00 2001 From: Chris Coutinho Date: Thu, 20 Nov 2025 11:22:20 +0100 Subject: [PATCH 02/19] feat: Implement per-chunk vector visualization with context expansion MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Major improvements to vector visualization page: - Refactor PCA to display individual chunks instead of averaged documents - Add context expansion module for fetching surrounding text from notes and PDFs - Update deduplication to use (doc_id, doc_type, chunk_start, chunk_end) keys - Fix Alpine.js rendering with chunk-specific keys including offsets - Refactor authentication helper to return NextcloudClient for better reuse - Add async context manager support to NextcloudClient Technical details: - viz_routes.py: Fetch specific chunk vectors instead of averaging per document - context.py: New module supporting both notes and PDF text extraction via PyMuPDF - search algorithms: Extract page_number, chunk_index, total_chunks from Qdrant - vector-viz.js/html: Use chunk positions in expansion tracking keys This enables users to see which specific chunks match their query and view them with surrounding context in the PCA visualization. 
🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- .../auth/static/vector-viz.js | 2 +- .../auth/templates/vector_viz.html | 14 +- nextcloud_mcp_server/auth/userinfo_routes.py | 45 +-- nextcloud_mcp_server/auth/viz_routes.py | 251 ++++++++--------- nextcloud_mcp_server/client/__init__.py | 9 + nextcloud_mcp_server/models/semantic.py | 23 ++ nextcloud_mcp_server/search/algorithms.py | 6 + nextcloud_mcp_server/search/bm25_hybrid.py | 21 +- nextcloud_mcp_server/search/context.py | 265 ++++++++++++++++++ nextcloud_mcp_server/search/semantic.py | 21 +- 10 files changed, 485 insertions(+), 172 deletions(-) create mode 100644 nextcloud_mcp_server/search/context.py diff --git a/nextcloud_mcp_server/auth/static/vector-viz.js b/nextcloud_mcp_server/auth/static/vector-viz.js index c01c1e7..f59f46c 100644 --- a/nextcloud_mcp_server/auth/static/vector-viz.js +++ b/nextcloud_mcp_server/auth/static/vector-viz.js @@ -217,7 +217,7 @@ function vizApp() { }, async toggleChunk(result) { - const resultKey = `${result.doc_type}_${result.id}`; + const resultKey = `${result.doc_type}_${result.id}_${result.chunk_start_offset || 0}`; if (this.isChunkExpanded(resultKey)) { delete this.expandedChunks[resultKey]; diff --git a/nextcloud_mcp_server/auth/templates/vector_viz.html b/nextcloud_mcp_server/auth/templates/vector_viz.html index a052d9e..214b582 100644 --- a/nextcloud_mcp_server/auth/templates/vector_viz.html +++ b/nextcloud_mcp_server/auth/templates/vector_viz.html @@ -117,7 +117,7 @@