From d0691d5aa0b16f654bc6595938f69691748494ed Mon Sep 17 00:00:00 2001 From: Chris Coutinho Date: Thu, 20 Nov 2025 12:00:47 +0100 Subject: [PATCH] feat: Switch files to use numeric IDs with file_path resolution MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - scanner.py: Use file_info['id'] as doc_id instead of file_path - scanner.py: Pass file_path in DocumentTask for content retrieval - processor.py: Store file_path in Qdrant payload for later lookup - context.py: Add _get_file_path_from_qdrant() to resolve file_id → file_path - context.py: Update get_chunk_with_context() to handle file ID resolution This makes the system resilient to file renames since file IDs are stable identifiers in Nextcloud, while file paths can change. --- nextcloud_mcp_server/search/context.py | 80 +++++++++++++++++++++++- nextcloud_mcp_server/vector/processor.py | 11 +++- nextcloud_mcp_server/vector/scanner.py | 34 +++++----- 3 files changed, 105 insertions(+), 20 deletions(-) diff --git a/nextcloud_mcp_server/search/context.py b/nextcloud_mcp_server/search/context.py index 9f1b2d7..bc1b8e9 100644 --- a/nextcloud_mcp_server/search/context.py +++ b/nextcloud_mcp_server/search/context.py @@ -12,6 +12,68 @@ from nextcloud_mcp_server.client import NextcloudClient logger = logging.getLogger(__name__) +async def _get_file_path_from_qdrant( + user_id: str, file_id: int, chunk_start: int, chunk_end: int +) -> str | None: + """Resolve file_id to file_path by querying Qdrant payload. + + Args: + user_id: User ID who owns the file + file_id: Numeric file ID + chunk_start: Character offset where chunk starts + chunk_end: Character offset where chunk ends + + Returns: + File path string, or None if not found in Qdrant + """ + try: + from qdrant_client.models import FieldCondition, Filter, MatchValue + + from nextcloud_mcp_server.config import get_settings + from nextcloud_mcp_server.vector.qdrant_client import get_qdrant_client + + qdrant_client = await get_qdrant_client() + settings = get_settings() + + # Query for the specific chunk + scroll_result = await qdrant_client.scroll( + collection_name=settings.get_collection_name(), + scroll_filter=Filter( + must=[ + FieldCondition(key="user_id", match=MatchValue(value=user_id)), + FieldCondition(key="doc_id", match=MatchValue(value=file_id)), + FieldCondition(key="doc_type", match=MatchValue(value="file")), + FieldCondition( + key="chunk_start_offset", match=MatchValue(value=chunk_start) + ), + FieldCondition( + key="chunk_end_offset", match=MatchValue(value=chunk_end) + ), + ] + ), + limit=1, + with_payload=["file_path"], + with_vectors=False, + ) + + if scroll_result[0]: + point = scroll_result[0][0] + file_path = point.payload.get("file_path") + if file_path: + logger.debug(f"Resolved file_id {file_id} to file_path {file_path}") + return str(file_path) + + logger.warning( + f"Could not find file_path in Qdrant for file_id {file_id}, " + f"chunk [{chunk_start}:{chunk_end}]" + ) + return None + + except Exception as e: + logger.error(f"Error querying Qdrant for file_path: {e}", exc_info=True) + return None + + @dataclass class ChunkContext: """Expanded chunk with surrounding context and position markers. @@ -64,7 +126,7 @@ async def get_chunk_with_context( Args: nc_client: Authenticated Nextcloud client user_id: User ID who owns the document - doc_id: Document ID (note ID or file path) + doc_id: Document ID (int for notes/files) doc_type: Type of document ("note", "file", etc.) chunk_start: Character offset where chunk starts chunk_end: Character offset where chunk ends @@ -77,8 +139,22 @@ async def get_chunk_with_context( ChunkContext with expanded context and markers, or None if document cannot be retrieved """ + # For files, retrieve file_path from Qdrant payload + resolved_doc_id = doc_id + if doc_type == "file" and isinstance(doc_id, int): + file_path = await _get_file_path_from_qdrant( + user_id, doc_id, chunk_start, chunk_end + ) + if not file_path: + logger.warning( + f"Could not resolve file_id {doc_id} to file_path from Qdrant" + ) + return None + resolved_doc_id = file_path + logger.debug(f"Resolved file_id {doc_id} to file_path {file_path}") + # Fetch full document text - full_text = await _fetch_document_text(nc_client, doc_id, doc_type) + full_text = await _fetch_document_text(nc_client, resolved_doc_id, doc_type) if full_text is None: logger.warning( f"Could not fetch document text for {doc_type} {doc_id}, " diff --git a/nextcloud_mcp_server/vector/processor.py b/nextcloud_mcp_server/vector/processor.py index a4fe100..63b2190 100644 --- a/nextcloud_mcp_server/vector/processor.py +++ b/nextcloud_mcp_server/vector/processor.py @@ -261,9 +261,14 @@ async def _index_document( title = document["title"] etag = document.get("etag", "") file_metadata = {} # No file-specific metadata for notes + file_path = None # Notes don't have file paths elif doc_task.doc_type == "file": - # For files, doc_id is the file path - file_path = doc_task.doc_id + # For files, doc_id is now the numeric file ID, file_path comes from DocumentTask + if not doc_task.file_path: + raise ValueError( + f"File path required for file indexing but not provided (file_id={doc_task.doc_id})" + ) + file_path = doc_task.file_path # Read file content via WebDAV content_bytes, content_type = await nc_client.webdav.read_file(file_path) @@ -347,7 +352,7 @@ async def _index_document( # File-specific metadata (PDF, etc.) **( { - "file_path": doc_task.doc_id, + "file_path": file_path, # Store file path for retrieval "mime_type": file_metadata.get("content_type", ""), "file_size": file_metadata.get("file_size"), "page_number": chunk.page_number, diff --git a/nextcloud_mcp_server/vector/scanner.py b/nextcloud_mcp_server/vector/scanner.py index 3a1db66..a41277e 100644 --- a/nextcloud_mcp_server/vector/scanner.py +++ b/nextcloud_mcp_server/vector/scanner.py @@ -27,10 +27,11 @@ class DocumentTask: """Document task for processing queue.""" user_id: str - doc_id: str + doc_id: int | str # int for files/notes, str for legacy doc_type: str # "note", "file", "calendar" operation: str # "index" or "delete" modified_at: int + file_path: str | None = None # File path for files (when doc_id is file_id) # Track documents potentially deleted (grace period before actual deletion) @@ -337,7 +338,7 @@ async def scan_user_documents( # Scan for tagged PDF files file_count = 0 file_queued = 0 - nextcloud_file_paths = set() + nextcloud_file_ids = set() try: # Find files with vector-index tag using OCS Tags API @@ -352,8 +353,9 @@ async def scan_user_documents( for file_info in tagged_files: # Files are already filtered by MIME type in find_files_by_tag() file_count += 1 - file_path = file_info["path"] - nextcloud_file_paths.add(file_path) + file_id = file_info["id"] # Use numeric file ID, not path + file_path = file_info["path"] # Keep path for logging + nextcloud_file_ids.add(file_id) # Use last_modified timestamp if available, otherwise use current time modified_at = file_info.get("last_modified_timestamp", int(time.time())) @@ -372,22 +374,23 @@ async def scan_user_documents( await send_stream.send( DocumentTask( user_id=user_id, - doc_id=file_path, + doc_id=file_id, # Use numeric file ID doc_type="file", operation="index", modified_at=modified_at, + file_path=file_path, # Pass file path for content retrieval ) ) file_queued += 1 else: # Incremental sync: compare with indexed state - indexed_at = indexed_files.get(file_path) + indexed_at = indexed_files.get(file_id) # If file reappeared, remove from potentially_deleted - file_key = (user_id, file_path) + file_key = (user_id, file_id) if file_key in _potentially_deleted: logger.debug( - f"File {file_path} reappeared, removing from deletion grace period" + f"File {file_path} (ID: {file_id}) reappeared, removing from deletion grace period" ) del _potentially_deleted[file_key] @@ -396,10 +399,11 @@ async def scan_user_documents( await send_stream.send( DocumentTask( user_id=user_id, - doc_id=file_path, + doc_id=file_id, # Use numeric file ID doc_type="file", operation="index", modified_at=modified_at, + file_path=file_path, # Pass file path for content retrieval ) ) file_queued += 1 @@ -411,9 +415,9 @@ async def scan_user_documents( # Check for deleted files (not initial sync) if not initial_sync: - for file_path in indexed_files: - if file_path not in nextcloud_file_paths: - file_key = (user_id, file_path) + for file_id in indexed_files: + if file_id not in nextcloud_file_ids: + file_key = (user_id, file_id) if file_key in _potentially_deleted: # Check if grace period elapsed @@ -423,13 +427,13 @@ async def scan_user_documents( if time_missing >= grace_period: # Grace period elapsed, send for deletion logger.info( - f"File {file_path} missing for {time_missing:.1f}s " + f"File ID {file_id} missing for {time_missing:.1f}s " f"(>{grace_period:.1f}s grace period), sending deletion" ) await send_stream.send( DocumentTask( user_id=user_id, - doc_id=file_path, + doc_id=file_id, # Use numeric file ID doc_type="file", operation="delete", modified_at=0, @@ -440,7 +444,7 @@ async def scan_user_documents( else: # First time missing, add to grace period tracking logger.debug( - f"File {file_path} missing for first time, starting grace period" + f"File ID {file_id} missing for first time, starting grace period" ) _potentially_deleted[file_key] = current_time