From d0691d5aa0b16f654bc6595938f69691748494ed Mon Sep 17 00:00:00 2001
From: Chris Coutinho <chris@coutinho.io>
Date: Thu, 20 Nov 2025 12:00:47 +0100
Subject: [PATCH] feat: Switch files to use numeric IDs with file_path
 resolution
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- scanner.py: Use file_info['id'] as doc_id instead of file_path
- scanner.py: Pass file_path in DocumentTask for content retrieval
- processor.py: Store file_path in Qdrant payload for later lookup
- context.py: Add _get_file_path_from_qdrant() to resolve file_id → file_path
- context.py: Update get_chunk_with_context() to handle file ID resolution

This makes the system resilient to file renames since file IDs are stable
identifiers in Nextcloud, while file paths can change.
---
 nextcloud_mcp_server/search/context.py   | 80 +++++++++++++++++++++++-
 nextcloud_mcp_server/vector/processor.py | 11 +++-
 nextcloud_mcp_server/vector/scanner.py   | 34 +++++-----
 3 files changed, 105 insertions(+), 20 deletions(-)

diff --git a/nextcloud_mcp_server/search/context.py b/nextcloud_mcp_server/search/context.py
index 9f1b2d7..bc1b8e9 100644
--- a/nextcloud_mcp_server/search/context.py
+++ b/nextcloud_mcp_server/search/context.py
@@ -12,6 +12,68 @@ from nextcloud_mcp_server.client import NextcloudClient
 logger = logging.getLogger(__name__)
 
 
+async def _get_file_path_from_qdrant(
+    user_id: str, file_id: int, chunk_start: int, chunk_end: int
+) -> str | None:
+    """Resolve file_id to file_path by querying Qdrant payload.
+
+    Args:
+        user_id: User ID who owns the file
+        file_id: Numeric file ID
+        chunk_start: Character offset where chunk starts
+        chunk_end: Character offset where chunk ends
+
+    Returns:
+        File path string, or None if not found in Qdrant
+    """
+    try:
+        from qdrant_client.models import FieldCondition, Filter, MatchValue
+
+        from nextcloud_mcp_server.config import get_settings
+        from nextcloud_mcp_server.vector.qdrant_client import get_qdrant_client
+
+        qdrant_client = await get_qdrant_client()
+        settings = get_settings()
+
+        # Query for the specific chunk
+        scroll_result = await qdrant_client.scroll(
+            collection_name=settings.get_collection_name(),
+            scroll_filter=Filter(
+                must=[
+                    FieldCondition(key="user_id", match=MatchValue(value=user_id)),
+                    FieldCondition(key="doc_id", match=MatchValue(value=file_id)),
+                    FieldCondition(key="doc_type", match=MatchValue(value="file")),
+                    FieldCondition(
+                        key="chunk_start_offset", match=MatchValue(value=chunk_start)
+                    ),
+                    FieldCondition(
+                        key="chunk_end_offset", match=MatchValue(value=chunk_end)
+                    ),
+                ]
+            ),
+            limit=1,
+            with_payload=["file_path"],
+            with_vectors=False,
+        )
+
+        if scroll_result[0]:
+            point = scroll_result[0][0]
+            file_path = point.payload.get("file_path")
+            if file_path:
+                logger.debug(f"Resolved file_id {file_id} to file_path {file_path}")
+                return str(file_path)
+
+        logger.warning(
+            f"Could not find file_path in Qdrant for file_id {file_id}, "
+            f"chunk [{chunk_start}:{chunk_end}]"
+        )
+        return None
+
+    except Exception as e:
+        logger.error(f"Error querying Qdrant for file_path: {e}", exc_info=True)
+        return None
+
+
 @dataclass
 class ChunkContext:
     """Expanded chunk with surrounding context and position markers.
@@ -64,7 +126,7 @@ async def get_chunk_with_context(
     Args:
         nc_client: Authenticated Nextcloud client
         user_id: User ID who owns the document
-        doc_id: Document ID (note ID or file path)
+        doc_id: Document ID (int for notes/files)
         doc_type: Type of document ("note", "file", etc.)
         chunk_start: Character offset where chunk starts
         chunk_end: Character offset where chunk ends
@@ -77,8 +139,22 @@ async def get_chunk_with_context(
         ChunkContext with expanded context and markers, or None if document
         cannot be retrieved
     """
+    # For files, retrieve file_path from Qdrant payload
+    resolved_doc_id = doc_id
+    if doc_type == "file" and isinstance(doc_id, int):
+        file_path = await _get_file_path_from_qdrant(
+            user_id, doc_id, chunk_start, chunk_end
+        )
+        if not file_path:
+            logger.warning(
+                f"Could not resolve file_id {doc_id} to file_path from Qdrant"
+            )
+            return None
+        resolved_doc_id = file_path
+        logger.debug(f"Resolved file_id {doc_id} to file_path {file_path}")
+
     # Fetch full document text
-    full_text = await _fetch_document_text(nc_client, doc_id, doc_type)
+    full_text = await _fetch_document_text(nc_client, resolved_doc_id, doc_type)
     if full_text is None:
         logger.warning(
             f"Could not fetch document text for {doc_type} {doc_id}, "
diff --git a/nextcloud_mcp_server/vector/processor.py b/nextcloud_mcp_server/vector/processor.py
index a4fe100..63b2190 100644
--- a/nextcloud_mcp_server/vector/processor.py
+++ b/nextcloud_mcp_server/vector/processor.py
@@ -261,9 +261,14 @@ async def _index_document(
         title = document["title"]
         etag = document.get("etag", "")
         file_metadata = {}  # No file-specific metadata for notes
+        file_path = None  # Notes don't have file paths
     elif doc_task.doc_type == "file":
-        # For files, doc_id is the file path
-        file_path = doc_task.doc_id
+        # For files, doc_id is now the numeric file ID, file_path comes from DocumentTask
+        if not doc_task.file_path:
+            raise ValueError(
+                f"File path required for file indexing but not provided (file_id={doc_task.doc_id})"
+            )
+        file_path = doc_task.file_path
 
         # Read file content via WebDAV
         content_bytes, content_type = await nc_client.webdav.read_file(file_path)
@@ -347,7 +352,7 @@ async def _index_document(
                     # File-specific metadata (PDF, etc.)
                     **(
                         {
-                            "file_path": doc_task.doc_id,
+                            "file_path": file_path,  # Store file path for retrieval
                             "mime_type": file_metadata.get("content_type", ""),
                             "file_size": file_metadata.get("file_size"),
                             "page_number": chunk.page_number,
diff --git a/nextcloud_mcp_server/vector/scanner.py b/nextcloud_mcp_server/vector/scanner.py
index 3a1db66..a41277e 100644
--- a/nextcloud_mcp_server/vector/scanner.py
+++ b/nextcloud_mcp_server/vector/scanner.py
@@ -27,10 +27,11 @@ class DocumentTask:
     """Document task for processing queue."""
 
     user_id: str
-    doc_id: str
+    doc_id: int | str  # int for files/notes, str for legacy
     doc_type: str  # "note", "file", "calendar"
     operation: str  # "index" or "delete"
     modified_at: int
+    file_path: str | None = None  # File path for files (when doc_id is file_id)
 
 
 # Track documents potentially deleted (grace period before actual deletion)
@@ -337,7 +338,7 @@ async def scan_user_documents(
         # Scan for tagged PDF files
         file_count = 0
         file_queued = 0
-        nextcloud_file_paths = set()
+        nextcloud_file_ids = set()
 
         try:
             # Find files with vector-index tag using OCS Tags API
@@ -352,8 +353,9 @@ async def scan_user_documents(
             for file_info in tagged_files:
                 # Files are already filtered by MIME type in find_files_by_tag()
                 file_count += 1
-                file_path = file_info["path"]
-                nextcloud_file_paths.add(file_path)
+                file_id = file_info["id"]  # Use numeric file ID, not path
+                file_path = file_info["path"]  # Keep path for logging
+                nextcloud_file_ids.add(file_id)
 
                 # Use last_modified timestamp if available, otherwise use current time
                 modified_at = file_info.get("last_modified_timestamp", int(time.time()))
@@ -372,22 +374,23 @@ async def scan_user_documents(
                     await send_stream.send(
                         DocumentTask(
                             user_id=user_id,
-                            doc_id=file_path,
+                            doc_id=file_id,  # Use numeric file ID
                             doc_type="file",
                             operation="index",
                             modified_at=modified_at,
+                            file_path=file_path,  # Pass file path for content retrieval
                         )
                     )
                     file_queued += 1
                 else:
                     # Incremental sync: compare with indexed state
-                    indexed_at = indexed_files.get(file_path)
+                    indexed_at = indexed_files.get(file_id)
 
                     # If file reappeared, remove from potentially_deleted
-                    file_key = (user_id, file_path)
+                    file_key = (user_id, file_id)
                     if file_key in _potentially_deleted:
                         logger.debug(
-                            f"File {file_path} reappeared, removing from deletion grace period"
+                            f"File {file_path} (ID: {file_id}) reappeared, removing from deletion grace period"
                         )
                         del _potentially_deleted[file_key]
 
@@ -396,10 +399,11 @@ async def scan_user_documents(
                         await send_stream.send(
                             DocumentTask(
                                 user_id=user_id,
-                                doc_id=file_path,
+                                doc_id=file_id,  # Use numeric file ID
                                 doc_type="file",
                                 operation="index",
                                 modified_at=modified_at,
+                                file_path=file_path,  # Pass file path for content retrieval
                             )
                         )
                         file_queued += 1
@@ -411,9 +415,9 @@ async def scan_user_documents(
 
             # Check for deleted files (not initial sync)
             if not initial_sync:
-                for file_path in indexed_files:
-                    if file_path not in nextcloud_file_paths:
-                        file_key = (user_id, file_path)
+                for file_id in indexed_files:
+                    if file_id not in nextcloud_file_ids:
+                        file_key = (user_id, file_id)
 
                         if file_key in _potentially_deleted:
                             # Check if grace period elapsed
@@ -423,13 +427,13 @@ async def scan_user_documents(
                             if time_missing >= grace_period:
                                 # Grace period elapsed, send for deletion
                                 logger.info(
-                                    f"File {file_path} missing for {time_missing:.1f}s "
+                                    f"File ID {file_id} missing for {time_missing:.1f}s "
                                     f"(>{grace_period:.1f}s grace period), sending deletion"
                                 )
                                 await send_stream.send(
                                     DocumentTask(
                                         user_id=user_id,
-                                        doc_id=file_path,
+                                        doc_id=file_id,  # Use numeric file ID
                                         doc_type="file",
                                         operation="delete",
                                         modified_at=0,
@@ -440,7 +444,7 @@ async def scan_user_documents(
                         else:
                             # First time missing, add to grace period tracking
                             logger.debug(
-                                f"File {file_path} missing for first time, starting grace period"
+                                f"File ID {file_id} missing for first time, starting grace period"
                             )
                             _potentially_deleted[file_key] = current_time