feat: Switch files to use numeric IDs with file_path resolution

- scanner.py: Use file_info['id'] as doc_id instead of file_path - scanner.py: Pass file_path in DocumentTask for content retrieval - processor.py: Store file_path in Qdrant payload for later lookup - context.py: Add _get_file_path_from_qdrant() to resolve file_id → file_path - context.py: Update get_chunk_with_context() to handle file ID resolution This makes the system resilient to file renames since file IDs are stable identifiers in Nextcloud, while file paths can change.
2025-11-20 12:00:47 +01:00
parent f1610bbd2e
commit d0691d5aa0
3 changed files with 105 additions and 20 deletions
@@ -12,6 +12,68 @@ from nextcloud_mcp_server.client import NextcloudClient
 logger = logging.getLogger(__name__)


+async def _get_file_path_from_qdrant(
+    user_id: str, file_id: int, chunk_start: int, chunk_end: int
+) -> str | None:
+    """Resolve file_id to file_path by querying Qdrant payload.
+
+    Args:
+        user_id: User ID who owns the file
+        file_id: Numeric file ID
+        chunk_start: Character offset where chunk starts
+        chunk_end: Character offset where chunk ends
+
+    Returns:
+        File path string, or None if not found in Qdrant
+    """
+    try:
+        from qdrant_client.models import FieldCondition, Filter, MatchValue
+
+        from nextcloud_mcp_server.config import get_settings
+        from nextcloud_mcp_server.vector.qdrant_client import get_qdrant_client
+
+        qdrant_client = await get_qdrant_client()
+        settings = get_settings()
+
+        # Query for the specific chunk
+        scroll_result = await qdrant_client.scroll(
+            collection_name=settings.get_collection_name(),
+            scroll_filter=Filter(
+                must=[
+                    FieldCondition(key="user_id", match=MatchValue(value=user_id)),
+                    FieldCondition(key="doc_id", match=MatchValue(value=file_id)),
+                    FieldCondition(key="doc_type", match=MatchValue(value="file")),
+                    FieldCondition(
+                        key="chunk_start_offset", match=MatchValue(value=chunk_start)
+                    ),
+                    FieldCondition(
+                        key="chunk_end_offset", match=MatchValue(value=chunk_end)
+                    ),
+                ]
+            ),
+            limit=1,
+            with_payload=["file_path"],
+            with_vectors=False,
+        )
+
+        if scroll_result[0]:
+            point = scroll_result[0][0]
+            file_path = point.payload.get("file_path")
+            if file_path:
+                logger.debug(f"Resolved file_id {file_id} to file_path {file_path}")
+                return str(file_path)
+
+        logger.warning(
+            f"Could not find file_path in Qdrant for file_id {file_id}, "
+            f"chunk [{chunk_start}:{chunk_end}]"
+        )
+        return None
+
+    except Exception as e:
+        logger.error(f"Error querying Qdrant for file_path: {e}", exc_info=True)
+        return None
+
+
@dataclass
 class ChunkContext:
    """Expanded chunk with surrounding context and position markers.
@@ -64,7 +126,7 @@ async def get_chunk_with_context(
    Args:
        nc_client: Authenticated Nextcloud client
        user_id: User ID who owns the document
-        doc_id: Document ID (note ID or file path)
+        doc_id: Document ID (int for notes/files)
        doc_type: Type of document ("note", "file", etc.)
        chunk_start: Character offset where chunk starts
        chunk_end: Character offset where chunk ends
@@ -77,8 +139,22 @@ async def get_chunk_with_context(
        ChunkContext with expanded context and markers, or None if document
        cannot be retrieved
    """
+    # For files, retrieve file_path from Qdrant payload
+    resolved_doc_id = doc_id
+    if doc_type == "file" and isinstance(doc_id, int):
+        file_path = await _get_file_path_from_qdrant(
+            user_id, doc_id, chunk_start, chunk_end
+        )
+        if not file_path:
+            logger.warning(
+                f"Could not resolve file_id {doc_id} to file_path from Qdrant"
+            )
+            return None
+        resolved_doc_id = file_path
+        logger.debug(f"Resolved file_id {doc_id} to file_path {file_path}")
+
    # Fetch full document text
-    full_text = await _fetch_document_text(nc_client, doc_id, doc_type)
+    full_text = await _fetch_document_text(nc_client, resolved_doc_id, doc_type)
    if full_text is None:
        logger.warning(
            f"Could not fetch document text for {doc_type} {doc_id}, "
@@ -261,9 +261,14 @@ async def _index_document(
        title = document["title"]
        etag = document.get("etag", "")
        file_metadata = {}  # No file-specific metadata for notes
+        file_path = None  # Notes don't have file paths
    elif doc_task.doc_type == "file":
-        # For files, doc_id is the file path
-        file_path = doc_task.doc_id
+        # For files, doc_id is now the numeric file ID, file_path comes from DocumentTask
+        if not doc_task.file_path:
+            raise ValueError(
+                f"File path required for file indexing but not provided (file_id={doc_task.doc_id})"
+            )
+        file_path = doc_task.file_path

        # Read file content via WebDAV
        content_bytes, content_type = await nc_client.webdav.read_file(file_path)
@@ -347,7 +352,7 @@ async def _index_document(
                    # File-specific metadata (PDF, etc.)
                    **(
                        {
-                            "file_path": doc_task.doc_id,
+                            "file_path": file_path,  # Store file path for retrieval
                            "mime_type": file_metadata.get("content_type", ""),
                            "file_size": file_metadata.get("file_size"),
                            "page_number": chunk.page_number,
@@ -27,10 +27,11 @@ class DocumentTask:
    """Document task for processing queue."""

    user_id: str
-    doc_id: str
+    doc_id: int | str  # int for files/notes, str for legacy
    doc_type: str  # "note", "file", "calendar"
    operation: str  # "index" or "delete"
    modified_at: int
+    file_path: str | None = None  # File path for files (when doc_id is file_id)


 # Track documents potentially deleted (grace period before actual deletion)
@@ -337,7 +338,7 @@ async def scan_user_documents(
        # Scan for tagged PDF files
        file_count = 0
        file_queued = 0
-        nextcloud_file_paths = set()
+        nextcloud_file_ids = set()

        try:
            # Find files with vector-index tag using OCS Tags API
@@ -352,8 +353,9 @@ async def scan_user_documents(
            for file_info in tagged_files:
                # Files are already filtered by MIME type in find_files_by_tag()
                file_count += 1
-                file_path = file_info["path"]
-                nextcloud_file_paths.add(file_path)
+                file_id = file_info["id"]  # Use numeric file ID, not path
+                file_path = file_info["path"]  # Keep path for logging
+                nextcloud_file_ids.add(file_id)

                # Use last_modified timestamp if available, otherwise use current time
                modified_at = file_info.get("last_modified_timestamp", int(time.time()))
@@ -372,22 +374,23 @@ async def scan_user_documents(
                    await send_stream.send(
                        DocumentTask(
                            user_id=user_id,
-                            doc_id=file_path,
+                            doc_id=file_id,  # Use numeric file ID
                            doc_type="file",
                            operation="index",
                            modified_at=modified_at,
+                            file_path=file_path,  # Pass file path for content retrieval
                        )
                    )
                    file_queued += 1
                else:
                    # Incremental sync: compare with indexed state
-                    indexed_at = indexed_files.get(file_path)
+                    indexed_at = indexed_files.get(file_id)

                    # If file reappeared, remove from potentially_deleted
-                    file_key = (user_id, file_path)
+                    file_key = (user_id, file_id)
                    if file_key in _potentially_deleted:
                        logger.debug(
-                            f"File {file_path} reappeared, removing from deletion grace period"
+                            f"File {file_path} (ID: {file_id}) reappeared, removing from deletion grace period"
                        )
                        del _potentially_deleted[file_key]

@@ -396,10 +399,11 @@ async def scan_user_documents(
                        await send_stream.send(
                            DocumentTask(
                                user_id=user_id,
-                                doc_id=file_path,
+                                doc_id=file_id,  # Use numeric file ID
                                doc_type="file",
                                operation="index",
                                modified_at=modified_at,
+                                file_path=file_path,  # Pass file path for content retrieval
                            )
                        )
                        file_queued += 1
@@ -411,9 +415,9 @@ async def scan_user_documents(

            # Check for deleted files (not initial sync)
            if not initial_sync:
-                for file_path in indexed_files:
-                    if file_path not in nextcloud_file_paths:
-                        file_key = (user_id, file_path)
+                for file_id in indexed_files:
+                    if file_id not in nextcloud_file_ids:
+                        file_key = (user_id, file_id)

                        if file_key in _potentially_deleted:
                            # Check if grace period elapsed
@@ -423,13 +427,13 @@ async def scan_user_documents(
                            if time_missing >= grace_period:
                                # Grace period elapsed, send for deletion
                                logger.info(
-                                    f"File {file_path} missing for {time_missing:.1f}s "
+                                    f"File ID {file_id} missing for {time_missing:.1f}s "
                                    f"(>{grace_period:.1f}s grace period), sending deletion"
                                )
                                await send_stream.send(
                                    DocumentTask(
                                        user_id=user_id,
-                                        doc_id=file_path,
+                                        doc_id=file_id,  # Use numeric file ID
                                        doc_type="file",
                                        operation="delete",
                                        modified_at=0,
@@ -440,7 +444,7 @@ async def scan_user_documents(
                        else:
                            # First time missing, add to grace period tracking
                            logger.debug(
-                                f"File {file_path} missing for first time, starting grace period"
+                                f"File ID {file_id} missing for first time, starting grace period"
                            )
                            _potentially_deleted[file_key] = current_time