feat: Switch files to use numeric IDs with file_path resolution

- scanner.py: Use file_info['id'] as doc_id instead of file_path
- scanner.py: Pass file_path in DocumentTask for content retrieval
- processor.py: Store file_path in Qdrant payload for later lookup
- context.py: Add _get_file_path_from_qdrant() to resolve file_id → file_path
- context.py: Update get_chunk_with_context() to handle file ID resolution

This makes the system resilient to file renames since file IDs are stable
identifiers in Nextcloud, while file paths can change.
This commit is contained in:
Chris Coutinho
2025-11-20 12:00:47 +01:00
parent f1610bbd2e
commit d0691d5aa0
3 changed files with 105 additions and 20 deletions
+8 -3
View File
@@ -261,9 +261,14 @@ async def _index_document(
title = document["title"]
etag = document.get("etag", "")
file_metadata = {} # No file-specific metadata for notes
file_path = None # Notes don't have file paths
elif doc_task.doc_type == "file":
# For files, doc_id is the file path
file_path = doc_task.doc_id
# For files, doc_id is now the numeric file ID; file_path comes from DocumentTask
if not doc_task.file_path:
raise ValueError(
f"File path required for file indexing but not provided (file_id={doc_task.doc_id})"
)
file_path = doc_task.file_path
# Read file content via WebDAV
content_bytes, content_type = await nc_client.webdav.read_file(file_path)
@@ -347,7 +352,7 @@ async def _index_document(
# File-specific metadata (PDF, etc.)
**(
{
"file_path": doc_task.doc_id,
"file_path": file_path, # Store file path for retrieval
"mime_type": file_metadata.get("content_type", ""),
"file_size": file_metadata.get("file_size"),
"page_number": chunk.page_number,