feat: Switch files to use numeric IDs with file_path resolution

- scanner.py: Use file_info['id'] as doc_id instead of file_path
- scanner.py: Pass file_path in DocumentTask for content retrieval
- processor.py: Store file_path in Qdrant payload for later lookup
- context.py: Add _get_file_path_from_qdrant() to resolve file_id → file_path
- context.py: Update get_chunk_with_context() to handle file ID resolution

This makes the system resilient to file renames since file IDs are stable
identifiers in Nextcloud, while file paths can change.
This commit is contained in:
Chris Coutinho
2025-11-20 12:00:47 +01:00
parent f1610bbd2e
commit d0691d5aa0
3 changed files with 105 additions and 20 deletions
+78 -2
View File
@@ -12,6 +12,68 @@ from nextcloud_mcp_server.client import NextcloudClient
logger = logging.getLogger(__name__)
async def _get_file_path_from_qdrant(
user_id: str, file_id: int, chunk_start: int, chunk_end: int
) -> str | None:
"""Resolve file_id to file_path by querying Qdrant payload.
Args:
user_id: User ID who owns the file
file_id: Numeric file ID
chunk_start: Character offset where chunk starts
chunk_end: Character offset where chunk ends
Returns:
File path string, or None if not found in Qdrant
"""
try:
from qdrant_client.models import FieldCondition, Filter, MatchValue
from nextcloud_mcp_server.config import get_settings
from nextcloud_mcp_server.vector.qdrant_client import get_qdrant_client
qdrant_client = await get_qdrant_client()
settings = get_settings()
# Query for the specific chunk
scroll_result = await qdrant_client.scroll(
collection_name=settings.get_collection_name(),
scroll_filter=Filter(
must=[
FieldCondition(key="user_id", match=MatchValue(value=user_id)),
FieldCondition(key="doc_id", match=MatchValue(value=file_id)),
FieldCondition(key="doc_type", match=MatchValue(value="file")),
FieldCondition(
key="chunk_start_offset", match=MatchValue(value=chunk_start)
),
FieldCondition(
key="chunk_end_offset", match=MatchValue(value=chunk_end)
),
]
),
limit=1,
with_payload=["file_path"],
with_vectors=False,
)
if scroll_result[0]:
point = scroll_result[0][0]
file_path = point.payload.get("file_path")
if file_path:
logger.debug(f"Resolved file_id {file_id} to file_path {file_path}")
return str(file_path)
logger.warning(
f"Could not find file_path in Qdrant for file_id {file_id}, "
f"chunk [{chunk_start}:{chunk_end}]"
)
return None
except Exception as e:
logger.error(f"Error querying Qdrant for file_path: {e}", exc_info=True)
return None
@dataclass
class ChunkContext:
"""Expanded chunk with surrounding context and position markers.
@@ -64,7 +126,7 @@ async def get_chunk_with_context(
Args:
nc_client: Authenticated Nextcloud client
user_id: User ID who owns the document
doc_id: Document ID (note ID or file path)
doc_id: Document ID (int for notes/files)
doc_type: Type of document ("note", "file", etc.)
chunk_start: Character offset where chunk starts
chunk_end: Character offset where chunk ends
@@ -77,8 +139,22 @@ async def get_chunk_with_context(
ChunkContext with expanded context and markers, or None if document
cannot be retrieved
"""
# For files, retrieve file_path from Qdrant payload
resolved_doc_id = doc_id
if doc_type == "file" and isinstance(doc_id, int):
file_path = await _get_file_path_from_qdrant(
user_id, doc_id, chunk_start, chunk_end
)
if not file_path:
logger.warning(
f"Could not resolve file_id {doc_id} to file_path from Qdrant"
)
return None
resolved_doc_id = file_path
logger.debug(f"Resolved file_id {doc_id} to file_path {file_path}")
# Fetch full document text
full_text = await _fetch_document_text(nc_client, doc_id, doc_type)
full_text = await _fetch_document_text(nc_client, resolved_doc_id, doc_type)
if full_text is None:
logger.warning(
f"Could not fetch document text for {doc_type} {doc_id}, "
+8 -3
View File
@@ -261,9 +261,14 @@ async def _index_document(
title = document["title"]
etag = document.get("etag", "")
file_metadata = {} # No file-specific metadata for notes
file_path = None # Notes don't have file paths
elif doc_task.doc_type == "file":
# For files, doc_id is the file path
file_path = doc_task.doc_id
# For files, doc_id is now the numeric file ID, file_path comes from DocumentTask
if not doc_task.file_path:
raise ValueError(
f"File path required for file indexing but not provided (file_id={doc_task.doc_id})"
)
file_path = doc_task.file_path
# Read file content via WebDAV
content_bytes, content_type = await nc_client.webdav.read_file(file_path)
@@ -347,7 +352,7 @@ async def _index_document(
# File-specific metadata (PDF, etc.)
**(
{
"file_path": doc_task.doc_id,
"file_path": file_path, # Store file path for retrieval
"mime_type": file_metadata.get("content_type", ""),
"file_size": file_metadata.get("file_size"),
"page_number": chunk.page_number,
+19 -15
View File
@@ -27,10 +27,11 @@ class DocumentTask:
"""Document task for processing queue."""
user_id: str
doc_id: str
doc_id: int | str # int for files/notes, str for legacy
doc_type: str # "note", "file", "calendar"
operation: str # "index" or "delete"
modified_at: int
file_path: str | None = None # File path for files (when doc_id is file_id)
# Track documents potentially deleted (grace period before actual deletion)
@@ -337,7 +338,7 @@ async def scan_user_documents(
# Scan for tagged PDF files
file_count = 0
file_queued = 0
nextcloud_file_paths = set()
nextcloud_file_ids = set()
try:
# Find files with vector-index tag using OCS Tags API
@@ -352,8 +353,9 @@ async def scan_user_documents(
for file_info in tagged_files:
# Files are already filtered by MIME type in find_files_by_tag()
file_count += 1
file_path = file_info["path"]
nextcloud_file_paths.add(file_path)
file_id = file_info["id"] # Use numeric file ID, not path
file_path = file_info["path"] # Keep path for logging
nextcloud_file_ids.add(file_id)
# Use last_modified timestamp if available, otherwise use current time
modified_at = file_info.get("last_modified_timestamp", int(time.time()))
@@ -372,22 +374,23 @@ async def scan_user_documents(
await send_stream.send(
DocumentTask(
user_id=user_id,
doc_id=file_path,
doc_id=file_id, # Use numeric file ID
doc_type="file",
operation="index",
modified_at=modified_at,
file_path=file_path, # Pass file path for content retrieval
)
)
file_queued += 1
else:
# Incremental sync: compare with indexed state
indexed_at = indexed_files.get(file_path)
indexed_at = indexed_files.get(file_id)
# If file reappeared, remove from potentially_deleted
file_key = (user_id, file_path)
file_key = (user_id, file_id)
if file_key in _potentially_deleted:
logger.debug(
f"File {file_path} reappeared, removing from deletion grace period"
f"File {file_path} (ID: {file_id}) reappeared, removing from deletion grace period"
)
del _potentially_deleted[file_key]
@@ -396,10 +399,11 @@ async def scan_user_documents(
await send_stream.send(
DocumentTask(
user_id=user_id,
doc_id=file_path,
doc_id=file_id, # Use numeric file ID
doc_type="file",
operation="index",
modified_at=modified_at,
file_path=file_path, # Pass file path for content retrieval
)
)
file_queued += 1
@@ -411,9 +415,9 @@ async def scan_user_documents(
# Check for deleted files (not initial sync)
if not initial_sync:
for file_path in indexed_files:
if file_path not in nextcloud_file_paths:
file_key = (user_id, file_path)
for file_id in indexed_files:
if file_id not in nextcloud_file_ids:
file_key = (user_id, file_id)
if file_key in _potentially_deleted:
# Check if grace period elapsed
@@ -423,13 +427,13 @@ async def scan_user_documents(
if time_missing >= grace_period:
# Grace period elapsed, send for deletion
logger.info(
f"File {file_path} missing for {time_missing:.1f}s "
f"File ID {file_id} missing for {time_missing:.1f}s "
f"(>{grace_period:.1f}s grace period), sending deletion"
)
await send_stream.send(
DocumentTask(
user_id=user_id,
doc_id=file_path,
doc_id=file_id, # Use numeric file ID
doc_type="file",
operation="delete",
modified_at=0,
@@ -440,7 +444,7 @@ async def scan_user_documents(
else:
# First time missing, add to grace period tracking
logger.debug(
f"File {file_path} missing for first time, starting grace period"
f"File ID {file_id} missing for first time, starting grace period"
)
_potentially_deleted[file_key] = current_time