feat: Switch files to use numeric IDs with file_path resolution
- scanner.py: Use file_info['id'] as doc_id instead of file_path - scanner.py: Pass file_path in DocumentTask for content retrieval - processor.py: Store file_path in Qdrant payload for later lookup - context.py: Add _get_file_path_from_qdrant() to resolve file_id → file_path - context.py: Update get_chunk_with_context() to handle file ID resolution This makes the system resilient to file renames since file IDs are stable identifiers in Nextcloud, while file paths can change.
This commit is contained in:
@@ -12,6 +12,68 @@ from nextcloud_mcp_server.client import NextcloudClient
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
async def _get_file_path_from_qdrant(
|
||||
user_id: str, file_id: int, chunk_start: int, chunk_end: int
|
||||
) -> str | None:
|
||||
"""Resolve file_id to file_path by querying Qdrant payload.
|
||||
|
||||
Args:
|
||||
user_id: User ID who owns the file
|
||||
file_id: Numeric file ID
|
||||
chunk_start: Character offset where chunk starts
|
||||
chunk_end: Character offset where chunk ends
|
||||
|
||||
Returns:
|
||||
File path string, or None if not found in Qdrant
|
||||
"""
|
||||
try:
|
||||
from qdrant_client.models import FieldCondition, Filter, MatchValue
|
||||
|
||||
from nextcloud_mcp_server.config import get_settings
|
||||
from nextcloud_mcp_server.vector.qdrant_client import get_qdrant_client
|
||||
|
||||
qdrant_client = await get_qdrant_client()
|
||||
settings = get_settings()
|
||||
|
||||
# Query for the specific chunk
|
||||
scroll_result = await qdrant_client.scroll(
|
||||
collection_name=settings.get_collection_name(),
|
||||
scroll_filter=Filter(
|
||||
must=[
|
||||
FieldCondition(key="user_id", match=MatchValue(value=user_id)),
|
||||
FieldCondition(key="doc_id", match=MatchValue(value=file_id)),
|
||||
FieldCondition(key="doc_type", match=MatchValue(value="file")),
|
||||
FieldCondition(
|
||||
key="chunk_start_offset", match=MatchValue(value=chunk_start)
|
||||
),
|
||||
FieldCondition(
|
||||
key="chunk_end_offset", match=MatchValue(value=chunk_end)
|
||||
),
|
||||
]
|
||||
),
|
||||
limit=1,
|
||||
with_payload=["file_path"],
|
||||
with_vectors=False,
|
||||
)
|
||||
|
||||
if scroll_result[0]:
|
||||
point = scroll_result[0][0]
|
||||
file_path = point.payload.get("file_path")
|
||||
if file_path:
|
||||
logger.debug(f"Resolved file_id {file_id} to file_path {file_path}")
|
||||
return str(file_path)
|
||||
|
||||
logger.warning(
|
||||
f"Could not find file_path in Qdrant for file_id {file_id}, "
|
||||
f"chunk [{chunk_start}:{chunk_end}]"
|
||||
)
|
||||
return None
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error querying Qdrant for file_path: {e}", exc_info=True)
|
||||
return None
|
||||
|
||||
|
||||
@dataclass
|
||||
class ChunkContext:
|
||||
"""Expanded chunk with surrounding context and position markers.
|
||||
@@ -64,7 +126,7 @@ async def get_chunk_with_context(
|
||||
Args:
|
||||
nc_client: Authenticated Nextcloud client
|
||||
user_id: User ID who owns the document
|
||||
doc_id: Document ID (note ID or file path)
|
||||
doc_id: Document ID (int for notes/files)
|
||||
doc_type: Type of document ("note", "file", etc.)
|
||||
chunk_start: Character offset where chunk starts
|
||||
chunk_end: Character offset where chunk ends
|
||||
@@ -77,8 +139,22 @@ async def get_chunk_with_context(
|
||||
ChunkContext with expanded context and markers, or None if document
|
||||
cannot be retrieved
|
||||
"""
|
||||
# For files, retrieve file_path from Qdrant payload
|
||||
resolved_doc_id = doc_id
|
||||
if doc_type == "file" and isinstance(doc_id, int):
|
||||
file_path = await _get_file_path_from_qdrant(
|
||||
user_id, doc_id, chunk_start, chunk_end
|
||||
)
|
||||
if not file_path:
|
||||
logger.warning(
|
||||
f"Could not resolve file_id {doc_id} to file_path from Qdrant"
|
||||
)
|
||||
return None
|
||||
resolved_doc_id = file_path
|
||||
logger.debug(f"Resolved file_id {doc_id} to file_path {file_path}")
|
||||
|
||||
# Fetch full document text
|
||||
full_text = await _fetch_document_text(nc_client, doc_id, doc_type)
|
||||
full_text = await _fetch_document_text(nc_client, resolved_doc_id, doc_type)
|
||||
if full_text is None:
|
||||
logger.warning(
|
||||
f"Could not fetch document text for {doc_type} {doc_id}, "
|
||||
|
||||
@@ -261,9 +261,14 @@ async def _index_document(
|
||||
title = document["title"]
|
||||
etag = document.get("etag", "")
|
||||
file_metadata = {} # No file-specific metadata for notes
|
||||
file_path = None # Notes don't have file paths
|
||||
elif doc_task.doc_type == "file":
|
||||
# For files, doc_id is the file path
|
||||
file_path = doc_task.doc_id
|
||||
# For files, doc_id is now the numeric file ID, file_path comes from DocumentTask
|
||||
if not doc_task.file_path:
|
||||
raise ValueError(
|
||||
f"File path required for file indexing but not provided (file_id={doc_task.doc_id})"
|
||||
)
|
||||
file_path = doc_task.file_path
|
||||
|
||||
# Read file content via WebDAV
|
||||
content_bytes, content_type = await nc_client.webdav.read_file(file_path)
|
||||
@@ -347,7 +352,7 @@ async def _index_document(
|
||||
# File-specific metadata (PDF, etc.)
|
||||
**(
|
||||
{
|
||||
"file_path": doc_task.doc_id,
|
||||
"file_path": file_path, # Store file path for retrieval
|
||||
"mime_type": file_metadata.get("content_type", ""),
|
||||
"file_size": file_metadata.get("file_size"),
|
||||
"page_number": chunk.page_number,
|
||||
|
||||
@@ -27,10 +27,11 @@ class DocumentTask:
|
||||
"""Document task for processing queue."""
|
||||
|
||||
user_id: str
|
||||
doc_id: str
|
||||
doc_id: int | str # int for files/notes, str for legacy
|
||||
doc_type: str # "note", "file", "calendar"
|
||||
operation: str # "index" or "delete"
|
||||
modified_at: int
|
||||
file_path: str | None = None # File path for files (when doc_id is file_id)
|
||||
|
||||
|
||||
# Track documents potentially deleted (grace period before actual deletion)
|
||||
@@ -337,7 +338,7 @@ async def scan_user_documents(
|
||||
# Scan for tagged PDF files
|
||||
file_count = 0
|
||||
file_queued = 0
|
||||
nextcloud_file_paths = set()
|
||||
nextcloud_file_ids = set()
|
||||
|
||||
try:
|
||||
# Find files with vector-index tag using OCS Tags API
|
||||
@@ -352,8 +353,9 @@ async def scan_user_documents(
|
||||
for file_info in tagged_files:
|
||||
# Files are already filtered by MIME type in find_files_by_tag()
|
||||
file_count += 1
|
||||
file_path = file_info["path"]
|
||||
nextcloud_file_paths.add(file_path)
|
||||
file_id = file_info["id"] # Use numeric file ID, not path
|
||||
file_path = file_info["path"] # Keep path for logging
|
||||
nextcloud_file_ids.add(file_id)
|
||||
|
||||
# Use last_modified timestamp if available, otherwise use current time
|
||||
modified_at = file_info.get("last_modified_timestamp", int(time.time()))
|
||||
@@ -372,22 +374,23 @@ async def scan_user_documents(
|
||||
await send_stream.send(
|
||||
DocumentTask(
|
||||
user_id=user_id,
|
||||
doc_id=file_path,
|
||||
doc_id=file_id, # Use numeric file ID
|
||||
doc_type="file",
|
||||
operation="index",
|
||||
modified_at=modified_at,
|
||||
file_path=file_path, # Pass file path for content retrieval
|
||||
)
|
||||
)
|
||||
file_queued += 1
|
||||
else:
|
||||
# Incremental sync: compare with indexed state
|
||||
indexed_at = indexed_files.get(file_path)
|
||||
indexed_at = indexed_files.get(file_id)
|
||||
|
||||
# If file reappeared, remove from potentially_deleted
|
||||
file_key = (user_id, file_path)
|
||||
file_key = (user_id, file_id)
|
||||
if file_key in _potentially_deleted:
|
||||
logger.debug(
|
||||
f"File {file_path} reappeared, removing from deletion grace period"
|
||||
f"File {file_path} (ID: {file_id}) reappeared, removing from deletion grace period"
|
||||
)
|
||||
del _potentially_deleted[file_key]
|
||||
|
||||
@@ -396,10 +399,11 @@ async def scan_user_documents(
|
||||
await send_stream.send(
|
||||
DocumentTask(
|
||||
user_id=user_id,
|
||||
doc_id=file_path,
|
||||
doc_id=file_id, # Use numeric file ID
|
||||
doc_type="file",
|
||||
operation="index",
|
||||
modified_at=modified_at,
|
||||
file_path=file_path, # Pass file path for content retrieval
|
||||
)
|
||||
)
|
||||
file_queued += 1
|
||||
@@ -411,9 +415,9 @@ async def scan_user_documents(
|
||||
|
||||
# Check for deleted files (not initial sync)
|
||||
if not initial_sync:
|
||||
for file_path in indexed_files:
|
||||
if file_path not in nextcloud_file_paths:
|
||||
file_key = (user_id, file_path)
|
||||
for file_id in indexed_files:
|
||||
if file_id not in nextcloud_file_ids:
|
||||
file_key = (user_id, file_id)
|
||||
|
||||
if file_key in _potentially_deleted:
|
||||
# Check if grace period elapsed
|
||||
@@ -423,13 +427,13 @@ async def scan_user_documents(
|
||||
if time_missing >= grace_period:
|
||||
# Grace period elapsed, send for deletion
|
||||
logger.info(
|
||||
f"File {file_path} missing for {time_missing:.1f}s "
|
||||
f"File ID {file_id} missing for {time_missing:.1f}s "
|
||||
f"(>{grace_period:.1f}s grace period), sending deletion"
|
||||
)
|
||||
await send_stream.send(
|
||||
DocumentTask(
|
||||
user_id=user_id,
|
||||
doc_id=file_path,
|
||||
doc_id=file_id, # Use numeric file ID
|
||||
doc_type="file",
|
||||
operation="delete",
|
||||
modified_at=0,
|
||||
@@ -440,7 +444,7 @@ async def scan_user_documents(
|
||||
else:
|
||||
# First time missing, add to grace period tracking
|
||||
logger.debug(
|
||||
f"File {file_path} missing for first time, starting grace period"
|
||||
f"File ID {file_id} missing for first time, starting grace period"
|
||||
)
|
||||
_potentially_deleted[file_key] = current_time
|
||||
|
||||
|
||||
Reference in New Issue
Block a user