Files
nextcloud-mcp-server/nextcloud_mcp_server/vector/scanner.py
T
Chris Coutinho 4dbb2eb468 fix: integrate vector sync tasks with Starlette lifespan for streamable-http
Fixes background task startup for streamable-http transport by integrating
vector sync initialization into the Starlette lifespan context manager.

Starlette Lifespan Integration:
- Moved background task startup from FastMCP lifespan to Starlette lifespan
- FastMCP lifespan only triggers on MCP session establishment
- Starlette lifespan runs on server startup (correct timing)
- Fixed module scoping issues with local imports (anyio_module, asyncio_module)
- Added conditional startup based on oauth_enabled flag

Scanner Fixes:
- Fixed NotesClient method: list_notes() → get_all_notes()
- Properly handle AsyncIterator with list comprehension
- Collects all notes before processing

Verified Working:
- Background tasks start successfully on server startup
- Scanner fetches notes from Nextcloud API
- Processor pool (3 workers) ready for document processing
- Health endpoint reports Qdrant status
- No startup errors

Phase 3 Complete:
- BasicAuth mode with vector sync fully functional
- Background tasks integrate cleanly with streamable-http transport
- Graceful shutdown with coordinated task cancellation

Related: ADR-007 Background Vector Database Synchronization

🤖 Generated with Claude Code (https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-08 21:20:26 +01:00

173 lines
5.2 KiB
Python

"""Scanner task for vector database synchronization.
Periodically scans enabled users' content and queues changed documents for processing.
"""
import asyncio
import logging
from dataclasses import dataclass
import anyio
from qdrant_client.models import FieldCondition, Filter, MatchValue
from nextcloud_mcp_server.client import NextcloudClient
from nextcloud_mcp_server.config import get_settings
from nextcloud_mcp_server.vector.qdrant_client import get_qdrant_client
logger = logging.getLogger(__name__)
@dataclass
class DocumentTask:
"""Document task for processing queue."""
user_id: str
doc_id: str
doc_type: str # "note", "file", "calendar"
operation: str # "index" or "delete"
modified_at: int
async def scanner_task(
document_queue: asyncio.Queue,
shutdown_event: anyio.Event,
wake_event: anyio.Event,
nc_client: NextcloudClient,
user_id: str,
):
"""
Periodic scanner that detects changed documents for enabled user.
For BasicAuth mode, scans a single user with credentials available at runtime.
Args:
document_queue: Queue to enqueue changed documents
shutdown_event: Event signaling shutdown
wake_event: Event to trigger immediate scan
nc_client: Authenticated Nextcloud client
user_id: User to scan
"""
logger.info(f"Scanner task started for user: {user_id}")
settings = get_settings()
while not shutdown_event.is_set():
try:
# Scan user documents
await scan_user_documents(
user_id=user_id,
document_queue=document_queue,
nc_client=nc_client,
)
except Exception as e:
logger.error(f"Scanner error: {e}", exc_info=True)
# Sleep until next interval or wake event
try:
with anyio.move_on_after(settings.vector_sync_scan_interval):
# Wait for wake event or shutdown (whichever comes first)
await wake_event.wait()
except anyio.get_cancelled_exc_class():
# Shutdown, exit loop
break
logger.info("Scanner task stopped")
async def scan_user_documents(
user_id: str,
document_queue: asyncio.Queue,
nc_client: NextcloudClient,
initial_sync: bool = False,
):
"""
Scan a single user's documents and queue changes.
Args:
user_id: User to scan
document_queue: Queue to enqueue changed documents
nc_client: Authenticated Nextcloud client
initial_sync: If True, queue all documents (first-time sync)
"""
logger.info(f"Scanning documents for user: {user_id}")
# Fetch all notes from Nextcloud
notes = [note async for note in nc_client.notes.get_all_notes()]
logger.debug(f"Found {len(notes)} notes for {user_id}")
if initial_sync:
# Queue everything on first sync
for note in notes:
await document_queue.put(
DocumentTask(
user_id=user_id,
doc_id=str(note["id"]),
doc_type="note",
operation="index",
modified_at=note["modified"],
)
)
logger.info(f"Queued {len(notes)} documents for initial sync: {user_id}")
return
# Get indexed state from Qdrant
qdrant_client = await get_qdrant_client()
scroll_result = await qdrant_client.scroll(
collection_name=get_settings().qdrant_collection,
scroll_filter=Filter(
must=[
FieldCondition(key="user_id", match=MatchValue(value=user_id)),
FieldCondition(key="doc_type", match=MatchValue(value="note")),
]
),
with_payload=["doc_id", "indexed_at"],
with_vectors=False,
limit=10000,
)
indexed_docs = {
point.payload["doc_id"]: point.payload["indexed_at"]
for point in scroll_result[0]
}
logger.debug(f"Found {len(indexed_docs)} indexed documents in Qdrant")
# Compare and queue changes
queued = 0
for note in notes:
doc_id = str(note["id"])
indexed_at = indexed_docs.get(doc_id)
# Queue if never indexed or modified since last index
if indexed_at is None or note["modified"] > indexed_at:
await document_queue.put(
DocumentTask(
user_id=user_id,
doc_id=doc_id,
doc_type="note",
operation="index",
modified_at=note["modified"],
)
)
queued += 1
# Check for deleted documents (in Qdrant but not in Nextcloud)
nextcloud_doc_ids = {str(note["id"]) for note in notes}
for doc_id in indexed_docs:
if doc_id not in nextcloud_doc_ids:
await document_queue.put(
DocumentTask(
user_id=user_id,
doc_id=doc_id,
doc_type="note",
operation="delete",
modified_at=0,
)
)
queued += 1
if queued > 0:
logger.info(f"Queued {queued} documents for incremental sync: {user_id}")
else:
logger.debug(f"No changes detected for {user_id}")