Compare commits

...

2 Commits

Author SHA1 Message Date
Chris Coutinho 54fdc8addc Merge remote-tracking branch 'origin/master' into feat/deck-vector-search 2025-12-14 00:23:16 +01:00
Chris Coutinho e0320e761c perf(deck): optimize card lookup by storing board_id/stack_id in metadata
Addresses reviewer feedback on PR #395 about O(n²) performance issue.

Changes:
- scanner.py: Add metadata field to DocumentTask with board_id/stack_id
- scanner.py: Populate metadata during deck card scanning (both initial and incremental sync)
- processor.py: Use metadata for O(1) card lookup via get_card() API when available
- processor.py: Fallback to iteration for legacy data without metadata
- context.py: Add _get_deck_metadata_from_qdrant() helper to retrieve metadata from Qdrant
- context.py: Use metadata for fast path lookup in chunk context expansion
- context.py: Add user_id parameter to _fetch_document_text() for metadata retrieval

Performance Impact:
- Before: O(boards × stacks × cards) iteration for each card lookup
- After: O(1) direct API call using stored board_id/stack_id
- Graceful degradation: Falls back to iteration for legacy data

Testing:
- All existing integration tests pass (test_deck_vector_search.py)
- Type checking passes with no new errors

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2025-12-14 00:23:12 +01:00
3 changed files with 209 additions and 84 deletions
+120 -33
View File
@@ -209,6 +209,64 @@ async def _get_file_path_from_qdrant(
return None
async def _get_deck_metadata_from_qdrant(
user_id: str, card_id: int
) -> dict[str, int] | None:
"""Retrieve board_id and stack_id for a deck card from Qdrant payload.
Args:
user_id: User ID who owns the card
card_id: Card ID
Returns:
Dictionary with board_id and stack_id, or None if not found
"""
try:
from qdrant_client.models import FieldCondition, Filter, MatchValue
from nextcloud_mcp_server.config import get_settings
from nextcloud_mcp_server.vector.qdrant_client import get_qdrant_client
qdrant_client = await get_qdrant_client()
settings = get_settings()
# Query for any chunk of this card (we just need metadata)
scroll_result = await qdrant_client.scroll(
collection_name=settings.get_collection_name(),
scroll_filter=Filter(
must=[
FieldCondition(key="user_id", match=MatchValue(value=user_id)),
FieldCondition(key="doc_id", match=MatchValue(value=card_id)),
FieldCondition(key="doc_type", match=MatchValue(value="deck_card")),
]
),
limit=1,
with_payload=["board_id", "stack_id"],
with_vectors=False,
)
if scroll_result[0]:
point = scroll_result[0][0]
board_id = point.payload.get("board_id")
stack_id = point.payload.get("stack_id")
if board_id is not None and stack_id is not None:
logger.debug(
f"Retrieved deck metadata for card {card_id}: "
f"board_id={board_id}, stack_id={stack_id}"
)
return {"board_id": int(board_id), "stack_id": int(stack_id)}
logger.debug(
f"Could not find deck metadata in Qdrant for card {card_id} "
f"(might be legacy data without board_id/stack_id)"
)
return None
except Exception as e:
logger.debug(f"Error querying Qdrant for deck metadata: {e}")
return None
@dataclass
class ChunkContext:
"""Expanded chunk with surrounding context and position markers.
@@ -394,7 +452,9 @@ async def get_chunk_with_context(
logger.debug(f"Resolved file_id {doc_id} to file_path {file_path}")
# Fetch full document text
full_text = await _fetch_document_text(nc_client, resolved_doc_id, doc_type)
full_text = await _fetch_document_text(
nc_client, resolved_doc_id, doc_type, user_id
)
if full_text is None:
logger.warning(
f"Could not fetch document text for {doc_type} {doc_id}, "
@@ -453,7 +513,7 @@ async def get_chunk_with_context(
async def _fetch_document_text(
nc_client: NextcloudClient, doc_id: str | int, doc_type: str
nc_client: NextcloudClient, doc_id: str | int, doc_type: str, user_id: str
) -> str | None:
"""Fetch full text content of a document.
@@ -546,44 +606,71 @@ async def _fetch_document_text(
return "\n".join(content_parts)
elif doc_type == "deck_card":
# Fetch card from Deck API
# Note: Deck API requires board_id and stack_id, but we don't store those
# We need to search through boards to find the card (same as processor.py)
boards = await nc_client.deck.get_boards()
card_found = False
# Try to get board_id/stack_id from Qdrant metadata (O(1) lookup)
# Otherwise fall back to iteration (legacy data)
card = None
deck_metadata = await _get_deck_metadata_from_qdrant(user_id, int(doc_id))
for board in boards:
if card_found:
break
# Skip deleted boards (soft delete: deletedAt > 0)
if board.deletedAt > 0:
logger.debug(
f"Skipping deleted board {board.id} while searching for card {doc_id}"
if deck_metadata:
# Fast path: Direct lookup with known board_id/stack_id
board_id = deck_metadata["board_id"]
stack_id = deck_metadata["stack_id"]
try:
card = await nc_client.deck.get_card(
board_id=board_id, stack_id=stack_id, card_id=int(doc_id)
)
logger.debug(
f"Retrieved deck card {doc_id} using metadata "
f"(board_id={board_id}, stack_id={stack_id})"
)
except Exception as e:
logger.warning(
f"Failed to fetch card with metadata (board_id={board_id}, "
f"stack_id={stack_id}, card_id={doc_id}): {e}, falling back to iteration"
)
continue
stacks = await nc_client.deck.get_stacks(board.id)
# Fallback: Iterate through all boards/stacks (for legacy data or if fast path failed)
if card is None:
boards = await nc_client.deck.get_boards()
card_found = False
for stack in stacks:
for board in boards:
if card_found:
break
if stack.cards:
for card in stack.cards:
if card.id == int(doc_id):
# Reconstruct full content as indexed: title + "\n\n" + description
# This ensures chunk offsets align with indexed content structure
content_parts = [card.title]
if card.description:
content_parts.append(card.description)
card_found = True
logger.debug(
f"Found deck card {doc_id} in board {board.id}, stack {stack.id}"
)
return "\n\n".join(content_parts)
# Card not found (might be archived or deleted)
logger.warning(f"Deck card {doc_id} not found in any board/stack")
return None
# Skip deleted boards (soft delete: deletedAt > 0)
if board.deletedAt > 0:
logger.debug(
f"Skipping deleted board {board.id} while searching for card {doc_id}"
)
continue
stacks = await nc_client.deck.get_stacks(board.id)
for stack in stacks:
if card_found:
break
if stack.cards:
for c in stack.cards:
if c.id == int(doc_id):
card = c
card_found = True
logger.debug(
f"Found deck card {doc_id} in board {board.id}, "
f"stack {stack.id} (fallback iteration)"
)
break
if not card_found:
logger.warning(f"Deck card {doc_id} not found in any board/stack")
return None
# Reconstruct full content as indexed: title + "\n\n" + description
# This ensures chunk offsets align with indexed content structure
content_parts = [card.title]
if card.description:
content_parts.append(card.description)
return "\n\n".join(content_parts)
else:
logger.warning(f"Unsupported doc_type for context expansion: {doc_type}")
return None
+84 -51
View File
@@ -314,62 +314,95 @@ async def _index_document(
content_type = None
elif doc_task.doc_type == "deck_card":
# Fetch card from Deck API
# Note: We need board_id and stack_id to fetch the card
# For now, we'll need to get all boards and find the card
# This is not optimal, but Deck API requires board_id and stack_id
boards = await nc_client.deck.get_boards()
card_found = False
# Use metadata from scanner if available (O(1) lookup)
# Otherwise fall back to iteration (legacy data)
card = None
board = None
stack = None
for board in boards:
if card_found:
break
# Skip deleted boards (soft delete: deletedAt > 0)
if board.deletedAt > 0:
continue
stacks = await nc_client.deck.get_stacks(board.id)
for stack in stacks:
if (
doc_task.metadata
and "board_id" in doc_task.metadata
and "stack_id" in doc_task.metadata
):
# Fast path: Direct lookup with known board_id/stack_id
board_id = doc_task.metadata["board_id"]
stack_id = doc_task.metadata["stack_id"]
try:
card = await nc_client.deck.get_card(
board_id=int(board_id),
stack_id=int(stack_id),
card_id=int(doc_task.doc_id),
)
# Fetch board and stack info for metadata
boards = await nc_client.deck.get_boards()
for b in boards:
if b.id == int(board_id):
board = b
stacks = await nc_client.deck.get_stacks(b.id)
for s in stacks:
if s.id == int(stack_id):
stack = s
break
break
except Exception as e:
logger.warning(
f"Failed to fetch card with metadata (board_id={board_id}, stack_id={stack_id}, card_id={doc_task.doc_id}): {e}, falling back to iteration"
)
# Fallback: Iterate through all boards/stacks (for legacy data or if fast path failed)
if card is None:
boards = await nc_client.deck.get_boards()
card_found = False
for b in boards:
if card_found:
break
if stack.cards:
for card in stack.cards:
if card.id == int(doc_task.doc_id):
# Build content from card title and description
content_parts = [card.title]
if card.description:
content_parts.append(card.description)
content = "\n\n".join(content_parts)
title = card.title
# Skip deleted boards (soft delete: deletedAt > 0)
if b.deletedAt > 0:
continue
stacks = await nc_client.deck.get_stacks(b.id)
for s in stacks:
if card_found:
break
if s.cards:
for c in s.cards:
if c.id == int(doc_task.doc_id):
card = c
board = b
stack = s
card_found = True
break
# Store deck-specific metadata
file_metadata = {
"board_id": board.id,
"board_title": board.title,
"stack_id": stack.id,
"stack_title": stack.title,
"card_type": card.type,
"duedate": (
card.duedate.isoformat()
if card.duedate
else None
),
"archived": card.archived,
"owner": (
card.owner.uid
if hasattr(card.owner, "uid")
else str(card.owner)
),
}
etag = card.etag or ""
file_path = None
content_bytes = None
content_type = None
card_found = True
break
if not card_found:
raise ValueError(
f"Deck card {doc_task.doc_id} not found in any board/stack"
)
if not card_found:
raise ValueError(
f"Deck card {doc_task.doc_id} not found in any board/stack"
)
# Build content from card title and description
content_parts = [card.title]
if card.description:
content_parts.append(card.description)
content = "\n\n".join(content_parts)
title = card.title
# Store deck-specific metadata
file_metadata = {
"board_id": board.id,
"board_title": board.title,
"stack_id": stack.id,
"stack_title": stack.title,
"card_type": card.type,
"duedate": (card.duedate.isoformat() if card.duedate else None),
"archived": card.archived,
"owner": (
card.owner.uid if hasattr(card.owner, "uid") else str(card.owner)
),
}
etag = card.etag or ""
file_path = None
content_bytes = None
content_type = None
elif doc_task.doc_type == "file":
# For files, doc_id is now the numeric file ID, file_path comes from DocumentTask
if not doc_task.file_path:
+5
View File
@@ -36,6 +36,9 @@ class DocumentTask:
operation: str # "index" or "delete"
modified_at: int
file_path: str | None = None # File path for files (when doc_id is file_id)
metadata: dict[str, int | str] | None = (
None # Additional metadata (e.g., board_id/stack_id for deck_card)
)
# Track documents potentially deleted (grace period before actual deletion)
@@ -874,6 +877,7 @@ async def scan_deck_cards(
doc_type="deck_card",
operation="index",
modified_at=modified_at,
metadata={"board_id": board.id, "stack_id": stack.id},
)
)
queued += 1
@@ -921,6 +925,7 @@ async def scan_deck_cards(
doc_type="deck_card",
operation="index",
modified_at=modified_at,
metadata={"board_id": board.id, "stack_id": stack.id},
)
)
queued += 1