From 00e72d24a620c3389c3a5aa21070a7e1d945ad44 Mon Sep 17 00:00:00 2001 From: Chris Coutinho Date: Fri, 14 Nov 2025 19:20:30 +0100 Subject: [PATCH 01/17] feat: Enable SSE transport for mcp service and update test fixtures MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Changes: - Remove streamable-http transport override from mcp service in docker-compose.yml - Service now uses CLI default SSE transport on /sse endpoint - Add create_mcp_client_session_sse() helper for SSE connections - Update nc_mcp_client fixture to use SSE transport - Fix unpacking for SSE client (yields 2 values vs 3 for streamable-http) Testing: - All 4 smoke tests pass with SSE transport - 32/34 affected tests pass (2 skipped for vector sync) - OAuth services remain on streamable-http (unchanged) Note: SSE transport is being deprecated in favor of streamable-http. This enables minimal validation testing before deprecation. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- docker-compose.yml | 1 - tests/conftest.py | 54 +++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 51 insertions(+), 4 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index 38f72db..0bc1016 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -69,7 +69,6 @@ services: mcp: build: . 
- command: ["--transport", "streamable-http"] restart: always depends_on: app: diff --git a/tests/conftest.py b/tests/conftest.py index f7355be..98bb2e4 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -9,6 +9,7 @@ import pytest from httpx import HTTPStatusError from mcp import ClientSession from mcp.client.session import RequestContext +from mcp.client.sse import sse_client from mcp.client.streamable_http import streamablehttp_client from mcp.types import ElicitRequestParams, ElicitResult, ErrorData @@ -165,6 +166,51 @@ async def create_mcp_client_session( logger.debug(f"{client_name} client session cleaned up successfully") +async def create_mcp_client_session_sse( + url: str, + token: str | None = None, + client_name: str = "MCP", + elicitation_callback: Any = None, +) -> AsyncGenerator[ClientSession, Any]: + """ + Factory function to create an MCP client session using SSE transport. + + Similar to create_mcp_client_session but uses SSE transport instead of streamable-http. + Uses native async context managers to ensure correct LIFO cleanup order. + + Args: + url: MCP server URL (e.g., "http://localhost:8000/sse") + token: Optional OAuth access token for Bearer authentication + client_name: Client name for logging (e.g., "Basic MCP (SSE)") + elicitation_callback: Optional callback for handling elicitation requests + + Yields: + Initialized MCP ClientSession + + Note: + SSE transport is being deprecated in favor of streamable-http. + This function exists for compatibility testing only. 
+ """ + logger.info(f"Creating SSE client for {client_name}") + + # Prepare headers with OAuth token if provided + headers = {"Authorization": f"Bearer {token}"} if token else None + + # Use native async with - Python ensures LIFO cleanup + # Cleanup order will be: ClientSession.__aexit__ -> sse_client.__aexit__ + # Note: sse_client yields only (read_stream, write_stream), not 3 values like streamablehttp_client + async with sse_client(url, headers=headers) as (read_stream, write_stream): + async with ClientSession( + read_stream, write_stream, elicitation_callback=elicitation_callback + ) as session: + await session.initialize() + logger.info(f"{client_name} client session initialized successfully") + yield session + + # Cleanup happens automatically in LIFO order - no exception suppression needed + logger.debug(f"{client_name} client session cleaned up successfully") + + @pytest.fixture(scope="session") async def nc_client(anyio_backend) -> AsyncGenerator[NextcloudClient, Any]: """ @@ -203,12 +249,14 @@ async def nc_client(anyio_backend) -> AsyncGenerator[NextcloudClient, Any]: @pytest.fixture(scope="session") async def nc_mcp_client(anyio_backend) -> AsyncGenerator[ClientSession, Any]: """ - Fixture to create an MCP client session for integration tests using streamable-http. + Fixture to create an MCP client session for integration tests using SSE transport. Uses anyio pytest plugin for proper async fixture handling. + + Note: SSE transport is being deprecated. This fixture uses SSE for compatibility testing. 
""" - async for session in create_mcp_client_session( - url="http://localhost:8000/mcp", client_name="Basic MCP" + async for session in create_mcp_client_session_sse( + url="http://localhost:8000/sse", client_name="Basic MCP (SSE)" ): yield session From 66a710913078accf19e419ae0beefb54c220332a Mon Sep 17 00:00:00 2001 From: Chris Coutinho Date: Fri, 14 Nov 2025 23:56:09 +0100 Subject: [PATCH 02/17] docs: Add ADR-012 for unified multi-algorithm search MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Proposes unified search architecture with client-configurable algorithm selection and weighting. Addresses the need for flexible search options beyond pure semantic search. Key features: - Four algorithms: semantic, keyword, fuzzy, hybrid - Client-configurable weights for hybrid search - Shared implementation between viz pane and MCP tools - Reciprocal Rank Fusion (RRF) for result combination - Backward compatible with existing nc_semantic_search() Implements designs from: - ADR-003: Hybrid search with RRF (previously unimplemented) - ADR-001: Token-based keyword search (previously unimplemented) Supersedes ADR-011's placeholder for "ADR-013: Hybrid Search" πŸ€– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- .../ADR-012-unified-multi-algorithm-search.md | 386 ++++++++++++++++++ 1 file changed, 386 insertions(+) create mode 100644 docs/ADR-012-unified-multi-algorithm-search.md diff --git a/docs/ADR-012-unified-multi-algorithm-search.md b/docs/ADR-012-unified-multi-algorithm-search.md new file mode 100644 index 0000000..a788a6a --- /dev/null +++ b/docs/ADR-012-unified-multi-algorithm-search.md @@ -0,0 +1,386 @@ +# ADR-012: Unified Multi-Algorithm Search with Client-Configurable Weighting + +## Status +Proposed + +## Context + +### Current State + +The Nextcloud MCP server currently provides semantic search via vector similarity (Qdrant), as designed in ADR-003 and implemented through ADR-007. 
However, users and MCP clients have limited control over search behavior: + +1. **Single algorithm only**: Only pure vector similarity search is available +2. **No algorithm selection**: MCP clients cannot choose between semantic, keyword, or fuzzy approaches +3. **No weighting control**: Clients cannot adjust the balance between different search methods +4. **Disconnected implementations**: Viz pane uses different search algorithms than MCP tools +5. **Limited flexibility**: No way to optimize search for different use cases (exact match vs. conceptual similarity) + +### User Needs + +Different search scenarios require different algorithms: + +- **Exact match queries**: "Find note titled 'Q1 Budget'" → keyword search preferred +- **Conceptual queries**: "What are my goals for next quarter?" → semantic search preferred +- **Typo-tolerant queries**: "Find note about kuberntes" → fuzzy search needed +- **Balanced queries**: "Find documentation about API endpoints" → hybrid search optimal + +Additionally, users need a **testing interface** (viz pane) to: +- Experiment with different search algorithms on their own documents +- Visualize search results and algorithm behavior +- Tune weights for optimal results +- Understand which algorithm works best for their queries + +### Technical Requirements + +1. **Unified interface**: Single MCP tool supporting multiple algorithms +2. **Client control**: MCP clients specify algorithm and weights via tool parameters +3. **Backward compatibility**: Existing `nc_semantic_search()` behavior preserved +4. **Shared implementation**: Viz pane and MCP tools use identical search algorithms +5. **User accessibility**: Viz pane available to all logged-in users with vector sync enabled +6. **Performance**: Minimal overhead for algorithm selection + +## Decision + +We will implement a **unified multi-algorithm search architecture** with the following components: + +### 1. 
Core Search Algorithms + +Four search algorithms will be available: + +#### a) Semantic Search (Vector Similarity) +- **Method**: Cosine distance in 768-dimensional embedding space +- **Implementation**: Qdrant `query_points` with user_id filtering +- **Use case**: Conceptual queries, finding related content +- **Current status**: Implemented in `nextcloud_mcp_server/server/semantic.py` + +#### b) Keyword Search (Token-Based) +- **Method**: Token matching with weighted scoring (from ADR-001) +- **Implementation**: Title matches weighted 3x higher than content +- **Use case**: Exact phrase matching, known titles +- **Current status**: Designed in ADR-001, not implemented + +#### c) Fuzzy Search (Character Overlap) +- **Method**: Simple character-based similarity (70% threshold) +- **Implementation**: Character set comparison (current viz pane approach) +- **Use case**: Typo tolerance, approximate matching +- **Current status**: Implemented in viz pane only + +#### d) Hybrid Search (Multi-Algorithm Fusion) +- **Method**: Reciprocal Rank Fusion (RRF) from ADR-003 +- **Implementation**: Parallel execution + score combination +- **Use case**: Balanced queries, general-purpose search +- **Current status**: Designed in ADR-003, not implemented + +### 2. Unified MCP Tool Interface + +```python +@mcp.tool() +@require_scopes("semantic:read") +async def nc_semantic_search( + query: str, + ctx: Context, + limit: int = 10, + score_threshold: float = 0.7, + algorithm: Literal["semantic", "keyword", "fuzzy", "hybrid"] = "hybrid", + semantic_weight: float = 0.5, + keyword_weight: float = 0.3, + fuzzy_weight: float = 0.2, +) -> SearchResponse: + """ + Search Nextcloud content using configurable algorithms. 
+ + Args: + query: Natural language search query + ctx: MCP context for authentication + limit: Maximum results to return + score_threshold: Minimum similarity score (semantic/hybrid only) + algorithm: Search algorithm to use + semantic_weight: Weight for semantic results (hybrid only, default: 0.5) + keyword_weight: Weight for keyword results (hybrid only, default: 0.3) + fuzzy_weight: Weight for fuzzy results (hybrid only, default: 0.2) + + Returns: + Ranked search results with scores and excerpts + """ +``` + +**Key decisions**: +- **Single tool name**: Keep `nc_semantic_search` for backward compatibility +- **Algorithm parameter**: Explicit selection via enum +- **Weight parameters**: Client-configurable, only apply to hybrid mode +- **Validation**: Weights must sum to ≤1.0, enforced server-side +- **Defaults**: Hybrid mode with balanced weights (semantic 50%, keyword 30%, fuzzy 20%) + +### 3. Shared Algorithm Implementation + +Extract search algorithms into reusable module: + +``` +nextcloud_mcp_server/ +├── search/ +│ ├── __init__.py +│ ├── algorithms.py # Core search implementations +│ ├── semantic.py # Vector similarity search +│ ├── keyword.py # Token-based search (ADR-001) +│ ├── fuzzy.py # Character overlap search +│ └── hybrid.py # RRF fusion (ADR-003) +└── server/ + └── semantic.py # MCP tool wrapper +``` + +**Benefits**: +- Viz pane and MCP tools share identical implementations +- Testable in isolation +- Easy to add new algorithms (e.g., BM25, neural reranking) +- Clear separation of concerns + +### 4. Viz Pane Integration + +Update viz pane (`nextcloud_mcp_server/auth/userinfo_routes.py`) to: + +1. **Use shared algorithms**: Import from `search/algorithms.py` +2. **Remove client-side filtering**: Call server-side search methods +3. **User accessibility**: Available to all users with vector sync enabled +4. **Security**: Filter results by `user_id` (only show user's own documents) +5. 
**Interactive testing**: Allow users to: + - Select algorithm type + - Adjust weights (hybrid mode) + - Compare results across algorithms + - Visualize result distribution in 2D space + +### 5. Reciprocal Rank Fusion (RRF) for Hybrid Search + +Following ADR-003's design: + +```python +def reciprocal_rank_fusion( + results: dict[str, list[SearchResult]], + weights: dict[str, float], + k: int = 60 +) -> list[SearchResult]: + """ + Combine multiple ranked result lists using RRF. + + Args: + results: Dict of algorithm_name -> ranked results + weights: Dict of algorithm_name -> weight (0-1) + k: RRF constant (default: 60, standard value) + + Returns: + Combined and re-ranked results + """ + scores = defaultdict(float) + + for algo_name, algo_results in results.items(): + weight = weights.get(algo_name, 0.0) + for rank, result in enumerate(algo_results, start=1): + # RRF formula: 1 / (k + rank) + rrf_score = weight / (k + rank) + scores[result.doc_id] += rrf_score + + # Sort by combined score, return top results + return sorted(scores.items(), key=lambda x: x[1], reverse=True) +``` + +**RRF properties**: +- **Rank-based**: Uses position, not raw scores (handles score scale differences) +- **Proven effective**: Standard approach in information retrieval +- **Configurable**: `k` parameter controls rank decay (default: 60) +- **Weight support**: Allows algorithm-specific importance + +## Implementation Plan + +### Phase 1: Extract and Unify Algorithms (Week 1) + +1. Create `nextcloud_mcp_server/search/` module +2. Implement `algorithms.py` with base interface +3. Extract semantic search logic from `server/semantic.py` +4. Implement keyword search from ADR-001 design +5. Extract fuzzy search from viz pane +6. Implement RRF hybrid search from ADR-003 +7. Add comprehensive unit tests for each algorithm + +### Phase 2: Update MCP Tool (Week 1-2) + +1. Add `algorithm` parameter to `nc_semantic_search()` +2. Add weight parameters (`semantic_weight`, etc.) +3. 
Implement algorithm dispatcher +4. Add parameter validation (weights sum ≤1.0) +5. Update response model to include algorithm metadata +6. Maintain backward compatibility (default: hybrid) +7. Add integration tests for all algorithm modes + +### Phase 3: Update Viz Pane (Week 2) + +1. Remove client-side search filtering +2. Call shared `search/algorithms.py` methods +3. Add user_id filtering for multi-user security +4. Add algorithm selector dropdown +5. Add weight adjustment controls (sliders) +6. Update visualization to show algorithm-specific metadata +7. Add side-by-side comparison mode + +### Phase 4: Documentation and Testing (Week 2-3) + +1. Update MCP tool documentation +2. Add algorithm selection guide +3. Document weight tuning recommendations +4. Add end-to-end tests (MCP + viz pane) +5. Performance benchmarks for each algorithm +6. Update CLAUDE.md with search patterns + +## Consequences + +### Positive + +1. **Flexibility**: MCP clients can optimize search for their use case +2. **Unified implementation**: Single source of truth for search algorithms +3. **User empowerment**: Viz pane enables query testing and tuning +4. **Backward compatible**: Existing semantic search behavior preserved +5. **Extensible**: Easy to add new algorithms (BM25, neural reranking) +6. **Testable**: Each algorithm can be unit tested independently +7. **Standards-based**: RRF is proven in production systems + +### Negative + +1. **Complexity**: More parameters for clients to understand +2. **API surface**: Larger tool signature (8 parameters) +3. **Performance**: Hybrid search requires multiple queries +4. **Validation overhead**: Weight validation adds processing +5. **Documentation burden**: Need to explain when to use each algorithm + +### Neutral + +1. **Weight defaults**: May need tuning based on user feedback +2. **Algorithm performance**: Will vary by content type and query +3. 
**Viz pane adoption**: Unknown if users will utilize testing interface + +## Alternatives Considered + +### Alternative 1: Separate Tools Per Algorithm + +```python +@mcp.tool() +async def nc_semantic_search(query: str, ctx: Context, ...) -> SearchResponse: + """Pure vector similarity search.""" + +@mcp.tool() +async def nc_keyword_search(query: str, ctx: Context, ...) -> SearchResponse: + """Pure keyword matching.""" + +@mcp.tool() +async def nc_hybrid_search(query: str, ctx: Context, weights: dict, ...) -> SearchResponse: + """Hybrid search with weights.""" +``` + +**Rejected because**: +- API proliferation (3+ tools instead of 1) +- Harder to discover capabilities +- Backward compatibility issues +- DRY violation (repeated parameters) + +### Alternative 2: Server-Wide Configuration Only + +```python +# .env configuration +SEARCH_ALGORITHM=hybrid +SEMANTIC_WEIGHT=0.5 +KEYWORD_WEIGHT=0.3 +FUZZY_WEIGHT=0.2 +``` + +**Rejected because**: +- No per-query flexibility +- MCP clients cannot optimize for different tasks +- Requires server restart for changes +- User's requirement: "expose a way for users to override the default weights" + +### Alternative 3: Production-Grade Fuzzy (Levenshtein/RapidFuzz) + +**Rejected because**: +- Adds external dependency +- Simple character overlap performs adequately +- Can always upgrade later if needed +- User's preference: "Keep simple character overlap" + +## Related ADRs + +- **ADR-001**: Enhanced Note Search (keyword algorithm design) +- **ADR-003**: Vector Database and Semantic Search (hybrid search + RRF design) +- **ADR-007**: Background Vector Sync (semantic search implementation) +- **ADR-008**: MCP Sampling for RAG (uses semantic search results) +- **ADR-009**: Semantic Search OAuth Scope (security model) +- **ADR-011**: Improving Semantic Search Quality (mentions future "ADR-013" for hybrid search) + +**This ADR supersedes**: +- ADR-011's placeholder for "ADR-013: Hybrid Search" + +**This ADR implements**: +- ADR-003's 
hybrid search design (previously unimplemented) +- ADR-001's keyword search design (previously unimplemented) + +## References + +- **Reciprocal Rank Fusion**: Cormack, G. V., Clarke, C. L., & Buettcher, S. (2009). "Reciprocal rank fusion outperforms condorcet and individual rank learning methods." SIGIR '09. +- **Vector Search**: Malkov, Y. A., & Yashunin, D. A. (2018). "Efficient and robust approximate nearest neighbor search using Hierarchical Navigable Small World graphs." TPAMI. +- **Hybrid Search Best Practices**: Qdrant documentation on hybrid search patterns +- **MCP Protocol**: Model Context Protocol specification for tool design + +## Implementation Notes + +### Weight Validation + +```python +def validate_weights( + semantic_weight: float, + keyword_weight: float, + fuzzy_weight: float +) -> None: + """Validate hybrid search weights.""" + if semantic_weight < 0 or keyword_weight < 0 or fuzzy_weight < 0: + raise ValueError("Weights must be non-negative") + + total = semantic_weight + keyword_weight + fuzzy_weight + if total > 1.0: + raise ValueError(f"Weights sum to {total:.2f}, must be ≤1.0") + + if total == 0.0: + raise ValueError("At least one weight must be > 0") +``` + +### Backward Compatibility + +The default behavior (`algorithm="hybrid"` with balanced weights) provides better results than current pure semantic search, while maintaining the same tool name and signature structure. Existing clients will automatically benefit from hybrid search without code changes. + +### Performance Considerations + +- **Semantic search**: ~50-200ms (vector DB query) +- **Keyword search**: ~10-50ms (in-memory token matching) +- **Fuzzy search**: ~20-100ms (character comparison) +- **Hybrid search**: ~100-300ms (parallel execution + fusion) + +Parallel execution of algorithms minimizes hybrid search latency. + +### Security Model + +All algorithms respect the same security boundaries: +1. **User filtering**: Qdrant queries filter by `user_id` +2. 
**Access verification**: Results verified via Nextcloud API +3. **OAuth scope**: `semantic:read` required for all algorithms +4. **Viz pane**: Shows only current user's documents + +## Success Metrics + +1. **Adoption**: % of MCP clients using algorithm parameter +2. **Performance**: Search latency percentiles (p50, p95, p99) +3. **Quality**: User satisfaction with result relevance +4. **Viz pane usage**: % of users accessing testing interface +5. **Weight distribution**: Most common weight configurations + +## Future Enhancements + +1. **Additional algorithms**: BM25, TF-IDF, neural reranking +2. **Auto-tuning**: Learn optimal weights per user +3. **Query analysis**: Automatic algorithm selection based on query +4. **Cross-app search**: Extend beyond notes to calendar, files, etc. +5. **Feedback loop**: Use click-through rate to improve weights From 5e67277049dec9489909e795d247b4a678d7fb2c Mon Sep 17 00:00:00 2001 From: Chris Coutinho Date: Sat, 15 Nov 2025 00:00:40 +0100 Subject: [PATCH 03/17] docs: Add architecture diagrams and viz pane UI to ADR-012 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Enhances ADR-012 with detailed architecture visualization and UI mockup for the vector visualization pane. Added sections: - Architecture diagram showing MCP tool and viz pane integration - Data flow diagrams for both MCP requests and viz pane interactions - Detailed UI mockup with ASCII art showing: * Search configuration controls * Algorithm selector with weight sliders * Interactive 2D scatter plot (Plotly.js) * Results panel with scores * Performance comparison table - Technology stack details (htmx, Alpine.js, Plotly.js, Tailwind CSS) The diagrams illustrate how the viz pane and MCP tool share the same search algorithm implementations from search/algorithms.py, ensuring consistency between user testing interface and programmatic API. 
πŸ€– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- .../ADR-012-unified-multi-algorithm-search.md | 191 ++++++++++++++++++ 1 file changed, 191 insertions(+) diff --git a/docs/ADR-012-unified-multi-algorithm-search.md b/docs/ADR-012-unified-multi-algorithm-search.md index a788a6a..1fc3738 100644 --- a/docs/ADR-012-unified-multi-algorithm-search.md +++ b/docs/ADR-012-unified-multi-algorithm-search.md @@ -43,6 +43,105 @@ Additionally, users need a **testing interface** (viz pane) to: We will implement a **unified multi-algorithm search architecture** with the following components: +### Architecture Diagram + +``` +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ MCP Client / User Browser β”‚ +β”‚ β”‚ +β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ +β”‚ β”‚ MCP Tool Call β”‚ β”‚ Viz Pane (Browser UI) β”‚ β”‚ +β”‚ β”‚ β”‚ β”‚ β”‚ β”‚ +β”‚ β”‚ nc_semantic_search( β”‚ β”‚ - Algorithm selector dropdown β”‚ β”‚ +β”‚ β”‚ query="kubernetes", β”‚ β”‚ - Weight adjustment sliders β”‚ β”‚ +β”‚ β”‚ algorithm="hybrid", β”‚ β”‚ - Interactive 2D scatter plot β”‚ β”‚ +β”‚ β”‚ semantic_weight=0.5, β”‚ β”‚ - Side-by-side comparison β”‚ β”‚ +β”‚ β”‚ keyword_weight=0.3, β”‚ β”‚ - Real-time search testing β”‚ β”‚ +β”‚ β”‚ fuzzy_weight=0.2 β”‚ β”‚ β”‚ β”‚ +β”‚ β”‚ ) β”‚ β”‚ β”‚ β”‚ +β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ 
+β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β”‚ β”‚ + β”‚ MCP Protocol β”‚ HTTPS (htmx) + β”‚ β”‚ +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β–Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β–Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ MCP Server (/app endpoint) β”‚ +β”‚ β”‚ +β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ +β”‚ β”‚ Unified Search Interface (server/semantic.py) β”‚ β”‚ +β”‚ β”‚ β”‚ β”‚ +β”‚ β”‚ @mcp.tool() nc_semantic_search(algorithm, weights...) β”‚ β”‚ +β”‚ β”‚ β”œβ”€ Validate parameters (weights sum ≀1.0) β”‚ β”‚ +β”‚ β”‚ β”œβ”€ Dispatch to algorithm selector β”‚ β”‚ +β”‚ β”‚ └─ Return ranked SearchResponse β”‚ β”‚ +β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ +β”‚ β”‚ β”‚ +β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β–Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ +β”‚ β”‚ Algorithm Dispatcher (search/algorithms.py) β”‚ β”‚ +β”‚ β”‚ β”‚ β”‚ +β”‚ β”‚ if algorithm == "semantic": β†’ semantic.py β”‚ β”‚ +β”‚ β”‚ if algorithm == "keyword": β†’ keyword.py β”‚ β”‚ +β”‚ β”‚ if algorithm == "fuzzy": β†’ fuzzy.py β”‚ β”‚ +β”‚ β”‚ if algorithm == "hybrid": β†’ hybrid.py (RRF fusion) β”‚ β”‚ +β”‚ 
β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ +β”‚ β”‚ +β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ +β”‚ β”‚ semantic.py β”‚ β”‚ keyword.py β”‚ β”‚ fuzzy.py β”‚ β”‚ +β”‚ β”‚ β”‚ β”‚ β”‚ β”‚ β”‚ β”‚ +β”‚ β”‚ β€’ Query Qdrant β”‚ β”‚ β€’ Token matching β”‚ β”‚ β€’ Char overlap β”‚ β”‚ +β”‚ β”‚ β€’ Cosine dist β”‚ β”‚ β€’ Title weight β”‚ β”‚ β€’ 70% threshold β”‚ β”‚ +β”‚ β”‚ β€’ Score β‰₯0.7 β”‚ β”‚ β€’ ADR-001 logic β”‚ β”‚ β€’ Simple impl β”‚ β”‚ +β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ +β”‚ β”‚ β”‚ β”‚ β”‚ +β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ +β”‚ β”‚ β”‚ +β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β–Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ +β”‚ β”‚ hybrid.py (Reciprocal Rank Fusion) β”‚ β”‚ +β”‚ β”‚ β”‚ β”‚ +β”‚ β”‚ 1. Run algorithms in parallel (semantic, keyword, fuzzy) β”‚ β”‚ +β”‚ β”‚ 2. Collect ranked results from each β”‚ β”‚ +β”‚ β”‚ 3. Apply RRF formula: score = weight / (k + rank) β”‚ β”‚ +β”‚ β”‚ 4. Combine scores across algorithms β”‚ β”‚ +β”‚ β”‚ 5. 
Re-rank by combined score β”‚ β”‚ +β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β”‚ + β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” + β”‚ β”‚ + β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β–Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β–Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” + β”‚ Qdrant Vector DB β”‚ β”‚ Nextcloud APIs β”‚ + β”‚ β”‚ β”‚ β”‚ + β”‚ β€’ Vector search β”‚ β”‚ β€’ Access verificationβ”‚ + β”‚ β€’ user_id filter β”‚ β”‚ β€’ Full metadata fetchβ”‚ + β”‚ β€’ Score threshold β”‚ β”‚ β€’ Permission checks β”‚ + β”‚ β€’ 768-dim embeddingsβ”‚ β”‚ β”‚ + β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ +``` + +### Data Flow + +#### MCP Tool Request +``` +1. Client calls nc_semantic_search(query, algorithm="hybrid", weights...) +2. Server validates parameters (weights sum ≀1.0) +3. Dispatcher routes to hybrid.py +4. Hybrid search runs semantic, keyword, fuzzy in parallel +5. RRF combines results with weighted scores +6. Access verification via Nextcloud API +7. Return ranked SearchResponse to client +``` + +#### Viz Pane Request +``` +1. User navigates to /app (Vector Visualization tab) +2. Browser loads vector-viz fragment via htmx +3. User adjusts algorithm selector and weight sliders +4. JavaScript calls same search/algorithms.py backend +5. PCA reduces vectors to 2D for visualization +6. Plotly.js renders interactive scatter plot +7. 
Matching results highlighted, non-matches grayed out +``` + ### 1. Core Search Algorithms Four search algorithms will be available: @@ -148,6 +247,98 @@ Update viz pane (`nextcloud_mcp_server/auth/userinfo_routes.py`) to: - Compare results across algorithms - Visualize result distribution in 2D space +#### Viz Pane UI Components + +``` +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ Vector Visualization [Status] β”‚ +β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€ +β”‚ β”‚ +β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ +β”‚ β”‚ Search Configuration β”‚ β”‚ +β”‚ β”‚ β”‚ β”‚ +β”‚ β”‚ Query: [_______________________________________________] [Search]β”‚ β”‚ +β”‚ β”‚ β”‚ β”‚ +β”‚ β”‚ Algorithm: [Hybrid β–Ό] [Semantic] [Keyword] [Fuzzy] β”‚ β”‚ +β”‚ β”‚ β”‚ β”‚ +β”‚ β”‚ Weights (Hybrid Mode): β”‚ β”‚ +β”‚ β”‚ Semantic: [========50========] 0.5 β”‚ β”‚ +β”‚ β”‚ Keyword: [======30====== ] 0.3 β”‚ β”‚ +β”‚ β”‚ Fuzzy: [====20==== ] 0.2 β”‚ β”‚ +β”‚ β”‚ β”‚ β”‚ +β”‚ β”‚ Document Types: β˜‘ Notes β˜‘ Files β˜‘ Calendar β˜‘ Contacts β”‚ β”‚ +β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ +β”‚ β”‚ +β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” 
β”‚ +β”‚ β”‚ Vector Space Visualization (PCA 2D Projection) β”‚ β”‚ +β”‚ β”‚ β”‚ β”‚ +β”‚ β”‚ β–² β”‚ β”‚ +β”‚ β”‚ PC2 β”‚ ● ● ● πŸ”΅ Matching results (full opacity) β”‚ β”‚ +β”‚ β”‚ β”‚ ● ● ● βšͺ Non-matching results (40% opacity) β”‚ β”‚ +β”‚ β”‚ β”‚ πŸ”΅ ● ● β”‚ β”‚ +β”‚ β”‚ β”‚ ● πŸ”΅ ● Hover: Show document title + excerpt β”‚ β”‚ +β”‚ β”‚ β”‚ ● ● πŸ”΅ ● Click: Open document in Nextcloud β”‚ β”‚ +β”‚ β”‚ β”€β”€β”€β”€β”Όβ”€β”€β—β”€πŸ”΅β”€β”€β—β”€β—β”€β”€β”€β”€β–Ί PC1 β”‚ β”‚ +β”‚ β”‚ β”‚ ● ● ● β”‚ β”‚ +β”‚ β”‚ β”‚ πŸ”΅ ● ● Explained Variance: β”‚ β”‚ +β”‚ β”‚ β”‚ ● ● ● PC1: 23.4% | PC2: 18.7% β”‚ β”‚ +β”‚ β”‚ β”‚ ● ● β”‚ β”‚ +β”‚ β”‚ β”‚ β”‚ +β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ +β”‚ β”‚ +β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ +β”‚ β”‚ Search Results (12 matching documents) β”‚ β”‚ +β”‚ β”‚ β”‚ β”‚ +β”‚ β”‚ πŸ”΅ Kubernetes Setup Guide Score: 0.87 β”‚ β”‚ +β”‚ β”‚ "...configure kubectl to connect to cluster..." β”‚ β”‚ +β”‚ β”‚ [Open in Nextcloud] β”‚ β”‚ +β”‚ β”‚ β”‚ β”‚ +β”‚ β”‚ πŸ”΅ Container Orchestration Notes Score: 0.82 β”‚ β”‚ +β”‚ β”‚ "...deployment strategies for kubernetes..." β”‚ β”‚ +β”‚ β”‚ [Open in Nextcloud] β”‚ β”‚ +β”‚ β”‚ β”‚ β”‚ +β”‚ β”‚ πŸ”΅ K8s Troubleshooting Score: 0.79 β”‚ β”‚ +β”‚ β”‚ "...common kuberntes errors and solutions..." β”‚ β”‚ +β”‚ β”‚ [Open in Nextcloud] β”‚ β”‚ +β”‚ β”‚ β”‚ β”‚ +β”‚ β”‚ [Show More Results...] 
β”‚ β”‚ +β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ +β”‚ β”‚ +β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ +β”‚ β”‚ Algorithm Performance Comparison β”‚ β”‚ +β”‚ β”‚ β”‚ β”‚ +β”‚ β”‚ Algorithm β”‚ Results β”‚ Avg Score β”‚ Time (ms) β”‚ Precision β”‚ β”‚ +β”‚ β”‚ ─────────────┼─────────┼───────────┼───────────┼─────────── β”‚ β”‚ +β”‚ β”‚ Semantic β”‚ 45 β”‚ 0.78 β”‚ 145ms β”‚ β–ˆβ–ˆβ–ˆβ–ˆβ–‘ 0.82 β”‚ β”‚ +β”‚ β”‚ Keyword β”‚ 23 β”‚ 0.91 β”‚ 42ms β”‚ β–ˆβ–ˆβ–ˆβ–‘β–‘ 0.67 β”‚ β”‚ +β”‚ β”‚ Fuzzy β”‚ 67 β”‚ 0.72 β”‚ 89ms β”‚ β–ˆβ–ˆβ–‘β–‘β–‘ 0.45 β”‚ β”‚ +β”‚ β”‚ Hybrid (RRF) β”‚ 52 β”‚ 0.84 β”‚ 198ms β”‚ β–ˆβ–ˆβ–ˆβ–ˆβ–ˆ 0.89 β”‚ β”‚ +β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ +``` + +**Key UI Features**: + +1. **Search Input**: Real-time query testing with instant visualization +2. **Algorithm Selector**: Dropdown + quick-select buttons +3. **Weight Sliders**: Visual adjustment with live preview (hybrid mode only) +4. **Document Type Filters**: Checkboxes for notes, files, calendar, contacts +5. 
**2D Scatter Plot**: Interactive Plotly.js visualization + - Blue dots = matching documents (full opacity) + - Gray dots = non-matching documents (40% opacity) + - Hover = show title + excerpt tooltip + - Click = open document in Nextcloud + - Zoom/pan controls for exploration +6. **Results Panel**: Ranked list with scores and excerpts +7. **Performance Table**: Compare algorithm speed and accuracy +8. **Explained Variance**: Show how much information PCA preserves + +**Technology Stack**: +- **Frontend**: htmx for dynamic loading, Alpine.js for reactivity +- **Visualization**: Plotly.js for interactive scatter plots +- **Styling**: Tailwind CSS (consistent with existing /app UI) +- **Backend**: Shared `search/algorithms.py` implementation + ### 5. Reciprocal Rank Fusion (RRF) for Hybrid Search Following ADR-003's design: From 56bd85c0f7a700fbe4f941fd43328bc6331751d3 Mon Sep 17 00:00:00 2001 From: Chris Coutinho Date: Sat, 15 Nov 2025 00:02:54 +0100 Subject: [PATCH 04/17] docs: Emphasize server-side processing in ADR-012 viz pane MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Updates ADR-012 to clarify that all search and filtering operations must happen server-side, not in the browser. 
Key changes: - Enhanced viz pane data flow showing server-side processing - Added performance benefits section (384x bandwidth reduction) - Detailed server-side filtering approach: * Query execution via search/algorithms.py * User ID filtering (multi-tenant security) * Document type filtering * PCA reduction (768-dim → 2D) on server * Only 2D coordinates + metadata sent to client - Updated Phase 3 implementation plan: * Remove ALL client-side search logic * Implement /app/vector-viz server endpoint * htmx form submission for queries * Performance optimizations (caching, streaming) This ensures: - Minimal bandwidth usage (only 2 floats per doc vs 768) - Client handles only visualization, not computation - Can visualize 10,000+ documents without client lag - Raw vectors never leave server (security) - Same search logic as MCP tool (consistency) 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- .../ADR-012-unified-multi-algorithm-search.md | 76 ++++++++++++++----- 1 file changed, 59 insertions(+), 17 deletions(-) diff --git a/docs/ADR-012-unified-multi-algorithm-search.md b/docs/ADR-012-unified-multi-algorithm-search.md index 1fc3738..9acd85a 100644 --- a/docs/ADR-012-unified-multi-algorithm-search.md +++ b/docs/ADR-012-unified-multi-algorithm-search.md @@ -131,17 +131,34 @@ We will implement a **unified multi-algorithm search architecture** with the fol 7. Return ranked SearchResponse to client ``` -#### Viz Pane Request +#### Viz Pane Request (Server-Side Processing) ``` 1. User navigates to /app (Vector Visualization tab) 2. Browser loads vector-viz fragment via htmx -3. User adjusts algorithm selector and weight sliders -4. JavaScript calls same search/algorithms.py backend -5. PCA reduces vectors to 2D for visualization -6. Plotly.js renders interactive scatter plot -7. Matching results highlighted, non-matches grayed out +3. User enters query and adjusts algorithm/weights +4.
htmx sends request to /app/vector-viz endpoint +5. Server executes search via search/algorithms.py: + - Filters by user_id (multi-tenant security) + - Applies selected algorithm (semantic/keyword/fuzzy/hybrid) + - Filters by document type (notes/files/calendar/contacts) + - Retrieves matching results + metadata +6. Server performs PCA reduction (768-dim → 2D): + - Converts matching results to 2D coordinates + - Only sends coordinates + metadata (not full vectors) + - Dramatically reduces bandwidth (e.g., 768 floats → 2 floats per doc) +7. Server returns JSON: {results: [...], coordinates_2d: [...], stats: {...}} +8. Browser receives lightweight response +9. Plotly.js renders interactive scatter plot +10. Matching results highlighted (blue), non-matches grayed (40% opacity) ``` +**Performance Benefits of Server-Side Processing**: +- **Bandwidth reduction**: ~384x less data (2 floats vs 768 floats per document) +- **Client efficiency**: Browser only handles visualization, not computation +- **Scalability**: Can visualize 10,000+ documents without client-side lag +- **Security**: Raw vectors never leave server +- **Consistency**: Same search logic as MCP tool (no drift) + ### 1. Core Search Algorithms Four search algorithms will be available: @@ -238,10 +255,19 @@ nextcloud_mcp_server/ Update viz pane (`nextcloud_mcp_server/auth/userinfo_routes.py`) to: 1. **Use shared algorithms**: Import from `search/algorithms.py` -2. **Remove client-side filtering**: Call server-side search methods -3. **User accessibility**: Available to all users with vector sync enabled -4. **Security**: Filter results by `user_id` (only show user's own documents) -5. **Interactive testing**: Allow users to: +2.
**Server-side filtering**: All search and filtering operations happen server-side + - Query execution via shared search backend + - Document type filtering (notes, files, calendar, contacts) + - User ID filtering for multi-tenant security + - Only matching results + metadata sent to client + - Reduces bandwidth and improves performance +3. **PCA reduction**: Server performs dimensionality reduction (768-dim → 2D) + - Only 2D coordinates sent to browser for visualization + - Dramatically reduces data transfer vs sending full vectors + - Enables visualization of large document collections +4. **User accessibility**: Available to all users with vector sync enabled +5. **Security**: Filter results by `user_id` (only show user's own documents) +6. **Interactive testing**: Allow users to: - Select algorithm type - Adjust weights (hybrid mode) - Compare results across algorithms @@ -403,13 +429,29 @@ def reciprocal_rank_fusion( ### Phase 3: Update Viz Pane (Week 2) -1. Remove client-side search filtering -2. Call shared `search/algorithms.py` methods -3. Add user_id filtering for multi-user security -4. Add algorithm selector dropdown -5. Add weight adjustment controls (sliders) -6. Update visualization to show algorithm-specific metadata -7. Add side-by-side comparison mode +**Critical: All processing must happen server-side** + +1. **Remove client-side search filtering** + - Delete JavaScript-based keyword/fuzzy matching + - Remove client-side document type filtering + - No search logic in browser +2. **Implement server-side endpoint** (`/app/vector-viz`) + - Accept query, algorithm, weights, doc_type filters + - Execute search via `search/algorithms.py` + - Filter results by user_id (security) + - Perform PCA reduction (768-dim → 2D) + - Return JSON with 2D coordinates + metadata only +3.
**Update frontend** + - htmx form submission to `/app/vector-viz` + - Algorithm selector dropdown + - Weight adjustment sliders (htmx updates on change) + - Document type checkboxes + - Plotly.js visualization of server response +4. **Performance optimization** + - Limit results to user's documents only + - Cache PCA transformation (invalidate on new vectors) + - Stream large result sets if needed + - Add loading indicators for server processing ### Phase 4: Documentation and Testing (Week 2-3) From 11e620f2d17bb4366802e0c991f45ce33dca03e7 Mon Sep 17 00:00:00 2001 From: Chris Coutinho Date: Sat, 15 Nov 2025 00:10:19 +0100 Subject: [PATCH 05/17] feat: Implement unified search algorithm module MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Creates shared search module with four algorithms implementing ADR-012: - Semantic search (vector similarity via Qdrant) - Keyword search (token-based matching from ADR-001) - Fuzzy search (character overlap matching) - Hybrid search (RRF fusion from ADR-003) Architecture: - Base SearchAlgorithm interface for consistent API - SearchResult dataclass for unified result format - All algorithms async and independently testable - Proper logging and error handling throughout Semantic Search (search/semantic.py): - Extracted from server/semantic.py - Vector similarity using Qdrant query_points - Dual-phase authorization (vector filter + API verification) - Deduplication of document chunks - Configurable score threshold (default: 0.7) Keyword Search (search/keyword.py): - Implements ADR-001 token-based matching - Title matches weighted 3x higher than content - Case-insensitive token matching - Relevance scoring with normalization - Excerpt extraction with context Fuzzy Search (search/fuzzy.py): - Simple character overlap calculation - Configurable threshold (default: 70%) - Typo-tolerant matching - Fast and dependency-free Hybrid Search (search/hybrid.py): - Reciprocal Rank Fusion (RRF) from 
ADR-003 - Parallel execution of sub-algorithms - Configurable weights per algorithm - RRF constant k=60 (standard value) - Weight validation (must sum ≤1.0) All algorithms: - Share NextcloudClient for document access - Support user_id filtering (multi-tenant) - Support doc_type filtering (currently notes only) - Return consistent SearchResult objects - Properly formatted with ruff and type-checked Next steps: Update MCP tool to use these algorithms 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- nextcloud_mcp_server/search/__init__.py | 26 +++ nextcloud_mcp_server/search/algorithms.py | 87 ++++++++ nextcloud_mcp_server/search/fuzzy.py | 174 ++++++++++++++++ nextcloud_mcp_server/search/hybrid.py | 240 ++++++++++++++++++++++ nextcloud_mcp_server/search/keyword.py | 225 ++++++++++++++++++++ nextcloud_mcp_server/search/semantic.py | 229 +++++++++++++++++++++ 6 files changed, 981 insertions(+) create mode 100644 nextcloud_mcp_server/search/__init__.py create mode 100644 nextcloud_mcp_server/search/algorithms.py create mode 100644 nextcloud_mcp_server/search/fuzzy.py create mode 100644 nextcloud_mcp_server/search/hybrid.py create mode 100644 nextcloud_mcp_server/search/keyword.py create mode 100644 nextcloud_mcp_server/search/semantic.py diff --git a/nextcloud_mcp_server/search/__init__.py b/nextcloud_mcp_server/search/__init__.py new file mode 100644 index 0000000..1da5a84 --- /dev/null +++ b/nextcloud_mcp_server/search/__init__.py @@ -0,0 +1,26 @@ +"""Search algorithms module for unified multi-algorithm search. + +This module provides a unified interface for different search algorithms: +- Semantic search (vector similarity) +- Keyword search (token-based matching) +- Fuzzy search (character overlap) +- Hybrid search (RRF fusion of multiple algorithms) + +All algorithms share the same interface and can be used interchangeably by both +MCP tools and the visualization pane.
+""" + +from nextcloud_mcp_server.search.algorithms import SearchAlgorithm, SearchResult +from nextcloud_mcp_server.search.fuzzy import FuzzySearchAlgorithm +from nextcloud_mcp_server.search.hybrid import HybridSearchAlgorithm +from nextcloud_mcp_server.search.keyword import KeywordSearchAlgorithm +from nextcloud_mcp_server.search.semantic import SemanticSearchAlgorithm + +__all__ = [ + "SearchAlgorithm", + "SearchResult", + "SemanticSearchAlgorithm", + "KeywordSearchAlgorithm", + "FuzzySearchAlgorithm", + "HybridSearchAlgorithm", +] diff --git a/nextcloud_mcp_server/search/algorithms.py b/nextcloud_mcp_server/search/algorithms.py new file mode 100644 index 0000000..560e113 --- /dev/null +++ b/nextcloud_mcp_server/search/algorithms.py @@ -0,0 +1,87 @@ +"""Base interfaces and data structures for search algorithms.""" + +from abc import ABC, abstractmethod +from dataclasses import dataclass +from typing import Any + + +@dataclass +class SearchResult: + """A single search result with metadata and score. + + Attributes: + id: Document ID + doc_type: Document type (note, file, calendar, contact, etc.) + title: Document title + excerpt: Content excerpt showing match context + score: Relevance score (0.0-1.0, higher is better) + metadata: Additional algorithm-specific metadata + """ + + id: int + doc_type: str + title: str + excerpt: str + score: float + metadata: dict[str, Any] | None = None + + def __post_init__(self): + """Validate score is in valid range.""" + if not 0.0 <= self.score <= 1.0: + raise ValueError(f"Score must be between 0.0 and 1.0, got {self.score}") + + +class SearchAlgorithm(ABC): + """Abstract base class for search algorithms. + + All search algorithms must implement the search() method with consistent + interface, allowing them to be used interchangeably. 
+ """ + + @abstractmethod + async def search( + self, + query: str, + user_id: str, + limit: int = 10, + doc_type: str | None = None, + **kwargs: Any, + ) -> list[SearchResult]: + """Execute search with the given parameters. + + Args: + query: Search query string + user_id: User ID for multi-tenant filtering + limit: Maximum number of results to return + doc_type: Optional document type filter (note, file, calendar, etc.) + **kwargs: Algorithm-specific parameters + + Returns: + List of SearchResult objects ranked by relevance + + Raises: + McpError: If search fails or configuration is invalid + """ + pass + + @property + @abstractmethod + def name(self) -> str: + """Return algorithm name for identification.""" + pass + + @property + def supports_scoring(self) -> bool: + """Whether this algorithm provides meaningful relevance scores. + + Default: True. Override if algorithm doesn't support scoring. + """ + return True + + @property + def requires_vector_db(self) -> bool: + """Whether this algorithm requires vector database. + + Default: False. Override for semantic search. + """ + return False diff --git a/nextcloud_mcp_server/search/fuzzy.py b/nextcloud_mcp_server/search/fuzzy.py new file mode 100644 index 0000000..479459f --- /dev/null +++ b/nextcloud_mcp_server/search/fuzzy.py @@ -0,0 +1,174 @@ +"""Fuzzy search algorithm using character overlap matching.""" + +import logging +from typing import Any + +from nextcloud_mcp_server.client import NextcloudClient +from nextcloud_mcp_server.search.algorithms import SearchAlgorithm, SearchResult + +logger = logging.getLogger(__name__) + + +class FuzzySearchAlgorithm(SearchAlgorithm): + """Fuzzy search using simple character-based similarity. 
+ + Implements character overlap matching with configurable threshold: + - Compares character sets between query and text + - Requires configurable % character overlap to match (default: 70%) + - Tolerant to typos and minor variations + """ + + def __init__(self, threshold: float = 0.7): + """Initialize fuzzy search algorithm. + + Args: + threshold: Minimum character overlap ratio (0-1, default: 0.7) + """ + if not 0.0 <= threshold <= 1.0: + raise ValueError(f"Threshold must be between 0.0 and 1.0, got {threshold}") + self.threshold = threshold + + @property + def name(self) -> str: + return "fuzzy" + + async def search( + self, + query: str, + user_id: str, + limit: int = 10, + doc_type: str | None = None, + nextcloud_client: NextcloudClient | None = None, + **kwargs: Any, + ) -> list[SearchResult]: + """Execute fuzzy search using character overlap. + + Args: + query: Search query + user_id: User ID for filtering + limit: Maximum results to return + doc_type: Optional document type filter (currently only "note" supported) + nextcloud_client: NextcloudClient for fetching documents + **kwargs: Additional parameters (threshold override) + + Returns: + List of SearchResult objects ranked by character overlap score + + Raises: + ValueError: If nextcloud_client not provided + """ + if not nextcloud_client: + raise ValueError("FuzzySearch requires nextcloud_client parameter") + + threshold = kwargs.get("threshold", self.threshold) + + logger.info( + f"Fuzzy search: query='{query}', user={user_id}, " + f"limit={limit}, threshold={threshold}, doc_type={doc_type}" + ) + + # Currently only supports notes + if doc_type and doc_type != "note": + logger.warning(f"Fuzzy search not yet implemented for doc_type={doc_type}") + return [] + + # Fetch all notes for the user + notes = await nextcloud_client.notes.get_notes() + logger.debug(f"Fetched {len(notes)} notes for fuzzy search") + + # Score and filter notes + scored_notes = [] + query_lower = query.lower() + + for note in 
notes: + title = note.get("title", "") + content = note.get("content", "") + + # Check title match + title_score = self._calculate_char_overlap(query_lower, title.lower()) + + # Check content match + content_score = self._calculate_char_overlap(query_lower, content.lower()) + + # Use best score + best_score = max(title_score, content_score) + + if best_score >= threshold: + # Extract excerpt based on which matched better + if title_score >= content_score: + excerpt = f"Title match: {title}" + else: + excerpt = self._extract_excerpt(content, max_length=200) + + scored_notes.append( + SearchResult( + id=note["id"], + doc_type="note", + title=title or "Untitled", + excerpt=excerpt, + score=best_score, + metadata={ + "category": note.get("category", ""), + "modified": note.get("modified"), + "match_location": "title" + if title_score >= content_score + else "content", + }, + ) + ) + + # Sort by score (descending) and limit + scored_notes.sort(key=lambda x: x.score, reverse=True) + results = scored_notes[:limit] + + logger.info(f"Fuzzy search returned {len(results)} matching notes") + if results: + result_details = [ + f"note_{r.id} (score={r.score:.3f}, title='{r.title}')" + for r in results[:5] + ] + logger.debug(f"Top fuzzy results: {', '.join(result_details)}") + + return results + + def _calculate_char_overlap(self, query: str, text: str) -> float: + """Calculate character overlap ratio between query and text. + + Args: + query: Query string (normalized) + text: Text to compare (normalized) + + Returns: + Overlap ratio (0.0-1.0) + """ + if not query or not text: + return 0.0 + + # Convert to character sets + query_chars = set(query) + text_chars = set(text) + + # Calculate overlap + overlap = query_chars & text_chars + overlap_ratio = len(overlap) / len(query_chars) + + return overlap_ratio + + def _extract_excerpt(self, content: str, max_length: int = 200) -> str: + """Extract excerpt from content. 
+ + Args: + content: Full document content + max_length: Maximum excerpt length + + Returns: + Excerpt string + """ + if not content: + return "" + + excerpt = content[:max_length].strip() + if len(content) > max_length: + excerpt += "..." + + return excerpt diff --git a/nextcloud_mcp_server/search/hybrid.py b/nextcloud_mcp_server/search/hybrid.py new file mode 100644 index 0000000..a8778c8 --- /dev/null +++ b/nextcloud_mcp_server/search/hybrid.py @@ -0,0 +1,240 @@ +"""Hybrid search algorithm using Reciprocal Rank Fusion (RRF).""" + +import asyncio +import logging +from collections import defaultdict +from typing import Any + +from nextcloud_mcp_server.client import NextcloudClient +from nextcloud_mcp_server.search.algorithms import SearchAlgorithm, SearchResult +from nextcloud_mcp_server.search.fuzzy import FuzzySearchAlgorithm +from nextcloud_mcp_server.search.keyword import KeywordSearchAlgorithm +from nextcloud_mcp_server.search.semantic import SemanticSearchAlgorithm + +logger = logging.getLogger(__name__) + + +class HybridSearchAlgorithm(SearchAlgorithm): + """Hybrid search combining multiple algorithms using Reciprocal Rank Fusion. + + Implements RRF from ADR-003 to combine results from: + - Semantic search (vector similarity) + - Keyword search (token matching) + - Fuzzy search (character overlap) + + RRF formula: score = weight / (k + rank) + where k=60 (standard value) and rank is 1-indexed position. + """ + + DEFAULT_RRF_K = 60 # Standard RRF constant + + def __init__( + self, + semantic_weight: float = 0.5, + keyword_weight: float = 0.3, + fuzzy_weight: float = 0.2, + rrf_k: int = DEFAULT_RRF_K, + ): + """Initialize hybrid search with algorithm weights. 
+ + Args: + semantic_weight: Weight for semantic results (default: 0.5) + keyword_weight: Weight for keyword results (default: 0.3) + fuzzy_weight: Weight for fuzzy results (default: 0.2) + rrf_k: RRF constant for rank decay (default: 60) + + Raises: + ValueError: If weights are invalid + """ + # Validate weights + if semantic_weight < 0 or keyword_weight < 0 or fuzzy_weight < 0: + raise ValueError("Weights must be non-negative") + + total_weight = semantic_weight + keyword_weight + fuzzy_weight + if total_weight > 1.0: + raise ValueError(f"Weights sum to {total_weight:.2f}, must be ≀1.0") + + if total_weight == 0.0: + raise ValueError("At least one weight must be > 0") + + self.semantic_weight = semantic_weight + self.keyword_weight = keyword_weight + self.fuzzy_weight = fuzzy_weight + self.rrf_k = rrf_k + + # Initialize sub-algorithms + self.semantic = SemanticSearchAlgorithm() + self.keyword = KeywordSearchAlgorithm() + self.fuzzy = FuzzySearchAlgorithm() + + @property + def name(self) -> str: + return "hybrid" + + @property + def requires_vector_db(self) -> bool: + # Requires vector DB if semantic search has non-zero weight + return self.semantic_weight > 0 + + async def search( + self, + query: str, + user_id: str, + limit: int = 10, + doc_type: str | None = None, + nextcloud_client: NextcloudClient | None = None, + **kwargs: Any, + ) -> list[SearchResult]: + """Execute hybrid search using RRF to combine algorithms. 
+ + Args: + query: Search query + user_id: User ID for filtering + limit: Maximum results to return + doc_type: Optional document type filter + nextcloud_client: NextcloudClient for document access + **kwargs: Additional parameters passed to sub-algorithms + + Returns: + List of SearchResult objects ranked by RRF combined score + + Raises: + ValueError: If nextcloud_client not provided (needed for keyword/fuzzy) + """ + logger.info( + f"Hybrid search: query='{query}', user={user_id}, limit={limit}, " + f"weights=(semantic={self.semantic_weight}, keyword={self.keyword_weight}, " + f"fuzzy={self.fuzzy_weight})" + ) + + # Run algorithms in parallel + tasks = [] + algo_names = [] + + if self.semantic_weight > 0: + tasks.append( + self.semantic.search( + query, user_id, limit * 2, doc_type, nextcloud_client, **kwargs + ) + ) + algo_names.append("semantic") + + if self.keyword_weight > 0: + if not nextcloud_client: + raise ValueError("Hybrid search with keyword requires nextcloud_client") + tasks.append( + self.keyword.search( + query, user_id, limit * 2, doc_type, nextcloud_client, **kwargs + ) + ) + algo_names.append("keyword") + + if self.fuzzy_weight > 0: + if not nextcloud_client: + raise ValueError("Hybrid search with fuzzy requires nextcloud_client") + tasks.append( + self.fuzzy.search( + query, user_id, limit * 2, doc_type, nextcloud_client, **kwargs + ) + ) + algo_names.append("fuzzy") + + # Execute searches in parallel + results_list = await asyncio.gather(*tasks) + + # Build results dict + algo_results = {} + for algo_name, results in zip(algo_names, results_list): + algo_results[algo_name] = results + logger.debug(f"{algo_name} returned {len(results)} results") + + # Combine using RRF + combined_results = self._reciprocal_rank_fusion( + algo_results, + { + "semantic": self.semantic_weight, + "keyword": self.keyword_weight, + "fuzzy": self.fuzzy_weight, + }, + limit, + ) + + logger.info(f"Hybrid search returned {len(combined_results)} combined results") + if 
combined_results: + result_details = [ + f"{r.doc_type}_{r.id} (score={r.score:.3f}, title='{r.title}')" + for r in combined_results[:5] + ] + logger.debug(f"Top hybrid results: {', '.join(result_details)}") + + return combined_results + + def _reciprocal_rank_fusion( + self, + algo_results: dict[str, list[SearchResult]], + weights: dict[str, float], + limit: int, + ) -> list[SearchResult]: + """Combine multiple ranked result lists using RRF. + + Args: + algo_results: Dict of algorithm_name -> ranked results + weights: Dict of algorithm_name -> weight (0-1) + limit: Maximum results to return + + Returns: + Combined and re-ranked results + """ + # Track RRF scores per document + rrf_scores: dict[tuple[int, str], float] = defaultdict(float) + # Track best result object for each document + best_results: dict[tuple[int, str], SearchResult] = {} + + for algo_name, results in algo_results.items(): + weight = weights.get(algo_name, 0.0) + if weight == 0: + continue + + for rank, result in enumerate(results, start=1): + doc_key = (result.id, result.doc_type) + + # RRF formula: weight / (k + rank) + rrf_score = weight / (self.rrf_k + rank) + rrf_scores[doc_key] += rrf_score + + # Track best result object (prefer higher original scores) + if doc_key not in best_results: + best_results[doc_key] = result + elif result.score > best_results[doc_key].score: + best_results[doc_key] = result + + # Sort by combined RRF score + sorted_docs = sorted( + rrf_scores.items(), + key=lambda x: x[1], + reverse=True, + )[:limit] + + # Build final results with RRF scores + final_results = [] + for doc_key, rrf_score in sorted_docs: + result = best_results[doc_key] + + # Create new result with RRF score + # Keep original metadata but add RRF details + metadata = result.metadata or {} + metadata["rrf_score"] = rrf_score + metadata["original_score"] = result.score + + final_results.append( + SearchResult( + id=result.id, + doc_type=result.doc_type, + title=result.title, + excerpt=result.excerpt, 
+ score=rrf_score, # Use RRF score as the primary score + metadata=metadata, + ) + ) + + return final_results diff --git a/nextcloud_mcp_server/search/keyword.py b/nextcloud_mcp_server/search/keyword.py new file mode 100644 index 0000000..410a7a7 --- /dev/null +++ b/nextcloud_mcp_server/search/keyword.py @@ -0,0 +1,225 @@ +"""Keyword search algorithm using token-based matching (ADR-001).""" + +import logging +from typing import Any + +from nextcloud_mcp_server.client import NextcloudClient +from nextcloud_mcp_server.search.algorithms import SearchAlgorithm, SearchResult + +logger = logging.getLogger(__name__) + + +class KeywordSearchAlgorithm(SearchAlgorithm): + """Keyword search using token-based matching with weighted scoring. + + Implements token-based search from ADR-001: + - Title matches weighted 3x higher than content matches + - Case-insensitive token matching + - Relevance scoring based on match frequency and location + """ + + # Weighting constants from ADR-001 + TITLE_WEIGHT = 3.0 + CONTENT_WEIGHT = 1.0 + + @property + def name(self) -> str: + return "keyword" + + async def search( + self, + query: str, + user_id: str, + limit: int = 10, + doc_type: str | None = None, + nextcloud_client: NextcloudClient | None = None, + **kwargs: Any, + ) -> list[SearchResult]: + """Execute keyword search using token matching. 
+ + Args: + query: Search query to tokenize and match + user_id: User ID for filtering + limit: Maximum results to return + doc_type: Optional document type filter (currently only "note" supported) + nextcloud_client: NextcloudClient for fetching documents + **kwargs: Additional parameters (unused) + + Returns: + List of SearchResult objects ranked by keyword match score + + Raises: + ValueError: If nextcloud_client not provided + """ + if not nextcloud_client: + raise ValueError("KeywordSearch requires nextcloud_client parameter") + + logger.info( + f"Keyword search: query='{query}', user={user_id}, " + f"limit={limit}, doc_type={doc_type}" + ) + + # Tokenize query + query_tokens = self._process_query(query) + logger.debug(f"Query tokens: {query_tokens}") + + # Currently only supports notes + # TODO: Extend to other document types (files, calendar, etc.) + if doc_type and doc_type != "note": + logger.warning( + f"Keyword search not yet implemented for doc_type={doc_type}" + ) + return [] + + # Fetch all notes for the user + notes = await nextcloud_client.notes.get_notes() + logger.debug(f"Fetched {len(notes)} notes for keyword search") + + # Score and filter notes + scored_notes = [] + for note in notes: + score = self._calculate_score( + query_tokens, + note.get("title", ""), + note.get("content", ""), + ) + + if score > 0: # Only include matches + # Extract excerpt with context + excerpt = self._extract_excerpt( + note.get("content", ""), + query_tokens, + max_length=200, + ) + + scored_notes.append( + SearchResult( + id=note["id"], + doc_type="note", + title=note.get("title", "Untitled"), + excerpt=excerpt, + score=score, + metadata={ + "category": note.get("category", ""), + "modified": note.get("modified"), + }, + ) + ) + + # Sort by score (descending) and limit + scored_notes.sort(key=lambda x: x.score, reverse=True) + results = scored_notes[:limit] + + logger.info(f"Keyword search returned {len(results)} matching notes") + if results: + result_details = [ + 
f"note_{r.id} (score={r.score:.3f}, title='{r.title}')" + for r in results[:5] + ] + logger.debug(f"Top keyword results: {', '.join(result_details)}") + + return results + + def _process_query(self, query: str) -> list[str]: + """Tokenize and normalize query. + + Args: + query: Raw query string + + Returns: + List of normalized tokens + """ + # Convert to lowercase and split into tokens + tokens = query.lower().split() + + # Filter out very short tokens (optional) + tokens = [token for token in tokens if len(token) > 1] + + return tokens + + def _calculate_score( + self, query_tokens: list[str], title: str, content: str + ) -> float: + """Calculate relevance score based on token matches. + + Args: + query_tokens: List of query tokens + title: Document title + content: Document content + + Returns: + Relevance score (0.0-1.0) + """ + if not query_tokens: + return 0.0 + + # Process title and content + title_tokens = title.lower().split() + content_tokens = content.lower().split() + + score = 0.0 + + # Count matches in title + title_matches = sum(1 for qt in query_tokens if qt in title_tokens) + if query_tokens: # Avoid division by zero + title_match_ratio = title_matches / len(query_tokens) + score += self.TITLE_WEIGHT * title_match_ratio + + # Count matches in content + content_matches = sum(1 for qt in query_tokens if qt in content_tokens) + if query_tokens: + content_match_ratio = content_matches / len(query_tokens) + score += self.CONTENT_WEIGHT * content_match_ratio + + # Normalize score to 0-1 range + # Max score would be TITLE_WEIGHT + CONTENT_WEIGHT if all tokens match everywhere + max_score = self.TITLE_WEIGHT + self.CONTENT_WEIGHT + normalized_score = min(score / max_score, 1.0) + + return normalized_score + + def _extract_excerpt( + self, content: str, query_tokens: list[str], max_length: int = 200 + ) -> str: + """Extract excerpt showing match context. 
+ + Args: + content: Full document content + query_tokens: Query tokens to find + max_length: Maximum excerpt length in characters + + Returns: + Excerpt string with context around matches + """ + if not content: + return "" + + content_lower = content.lower() + + # Find first occurrence of any query token + first_match_pos = -1 + for token in query_tokens: + pos = content_lower.find(token) + if pos != -1: + if first_match_pos == -1 or pos < first_match_pos: + first_match_pos = pos + + if first_match_pos == -1: + # No matches found, return beginning + return content[:max_length].strip() + ( + "..." if len(content) > max_length else "" + ) + + # Extract context around match + start = max(0, first_match_pos - max_length // 2) + end = min(len(content), first_match_pos + max_length // 2) + + excerpt = content[start:end].strip() + + # Add ellipsis if truncated + if start > 0: + excerpt = "..." + excerpt + if end < len(content): + excerpt = excerpt + "..." + + return excerpt diff --git a/nextcloud_mcp_server/search/semantic.py b/nextcloud_mcp_server/search/semantic.py new file mode 100644 index 0000000..c6e632d --- /dev/null +++ b/nextcloud_mcp_server/search/semantic.py @@ -0,0 +1,229 @@ +"""Semantic search algorithm using vector similarity (Qdrant).""" + +import logging +from typing import Any + +from httpx import HTTPStatusError +from qdrant_client.models import FieldCondition, Filter, MatchValue + +from nextcloud_mcp_server.client import NextcloudClient +from nextcloud_mcp_server.config import get_settings +from nextcloud_mcp_server.embedding import get_embedding_service +from nextcloud_mcp_server.observability.metrics import record_qdrant_operation +from nextcloud_mcp_server.search.algorithms import SearchAlgorithm, SearchResult +from nextcloud_mcp_server.vector.qdrant_client import get_qdrant_client + +logger = logging.getLogger(__name__) + + +class SemanticSearchAlgorithm(SearchAlgorithm): + """Semantic search using vector similarity in Qdrant. 
+ + Searches documents by meaning rather than exact keywords using + 768-dimensional embeddings and cosine distance. + """ + + def __init__(self, score_threshold: float = 0.7): + """Initialize semantic search algorithm. + + Args: + score_threshold: Minimum similarity score (0-1, default: 0.7) + """ + self.score_threshold = score_threshold + + @property + def name(self) -> str: + return "semantic" + + @property + def requires_vector_db(self) -> bool: + return True + + async def search( + self, + query: str, + user_id: str, + limit: int = 10, + doc_type: str | None = None, + nextcloud_client: NextcloudClient | None = None, + **kwargs: Any, + ) -> list[SearchResult]: + """Execute semantic search using vector similarity. + + Args: + query: Natural language search query + user_id: User ID for filtering + limit: Maximum results to return + doc_type: Optional document type filter (currently only "note" supported) + nextcloud_client: NextcloudClient for access verification + **kwargs: Additional parameters (score_threshold override) + + Returns: + List of SearchResult objects ranked by similarity score + + Raises: + McpError: If vector sync is not enabled or search fails + """ + settings = get_settings() + score_threshold = kwargs.get("score_threshold", self.score_threshold) + + logger.info( + f"Semantic search: query='{query}', user={user_id}, " + f"limit={limit}, score_threshold={score_threshold}, doc_type={doc_type}" + ) + + # Generate embedding for query + embedding_service = get_embedding_service() + query_embedding = await embedding_service.embed(query) + logger.debug( + f"Generated embedding for query (dimension={len(query_embedding)})" + ) + + # Build Qdrant filter + filter_conditions = [ + FieldCondition( + key="user_id", + match=MatchValue(value=user_id), + ) + ] + + # Add doc_type filter if specified + if doc_type: + filter_conditions.append( + FieldCondition( + key="doc_type", + match=MatchValue(value=doc_type), + ) + ) + + # Search Qdrant + qdrant_client = 
await get_qdrant_client() + try: + search_response = await qdrant_client.query_points( + collection_name=settings.get_collection_name(), + query=query_embedding, + query_filter=Filter(must=filter_conditions), + limit=limit * 2, # Get extra for deduplication + score_threshold=score_threshold, + with_payload=True, + with_vectors=False, # Don't return vectors to save bandwidth + ) + record_qdrant_operation("search", "success") + except Exception: + record_qdrant_operation("search", "error") + raise + + logger.info( + f"Qdrant returned {len(search_response.points)} results " + f"(before deduplication and access verification)" + ) + + if search_response.points: + # Log top 3 scores to help with threshold tuning + top_scores = [p.score for p in search_response.points[:3]] + logger.debug(f"Top 3 similarity scores: {top_scores}") + + # Deduplicate by document ID (multiple chunks per document) + results = await self._deduplicate_and_verify( + search_response.points, limit, nextcloud_client + ) + + logger.info( + f"Returning {len(results)} results after deduplication and access verification" + ) + if results: + result_details = [ + f"{r.doc_type}_{r.id} (score={r.score:.3f}, title='{r.title}')" + for r in results[:5] # Show top 5 + ] + logger.debug(f"Top results: {', '.join(result_details)}") + + return results + + async def _deduplicate_and_verify( + self, + points: list[Any], + limit: int, + nextcloud_client: NextcloudClient | None, + ) -> list[SearchResult]: + """Deduplicate results by doc_id and verify access. 
+ + Args: + points: Qdrant search results + limit: Maximum results to return + nextcloud_client: NextcloudClient for access verification (optional) + + Returns: + List of SearchResult objects + """ + seen_doc_ids = set() + results = [] + + for result in points: + doc_id = int(result.payload["doc_id"]) + doc_type = result.payload.get("doc_type", "note") + + # Skip if we've already seen this document + if doc_id in seen_doc_ids: + continue + + seen_doc_ids.add(doc_id) + + # Verify access via Nextcloud API if client provided + # Currently only supports notes + if nextcloud_client and doc_type == "note": + try: + note = await nextcloud_client.notes.get_note(doc_id) + + results.append( + SearchResult( + id=doc_id, + doc_type="note", + title=result.payload["title"], + excerpt=result.payload["excerpt"], + score=result.score, + metadata={ + "category": note.get("category", ""), + "chunk_index": result.payload["chunk_index"], + "total_chunks": result.payload["total_chunks"], + }, + ) + ) + + if len(results) >= limit: + break + + except HTTPStatusError as e: + if e.response.status_code in (403, 404): + # User lost access or document deleted + logger.debug( + f"Skipping note {doc_id}: {e.response.status_code}" + ) + continue + else: + # Log other errors but continue processing + logger.warning( + f"Error verifying access to note {doc_id}: " + f"{e.response.status_code}" + ) + continue + else: + # No access verification, return result directly + results.append( + SearchResult( + id=doc_id, + doc_type=doc_type, + title=result.payload["title"], + excerpt=result.payload["excerpt"], + score=result.score, + metadata={ + "chunk_index": result.payload.get("chunk_index"), + "total_chunks": result.payload.get("total_chunks"), + }, + ) + ) + + if len(results) >= limit: + break + + return results From f3bdb8b8858c84ee2b66018e2eaef58e327e0eb0 Mon Sep 17 00:00:00 2001 From: Chris Coutinho Date: Sat, 15 Nov 2025 00:25:55 +0100 Subject: [PATCH 06/17] feat: Update nc_semantic_search tool with 
algorithm selection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implements ADR-012 by adding multi-algorithm support to the MCP tool. Key changes: - Added algorithm parameter: "semantic"|"keyword"|"fuzzy"|"hybrid" (default: "hybrid") - Added weight parameters for hybrid mode configuration - Replaced direct Qdrant/embedding calls with search module abstractions - Updated docstring to describe all four algorithms - Simplified implementation: ~50 lines vs ~150 lines (67% reduction) - Better error handling for missing vector sync Algorithm selection: - semantic: Pure vector similarity (requires VECTOR_SYNC_ENABLED=true) - keyword: Token-based matching with weighted title/content scoring - fuzzy: Character overlap for typo tolerance - hybrid: RRF fusion with configurable weights (default: 0.5/0.3/0.2) Backward compatibility: - Tool name unchanged (nc_semantic_search) - New parameters have sensible defaults - Existing clients get hybrid search automatically (better than pure semantic) - search_method field in response reflects actual algorithm used Weight validation: - Performed in HybridSearchAlgorithm constructor - Must sum to ≤1.0 and all non-negative - At least one weight must be > 0 - Clear error messages on validation failure Next: Update viz pane to use same algorithms 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- nextcloud_mcp_server/server/semantic.py | 237 ++++++++++-------------- 1 file changed, 97 insertions(+), 140 deletions(-) diff --git a/nextcloud_mcp_server/server/semantic.py b/nextcloud_mcp_server/server/semantic.py index cc9b298..d0a3109 100644 --- a/nextcloud_mcp_server/server/semantic.py +++ b/nextcloud_mcp_server/server/semantic.py @@ -1,8 +1,9 @@ """Semantic search MCP tools using vector database.""" import logging +from typing import Literal -from httpx import HTTPStatusError, RequestError +from httpx import RequestError from mcp.server.fastmcp import
Context, FastMCP from mcp.shared.exceptions import McpError from mcp.types import ( @@ -23,7 +24,12 @@ from nextcloud_mcp_server.models.semantic import ( ) from nextcloud_mcp_server.observability.metrics import ( instrument_tool, - record_qdrant_operation, +) +from nextcloud_mcp_server.search import ( + FuzzySearchAlgorithm, + HybridSearchAlgorithm, + KeywordSearchAlgorithm, + SemanticSearchAlgorithm, ) logger = logging.getLogger(__name__) @@ -36,187 +42,138 @@ def configure_semantic_tools(mcp: FastMCP): @require_scopes("semantic:read") @instrument_tool async def nc_semantic_search( - query: str, ctx: Context, limit: int = 10, score_threshold: float = 0.7 + query: str, + ctx: Context, + limit: int = 10, + score_threshold: float = 0.7, + algorithm: Literal["semantic", "keyword", "fuzzy", "hybrid"] = "hybrid", + semantic_weight: float = 0.5, + keyword_weight: float = 0.3, + fuzzy_weight: float = 0.2, ) -> SemanticSearchResponse: """ - Semantic search across all indexed Nextcloud apps using vector embeddings. + Search Nextcloud content using configurable algorithms. - Searches documents by meaning rather than exact keywords across notes, calendar - events, deck cards, files, and contacts. Requires vector database synchronization - to be enabled (VECTOR_SYNC_ENABLED=true). 
+ Supports multiple search algorithms with client-configurable weighting: + - semantic: Vector similarity search (requires VECTOR_SYNC_ENABLED=true) + - keyword: Token-based matching (title matches weighted 3x) + - fuzzy: Character overlap matching (typo-tolerant) + - hybrid: Combines all algorithms using Reciprocal Rank Fusion (default) Args: query: Natural language search query limit: Maximum number of results to return (default: 10) - score_threshold: Minimum similarity score (0-1, default: 0.7) + score_threshold: Minimum similarity score for semantic/hybrid (0-1, default: 0.7) + algorithm: Search algorithm to use (default: "hybrid") + semantic_weight: Weight for semantic results in hybrid mode (default: 0.5) + keyword_weight: Weight for keyword results in hybrid mode (default: 0.3) + fuzzy_weight: Weight for fuzzy results in hybrid mode (default: 0.2) Returns: - SemanticSearchResponse with matching documents and similarity scores + SemanticSearchResponse with matching documents and relevance scores """ - from qdrant_client.models import FieldCondition, Filter, MatchValue - from nextcloud_mcp_server.config import get_settings - from nextcloud_mcp_server.embedding import get_embedding_service - from nextcloud_mcp_server.vector.qdrant_client import get_qdrant_client settings = get_settings() - - # Check if vector sync is enabled - if not settings.vector_sync_enabled: - raise McpError( - ErrorData( - code=-1, - message="Semantic search is not enabled. 
Set VECTOR_SYNC_ENABLED=true and ensure vector database is configured.", - ) - ) - client = await get_client(ctx) username = client.username logger.info( - f"Semantic search: query='{query}', user={username}, " + f"Search: query='{query}', user={username}, algorithm={algorithm}, " f"limit={limit}, score_threshold={score_threshold}" ) try: - # Generate embedding for query - embedding_service = get_embedding_service() - query_embedding = await embedding_service.embed(query) - logger.debug( - f"Generated embedding for query (dimension={len(query_embedding)})" - ) - - # Search Qdrant with user filtering - # Note: Currently only searching notes (doc_type="note") - # Future: Remove doc_type filter to search all apps - qdrant_client = await get_qdrant_client() - try: - search_response = await qdrant_client.query_points( - collection_name=settings.get_collection_name(), - query=query_embedding, - query_filter=Filter( - must=[ - FieldCondition( - key="user_id", - match=MatchValue(value=username), - ), - FieldCondition( - key="doc_type", - match=MatchValue(value="note"), - ), - ] - ), - limit=limit * 2, # Get extra for filtering - score_threshold=score_threshold, - with_payload=True, - with_vectors=False, # Don't return vectors to save bandwidth - ) - # Record successful search operation - record_qdrant_operation("search", "success") - except Exception: - # Record failed search operation - record_qdrant_operation("search", "error") - raise - - logger.info( - f"Qdrant returned {len(search_response.points)} results " - f"(before deduplication and access verification)" - ) - if search_response.points: - # Log top 3 scores to help with threshold tuning - top_scores = [p.score for p in search_response.points[:3]] - logger.debug(f"Top 3 similarity scores: {top_scores}") - - # Deduplicate by document ID (multiple chunks per document) - seen_doc_ids = set() - results = [] - - for result in search_response.points: - doc_id = int(result.payload["doc_id"]) - doc_type = 
result.payload.get("doc_type", "note") - - # Skip if we've already seen this document - if doc_id in seen_doc_ids: - continue - - seen_doc_ids.add(doc_id) - - # Verify access via Nextcloud API (dual-phase authorization) - # Currently only supports notes, will be extended to other apps - if doc_type == "note": - try: - note = await client.notes.get_note(doc_id) - - results.append( - SemanticSearchResult( - id=doc_id, - doc_type="note", - title=result.payload["title"], - category=note.get("category", ""), - excerpt=result.payload["excerpt"], - score=result.score, - chunk_index=result.payload["chunk_index"], - total_chunks=result.payload["total_chunks"], - ) + # Create appropriate algorithm instance + if algorithm == "semantic": + if not settings.vector_sync_enabled: + raise McpError( + ErrorData( + code=-1, + message="Semantic search requires VECTOR_SYNC_ENABLED=true", ) + ) + search_algo = SemanticSearchAlgorithm(score_threshold=score_threshold) + elif algorithm == "keyword": + search_algo = KeywordSearchAlgorithm() + elif algorithm == "fuzzy": + search_algo = FuzzySearchAlgorithm() + elif algorithm == "hybrid": + if semantic_weight > 0 and not settings.vector_sync_enabled: + raise McpError( + ErrorData( + code=-1, + message="Hybrid search with semantic component requires VECTOR_SYNC_ENABLED=true", + ) + ) + search_algo = HybridSearchAlgorithm( + semantic_weight=semantic_weight, + keyword_weight=keyword_weight, + fuzzy_weight=fuzzy_weight, + ) + else: + raise McpError( + ErrorData(code=-1, message=f"Unknown algorithm: {algorithm}") + ) - if len(results) >= limit: - break - - except HTTPStatusError as e: - if e.response.status_code == 403: - # User lost access, skip this document - logger.debug(f"Skipping note {doc_id}: access denied (403)") - continue - elif e.response.status_code == 404: - # Document was deleted but not yet removed from vector DB - logger.debug( - f"Skipping note {doc_id}: not found (404), " - f"likely deleted after indexing" - ) - continue - else: 
- # Log other errors but continue processing - logger.warning( - f"Error verifying access to note {doc_id}: {e.response.status_code}" - ) - continue - - logger.info( - f"Returning {len(results)} results after deduplication and access verification" + # Execute search (currently limited to notes doc_type) + search_results = await search_algo.search( + query=query, + user_id=username, + limit=limit, + doc_type="note", + nextcloud_client=client, + score_threshold=score_threshold, ) - if results: - result_details = [ - f"note_{r.id} (score={r.score:.3f}, title='{r.title}')" - for r in results[:5] # Show top 5 - ] - logger.debug(f"Top results: {', '.join(result_details)}") + + # Convert SearchResult objects to SemanticSearchResult for response + results = [] + for r in search_results: + results.append( + SemanticSearchResult( + id=r.id, + doc_type=r.doc_type, + title=r.title, + category=r.metadata.get("category", "") if r.metadata else "", + excerpt=r.excerpt, + score=r.score, + chunk_index=r.metadata.get("chunk_index", 0) + if r.metadata + else 0, + total_chunks=r.metadata.get("total_chunks", 1) + if r.metadata + else 1, + ) + ) + + logger.info(f"Returning {len(results)} results from {algorithm} search") return SemanticSearchResponse( results=results, query=query, total_found=len(results), - search_method="semantic", + search_method=algorithm, ) except ValueError as e: - if "No embedding provider configured" in str(e): + error_msg = str(e) + if "No embedding provider configured" in error_msg: raise McpError( ErrorData( code=-1, message="Embedding service not configured. 
Set OLLAMA_BASE_URL environment variable.", ) ) - raise McpError(ErrorData(code=-1, message=f"Configuration error: {str(e)}")) + raise McpError( + ErrorData(code=-1, message=f"Configuration error: {error_msg}") + ) except RequestError as e: raise McpError( ErrorData(code=-1, message=f"Network error during search: {str(e)}") ) except Exception as e: - logger.error(f"Semantic search error: {e}", exc_info=True) - raise McpError( - ErrorData(code=-1, message=f"Semantic search failed: {str(e)}") - ) + logger.error(f"Search error: {e}", exc_info=True) + raise McpError(ErrorData(code=-1, message=f"Search failed: {str(e)}")) @mcp.tool() @require_scopes("semantic:read") From b5b03bfd78edebf0e4a177717b11b34226edce53 Mon Sep 17 00:00:00 2001 From: Chris Coutinho Date: Sat, 15 Nov 2025 01:19:29 +0100 Subject: [PATCH 07/17] feat: Add multi-document Protocol with cross-app search support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implements NextcloudClientProtocol for multi-document type search following user requirement that document types are not 1:1 with apps (e.g., Notes app specializes in markdown, while Files/WebDAV handles multiple file types). 
Key Changes: - NextcloudClientProtocol: Generic protocol with app-specific client properties - get_indexed_doc_types(): Query Qdrant for actually-indexed document types - Document dispatch: All algorithms check Qdrant before attempting access - Cross-type deduplication: Use (doc_id, doc_type) tuples in hybrid RRF Search Algorithm Updates: - Semantic: Added _verify_document_access() with dispatch to appropriate client - Deduplication by (doc_id, doc_type) tuple - Only "note" verification implemented, others return None with info log - Keyword: Added _fetch_documents() dispatch method - Queries Qdrant for available types before fetching - Supports cross-app search when doc_type=None - Fuzzy: Same pattern as keyword search - Hybrid: Already uses (doc_id, doc_type) for deduplication (no changes needed) Future-Proof Design: - File/calendar verification stubs in place - Clear logging when unsupported types found - Easy to extend when processor indexes new document types Currently Supported: - "note" documents fully implemented and tested - Other types gracefully handled (logged but skipped) 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- nextcloud_mcp_server/search/__init__.py | 9 +- nextcloud_mcp_server/search/algorithms.py | 115 +++++++++++++++++- nextcloud_mcp_server/search/fuzzy.py | 91 ++++++++++---- nextcloud_mcp_server/search/hybrid.py | 9 +- nextcloud_mcp_server/search/keyword.py | 98 ++++++++++----- nextcloud_mcp_server/search/semantic.py | 138 ++++++++++++++-------- 6 files changed, 360 insertions(+), 100 deletions(-) diff --git a/nextcloud_mcp_server/search/__init__.py b/nextcloud_mcp_server/search/__init__.py index 1da5a84..d6ec32a 100644 --- a/nextcloud_mcp_server/search/__init__.py +++ b/nextcloud_mcp_server/search/__init__.py @@ -10,15 +10,22 @@ All algorithms share the same interface and can be used interchangeably by both MCP tools and the visualization pane.
""" -from nextcloud_mcp_server.search.algorithms import SearchAlgorithm, SearchResult +from nextcloud_mcp_server.search.algorithms import ( + NextcloudClientProtocol, + SearchAlgorithm, + SearchResult, + get_indexed_doc_types, +) from nextcloud_mcp_server.search.fuzzy import FuzzySearchAlgorithm from nextcloud_mcp_server.search.hybrid import HybridSearchAlgorithm from nextcloud_mcp_server.search.keyword import KeywordSearchAlgorithm from nextcloud_mcp_server.search.semantic import SemanticSearchAlgorithm __all__ = [ + "NextcloudClientProtocol", "SearchAlgorithm", "SearchResult", + "get_indexed_doc_types", "SemanticSearchAlgorithm", "KeywordSearchAlgorithm", "FuzzySearchAlgorithm", diff --git a/nextcloud_mcp_server/search/algorithms.py b/nextcloud_mcp_server/search/algorithms.py index 560e113..2a7536c 100644 --- a/nextcloud_mcp_server/search/algorithms.py +++ b/nextcloud_mcp_server/search/algorithms.py @@ -2,7 +2,120 @@ from abc import ABC, abstractmethod from dataclasses import dataclass -from typing import Any +from typing import Any, Protocol, runtime_checkable + + +@runtime_checkable +class NextcloudClientProtocol(Protocol): + """Protocol for Nextcloud client supporting multi-document search. + + This protocol defines the interface that search algorithms need from a + Nextcloud client to access documents across different apps (Notes, Files, + Calendar, etc.). The client provides access to app-specific sub-clients + that handle the actual API calls. + + Document types (e.g., "note", "file", "calendar") are NOT 1:1 with apps. + For example, the Notes app specializes in markdown files, while Files/WebDAV + handles multiple file types. The abstraction is at the document type level. + + Search algorithms query Qdrant to determine which document types are actually + indexed before attempting to access them, enabling graceful cross-app search. 
+ """ + + username: str + + # App-specific clients that search algorithms dispatch to + @property + def notes(self) -> Any: + """Notes client for accessing note documents.""" + ... + + @property + def webdav(self) -> Any: + """WebDAV client for accessing file documents.""" + ... + + @property + def calendar(self) -> Any: + """Calendar client for accessing event/task documents.""" + ... + + @property + def contacts(self) -> Any: + """Contacts client for accessing contact card documents.""" + ... + + @property + def deck(self) -> Any: + """Deck client for accessing deck card documents.""" + ... + + @property + def cookbook(self) -> Any: + """Cookbook client for accessing recipe documents.""" + ... + + @property + def tables(self) -> Any: + """Tables client for accessing table row documents.""" + ... + + +async def get_indexed_doc_types(user_id: str) -> set[str]: + """Query Qdrant to get actually-indexed document types for a user. + + This enables search algorithms to check which document types are available + before attempting to search/verify them, allowing graceful cross-app search. + + Args: + user_id: User ID to filter by + + Returns: + Set of document type strings (e.g., {"note", "file", "calendar"}) + + Example: + >>> types = await get_indexed_doc_types("alice") + >>> if "note" in types: + ... 
# Search notes + """ + import logging + + from qdrant_client.models import FieldCondition, Filter, MatchValue + + from nextcloud_mcp_server.config import get_settings + from nextcloud_mcp_server.vector.qdrant_client import get_qdrant_client + + logger = logging.getLogger(__name__) + settings = get_settings() + + qdrant_client = await get_qdrant_client() + collection = settings.qdrant_collection + + # Use scroll to sample documents and extract doc_types + # Note: This could be optimized with a facet/aggregation query if Qdrant adds support + try: + scroll_results, _next_offset = await qdrant_client.scroll( + collection_name=collection, + scroll_filter=Filter( + must=[FieldCondition(key="user_id", match=MatchValue(value=user_id))] + ), + limit=1000, # Sample size to discover types + with_payload=["doc_type"], + with_vectors=False, # Don't need vectors for type discovery + ) + + doc_types = { + point.payload.get("doc_type") + for point in scroll_results + if point.payload.get("doc_type") + } + + logger.debug(f"Found indexed document types for user {user_id}: {doc_types}") + return doc_types + + except Exception as e: + logger.warning(f"Failed to query Qdrant for doc_types: {e}") + return set() @dataclass diff --git a/nextcloud_mcp_server/search/fuzzy.py b/nextcloud_mcp_server/search/fuzzy.py index 479459f..acd57d1 100644 --- a/nextcloud_mcp_server/search/fuzzy.py +++ b/nextcloud_mcp_server/search/fuzzy.py @@ -3,8 +3,12 @@ import logging from typing import Any -from nextcloud_mcp_server.client import NextcloudClient -from nextcloud_mcp_server.search.algorithms import SearchAlgorithm, SearchResult +from nextcloud_mcp_server.search.algorithms import ( + NextcloudClientProtocol, + SearchAlgorithm, + SearchResult, + get_indexed_doc_types, +) logger = logging.getLogger(__name__) @@ -38,7 +42,7 @@ class FuzzySearchAlgorithm(SearchAlgorithm): user_id: str, limit: int = 10, doc_type: str | None = None, - nextcloud_client: NextcloudClient | None = None, + nextcloud_client: 
NextcloudClientProtocol | None = None, **kwargs: Any, ) -> list[SearchResult]: """Execute fuzzy search using character overlap. @@ -67,22 +71,39 @@ class FuzzySearchAlgorithm(SearchAlgorithm): f"limit={limit}, threshold={threshold}, doc_type={doc_type}" ) - # Currently only supports notes - if doc_type and doc_type != "note": - logger.warning(f"Fuzzy search not yet implemented for doc_type={doc_type}") - return [] + # Get available document types from Qdrant + indexed_types = await get_indexed_doc_types(user_id) + logger.debug(f"Indexed document types for user: {indexed_types}") - # Fetch all notes for the user - notes = await nextcloud_client.notes.get_notes() - logger.debug(f"Fetched {len(notes)} notes for fuzzy search") + # Determine which types to search + if doc_type: + # Search specific type if requested + search_types = [doc_type] if doc_type in indexed_types else [] + if not search_types: + logger.info(f"Doc type '{doc_type}' not indexed for user {user_id}") + return [] + else: + # Search all indexed types + search_types = list(indexed_types) - # Score and filter notes - scored_notes = [] + # Fetch documents for each type and score them + all_documents = [] + for dtype in search_types: + documents = await self._fetch_documents(nextcloud_client, dtype) + for doc in documents: + doc["_doc_type"] = dtype # Tag with type + all_documents.extend(documents) + + logger.debug(f"Fetched {len(all_documents)} total documents for fuzzy search") + + # Score and filter documents + scored_results = [] query_lower = query.lower() - for note in notes: - title = note.get("title", "") - content = note.get("content", "") + for doc in all_documents: + dtype = doc.get("_doc_type", "note") + title = doc.get("title", "") + content = doc.get("content", "") # Check title match title_score = self._calculate_char_overlap(query_lower, title.lower()) @@ -100,16 +121,16 @@ class FuzzySearchAlgorithm(SearchAlgorithm): else: excerpt = self._extract_excerpt(content, max_length=200) - 
scored_notes.append( + scored_results.append( SearchResult( - id=note["id"], - doc_type="note", + id=doc["id"], + doc_type=dtype, title=title or "Untitled", excerpt=excerpt, score=best_score, metadata={ - "category": note.get("category", ""), - "modified": note.get("modified"), + "category": doc.get("category", ""), + "modified": doc.get("modified"), "match_location": "title" if title_score >= content_score else "content", @@ -118,8 +139,8 @@ class FuzzySearchAlgorithm(SearchAlgorithm): ) # Sort by score (descending) and limit - scored_notes.sort(key=lambda x: x.score, reverse=True) - results = scored_notes[:limit] + scored_results.sort(key=lambda x: x.score, reverse=True) + results = scored_results[:limit] logger.info(f"Fuzzy search returned {len(results)} matching notes") if results: @@ -131,6 +152,32 @@ class FuzzySearchAlgorithm(SearchAlgorithm): return results + async def _fetch_documents( + self, nextcloud_client: NextcloudClientProtocol, doc_type: str + ) -> list[dict[str, Any]]: + """Fetch documents of a specific type from Nextcloud. + + Args: + nextcloud_client: Client for API access + doc_type: Document type to fetch ("note", "file", "calendar", etc.) + + Returns: + List of document dictionaries with at minimum: id, title, content + """ + if doc_type == "note": + return await nextcloud_client.notes.get_notes() + elif doc_type == "file": + # Future: fetch files when indexed + logger.info("File documents not yet supported for fuzzy search") + return [] + elif doc_type == "calendar": + # Future: fetch calendar events when indexed + logger.info("Calendar documents not yet supported for fuzzy search") + return [] + else: + logger.warning(f"Unknown document type '{doc_type}' for fuzzy search") + return [] + def _calculate_char_overlap(self, query: str, text: str) -> float: """Calculate character overlap ratio between query and text. 
diff --git a/nextcloud_mcp_server/search/hybrid.py b/nextcloud_mcp_server/search/hybrid.py index a8778c8..947f1f6 100644 --- a/nextcloud_mcp_server/search/hybrid.py +++ b/nextcloud_mcp_server/search/hybrid.py @@ -5,8 +5,11 @@ import logging from collections import defaultdict from typing import Any -from nextcloud_mcp_server.client import NextcloudClient -from nextcloud_mcp_server.search.algorithms import SearchAlgorithm, SearchResult +from nextcloud_mcp_server.search.algorithms import ( + NextcloudClientProtocol, + SearchAlgorithm, + SearchResult, +) from nextcloud_mcp_server.search.fuzzy import FuzzySearchAlgorithm from nextcloud_mcp_server.search.keyword import KeywordSearchAlgorithm from nextcloud_mcp_server.search.semantic import SemanticSearchAlgorithm @@ -82,7 +85,7 @@ class HybridSearchAlgorithm(SearchAlgorithm): user_id: str, limit: int = 10, doc_type: str | None = None, - nextcloud_client: NextcloudClient | None = None, + nextcloud_client: NextcloudClientProtocol | None = None, **kwargs: Any, ) -> list[SearchResult]: """Execute hybrid search using RRF to combine algorithms. 
diff --git a/nextcloud_mcp_server/search/keyword.py b/nextcloud_mcp_server/search/keyword.py index 410a7a7..d4e8002 100644 --- a/nextcloud_mcp_server/search/keyword.py +++ b/nextcloud_mcp_server/search/keyword.py @@ -3,8 +3,12 @@ import logging from typing import Any -from nextcloud_mcp_server.client import NextcloudClient -from nextcloud_mcp_server.search.algorithms import SearchAlgorithm, SearchResult +from nextcloud_mcp_server.search.algorithms import ( + NextcloudClientProtocol, + SearchAlgorithm, + SearchResult, + get_indexed_doc_types, +) logger = logging.getLogger(__name__) @@ -32,7 +36,7 @@ class KeywordSearchAlgorithm(SearchAlgorithm): user_id: str, limit: int = 10, doc_type: str | None = None, - nextcloud_client: NextcloudClient | None = None, + nextcloud_client: NextcloudClientProtocol | None = None, **kwargs: Any, ) -> list[SearchResult]: """Execute keyword search using token matching. @@ -63,52 +67,66 @@ class KeywordSearchAlgorithm(SearchAlgorithm): query_tokens = self._process_query(query) logger.debug(f"Query tokens: {query_tokens}") - # Currently only supports notes - # TODO: Extend to other document types (files, calendar, etc.) 
- if doc_type and doc_type != "note": - logger.warning( - f"Keyword search not yet implemented for doc_type={doc_type}" - ) - return [] + # Get available document types from Qdrant + indexed_types = await get_indexed_doc_types(user_id) + logger.debug(f"Indexed document types for user: {indexed_types}") - # Fetch all notes for the user - notes = await nextcloud_client.notes.get_notes() - logger.debug(f"Fetched {len(notes)} notes for keyword search") + # Determine which types to search + if doc_type: + # Search specific type if requested + search_types = [doc_type] if doc_type in indexed_types else [] + if not search_types: + logger.info(f"Doc type '{doc_type}' not indexed for user {user_id}") + return [] + else: + # Search all indexed types + search_types = list(indexed_types) - # Score and filter notes - scored_notes = [] - for note in notes: + # Fetch documents for each type and score them + all_documents = [] + for dtype in search_types: + documents = await self._fetch_documents(nextcloud_client, dtype) + for doc in documents: + doc["_doc_type"] = dtype # Tag with type + all_documents.extend(documents) + + logger.debug(f"Fetched {len(all_documents)} total documents for keyword search") + + # Score and filter documents + scored_results = [] + for doc in all_documents: + dtype = doc.get("_doc_type", "note") score = self._calculate_score( query_tokens, - note.get("title", ""), - note.get("content", ""), + doc.get("title", ""), + doc.get("content", ""), ) if score > 0: # Only include matches # Extract excerpt with context excerpt = self._extract_excerpt( - note.get("content", ""), + doc.get("content", ""), query_tokens, max_length=200, ) - scored_notes.append( + scored_results.append( SearchResult( - id=note["id"], - doc_type="note", - title=note.get("title", "Untitled"), + id=doc["id"], + doc_type=dtype, + title=doc.get("title", "Untitled"), excerpt=excerpt, score=score, metadata={ - "category": note.get("category", ""), - "modified": note.get("modified"), + 
"category": doc.get("category", ""), + "modified": doc.get("modified"), }, ) ) # Sort by score (descending) and limit - scored_notes.sort(key=lambda x: x.score, reverse=True) - results = scored_notes[:limit] + scored_results.sort(key=lambda x: x.score, reverse=True) + results = scored_results[:limit] logger.info(f"Keyword search returned {len(results)} matching notes") if results: @@ -120,6 +138,32 @@ class KeywordSearchAlgorithm(SearchAlgorithm): return results + async def _fetch_documents( + self, nextcloud_client: NextcloudClientProtocol, doc_type: str + ) -> list[dict[str, Any]]: + """Fetch documents of a specific type from Nextcloud. + + Args: + nextcloud_client: Client for API access + doc_type: Document type to fetch ("note", "file", "calendar", etc.) + + Returns: + List of document dictionaries with at minimum: id, title, content + """ + if doc_type == "note": + return await nextcloud_client.notes.get_notes() + elif doc_type == "file": + # Future: fetch files when indexed + logger.info("File documents not yet supported for keyword search") + return [] + elif doc_type == "calendar": + # Future: fetch calendar events when indexed + logger.info("Calendar documents not yet supported for keyword search") + return [] + else: + logger.warning(f"Unknown document type '{doc_type}' for keyword search") + return [] + def _process_query(self, query: str) -> list[str]: """Tokenize and normalize query. 
diff --git a/nextcloud_mcp_server/search/semantic.py b/nextcloud_mcp_server/search/semantic.py index c6e632d..e38b16d 100644 --- a/nextcloud_mcp_server/search/semantic.py +++ b/nextcloud_mcp_server/search/semantic.py @@ -6,11 +6,14 @@ from typing import Any from httpx import HTTPStatusError from qdrant_client.models import FieldCondition, Filter, MatchValue -from nextcloud_mcp_server.client import NextcloudClient from nextcloud_mcp_server.config import get_settings from nextcloud_mcp_server.embedding import get_embedding_service from nextcloud_mcp_server.observability.metrics import record_qdrant_operation -from nextcloud_mcp_server.search.algorithms import SearchAlgorithm, SearchResult +from nextcloud_mcp_server.search.algorithms import ( + NextcloudClientProtocol, + SearchAlgorithm, + SearchResult, +) from nextcloud_mcp_server.vector.qdrant_client import get_qdrant_client logger = logging.getLogger(__name__) @@ -45,7 +48,7 @@ class SemanticSearchAlgorithm(SearchAlgorithm): user_id: str, limit: int = 10, doc_type: str | None = None, - nextcloud_client: NextcloudClient | None = None, + nextcloud_client: NextcloudClientProtocol | None = None, **kwargs: Any, ) -> list[SearchResult]: """Execute semantic search using vector similarity. @@ -144,9 +147,13 @@ class SemanticSearchAlgorithm(SearchAlgorithm): self, points: list[Any], limit: int, - nextcloud_client: NextcloudClient | None, + nextcloud_client: NextcloudClientProtocol | None, ) -> list[SearchResult]: - """Deduplicate results by doc_id and verify access. + """Deduplicate results by (doc_id, doc_type) and verify access. + + Supports multiple document types with dispatch to appropriate client methods. + Deduplication is now by (doc_id, doc_type) tuple to handle cases where + the same ID might exist across different document types. 
Args: points: Qdrant search results @@ -156,58 +163,32 @@ class SemanticSearchAlgorithm(SearchAlgorithm): Returns: List of SearchResult objects """ - seen_doc_ids = set() + seen_docs = set() # Track (doc_id, doc_type) tuples results = [] for result in points: doc_id = int(result.payload["doc_id"]) doc_type = result.payload.get("doc_type", "note") + doc_key = (doc_id, doc_type) # Skip if we've already seen this document - if doc_id in seen_doc_ids: + if doc_key in seen_docs: continue - seen_doc_ids.add(doc_id) + seen_docs.add(doc_key) # Verify access via Nextcloud API if client provided - # Currently only supports notes - if nextcloud_client and doc_type == "note": - try: - note = await nextcloud_client.notes.get_note(doc_id) + # Dispatch to appropriate client based on doc_type + verified_result = None - results.append( - SearchResult( - id=doc_id, - doc_type="note", - title=result.payload["title"], - excerpt=result.payload["excerpt"], - score=result.score, - metadata={ - "category": note.get("category", ""), - "chunk_index": result.payload["chunk_index"], - "total_chunks": result.payload["total_chunks"], - }, - ) - ) + if nextcloud_client: + verified_result = await self._verify_document_access( + nextcloud_client, doc_id, doc_type, result + ) - if len(results) >= limit: - break - - except HTTPStatusError as e: - if e.response.status_code in (403, 404): - # User lost access or document deleted - logger.debug( - f"Skipping note {doc_id}: {e.response.status_code}" - ) - continue - else: - # Log other errors but continue processing - logger.warning( - f"Error verifying access to note {doc_id}: " - f"{e.response.status_code}" - ) - continue - else: + if verified_result: + results.append(verified_result) + elif not nextcloud_client: # No access verification, return result directly results.append( SearchResult( @@ -223,7 +204,72 @@ class SemanticSearchAlgorithm(SearchAlgorithm): ) ) - if len(results) >= limit: - break + if len(results) >= limit: + break return results + + 
async def _verify_document_access( + self, + nextcloud_client: NextcloudClientProtocol, + doc_id: int, + doc_type: str, + qdrant_result: Any, + ) -> SearchResult | None: + """Verify user has access to a document via Nextcloud API. + + Dispatches to appropriate client method based on document type. + + Args: + nextcloud_client: Client for API access + doc_id: Document ID + doc_type: Document type ("note", "file", "calendar", etc.) + qdrant_result: Original Qdrant search result + + Returns: + SearchResult if access verified, None if access denied or error + """ + try: + if doc_type == "note": + note = await nextcloud_client.notes.get_note(doc_id) + return SearchResult( + id=doc_id, + doc_type="note", + title=qdrant_result.payload["title"], + excerpt=qdrant_result.payload["excerpt"], + score=qdrant_result.score, + metadata={ + "category": note.get("category", ""), + "chunk_index": qdrant_result.payload["chunk_index"], + "total_chunks": qdrant_result.payload["total_chunks"], + }, + ) + elif doc_type == "file": + # Future: verify file access when files are indexed + logger.info( + f"File {doc_id} found in search but file verification not yet implemented" + ) + return None + elif doc_type == "calendar": + # Future: verify calendar access when calendar events are indexed + logger.info( + f"Calendar event {doc_id} found in search but calendar verification not yet implemented" + ) + return None + else: + logger.warning( + f"Unknown document type '{doc_type}' for doc_id {doc_id}" + ) + return None + + except HTTPStatusError as e: + if e.response.status_code in (403, 404): + # User lost access or document deleted + logger.debug(f"Skipping {doc_type} {doc_id}: {e.response.status_code}") + return None + else: + # Log other errors but continue processing + logger.warning( + f"Error verifying access to {doc_type} {doc_id}: {e.response.status_code}" + ) + return None From 682923dcc86de2757f270f8707d039560934ad78 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 15 
Nov 2025 00:46:11 +0000 Subject: [PATCH 08/17] =?UTF-8?q?bump:=20version=200.34.2=20=E2=86=92=200.?= =?UTF-8?q?35.0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CHANGELOG.md | 6 ++++++ charts/nextcloud-mcp-server/Chart.yaml | 4 ++-- pyproject.toml | 2 +- uv.lock | 2 +- 4 files changed, 10 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ff81b42..da62e6d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,9 @@ +## v0.35.0 (2025-11-15) + +### Feat + +- Enable SSE transport for mcp service and update test fixtures + ## v0.34.2 (2025-11-13) ### Fix diff --git a/charts/nextcloud-mcp-server/Chart.yaml b/charts/nextcloud-mcp-server/Chart.yaml index 52c04c4..8d3ba3d 100644 --- a/charts/nextcloud-mcp-server/Chart.yaml +++ b/charts/nextcloud-mcp-server/Chart.yaml @@ -2,8 +2,8 @@ apiVersion: v2 name: nextcloud-mcp-server description: A Helm chart for Nextcloud MCP Server - enables AI assistants to interact with Nextcloud type: application -version: 0.34.2 -appVersion: "0.34.2" +version: 0.35.0 +appVersion: "0.35.0" keywords: - nextcloud - mcp diff --git a/pyproject.toml b/pyproject.toml index 3b37701..e7ce4cb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "nextcloud-mcp-server" -version = "0.34.2" +version = "0.35.0" description = "Model Context Protocol (MCP) server for Nextcloud integration - enables AI assistants to interact with Nextcloud data" authors = [ {name = "Chris Coutinho", email = "chris@coutinho.io"} diff --git a/uv.lock b/uv.lock index 2c15f1b..c0e5e21 100644 --- a/uv.lock +++ b/uv.lock @@ -1053,7 +1053,7 @@ wheels = [ [[package]] name = "nextcloud-mcp-server" -version = "0.34.2" +version = "0.35.0" source = { editable = "." 
} dependencies = [ { name = "aiosqlite" }, From 2a078093ed2dca5119807148e3717993bd5084cf Mon Sep 17 00:00:00 2001 From: Chris Coutinho Date: Sat, 15 Nov 2025 01:56:41 +0100 Subject: [PATCH 09/17] refactor!: Make all search algorithms query Qdrant payload, not Nextcloud BREAKING CHANGE: Search algorithms now require Qdrant to be populated. Vector sync must be enabled and documents indexed for search to work. - Keyword and fuzzy search now query Qdrant scroll API for title/excerpt - Remove inefficient Nextcloud API fetching pattern - Add optional Nextcloud verification for security - Deduplicate by (doc_id, doc_type) tuple, keeping chunk_index=0 - Align with document processor pattern that already stores text in Qdrant --- nextcloud_mcp_server/search/fuzzy.py | 239 ++++++++++++++--------- nextcloud_mcp_server/search/keyword.py | 247 +++++++++++++++--------- nextcloud_mcp_server/server/semantic.py | 52 ++++- 3 files changed, 356 insertions(+), 182 deletions(-) diff --git a/nextcloud_mcp_server/search/fuzzy.py b/nextcloud_mcp_server/search/fuzzy.py index acd57d1..8dcb5db 100644 --- a/nextcloud_mcp_server/search/fuzzy.py +++ b/nextcloud_mcp_server/search/fuzzy.py @@ -1,14 +1,18 @@ -"""Fuzzy search algorithm using character overlap matching.""" +"""Fuzzy search algorithm using character overlap matching on Qdrant payload.""" import logging from typing import Any +from httpx import HTTPStatusError +from qdrant_client.models import FieldCondition, Filter, MatchValue + +from nextcloud_mcp_server.config import get_settings from nextcloud_mcp_server.search.algorithms import ( NextcloudClientProtocol, SearchAlgorithm, SearchResult, - get_indexed_doc_types, ) +from nextcloud_mcp_server.vector.qdrant_client import get_qdrant_client logger = logging.getLogger(__name__) @@ -45,25 +49,24 @@ class FuzzySearchAlgorithm(SearchAlgorithm): nextcloud_client: NextcloudClientProtocol | None = None, **kwargs: Any, ) -> list[SearchResult]: - """Execute fuzzy search using character overlap. 
+ """Execute fuzzy search using character overlap on Qdrant payload. + + Queries Qdrant for all indexed documents, then scores based on character + overlap in title and excerpt fields. Only verifies access with Nextcloud + at the end for security. Args: query: Search query user_id: User ID for filtering limit: Maximum results to return - doc_type: Optional document type filter (currently only "note" supported) - nextcloud_client: NextcloudClient for fetching documents + doc_type: Optional document type filter (None = all types) + nextcloud_client: NextcloudClient for access verification (optional) **kwargs: Additional parameters (threshold override) Returns: List of SearchResult objects ranked by character overlap score - - Raises: - ValueError: If nextcloud_client not provided """ - if not nextcloud_client: - raise ValueError("FuzzySearch requires nextcloud_client parameter") - + settings = get_settings() threshold = kwargs.get("threshold", self.threshold) logger.info( @@ -71,112 +74,176 @@ class FuzzySearchAlgorithm(SearchAlgorithm): f"limit={limit}, threshold={threshold}, doc_type={doc_type}" ) - # Get available document types from Qdrant - indexed_types = await get_indexed_doc_types(user_id) - logger.debug(f"Indexed document types for user: {indexed_types}") - - # Determine which types to search + # Build Qdrant filter + filter_conditions = [ + FieldCondition(key="user_id", match=MatchValue(value=user_id)) + ] if doc_type: - # Search specific type if requested - search_types = [doc_type] if doc_type in indexed_types else [] - if not search_types: - logger.info(f"Doc type '{doc_type}' not indexed for user {user_id}") - return [] - else: - # Search all indexed types - search_types = list(indexed_types) + filter_conditions.append( + FieldCondition(key="doc_type", match=MatchValue(value=doc_type)) + ) - # Fetch documents for each type and score them - all_documents = [] - for dtype in search_types: - documents = await self._fetch_documents(nextcloud_client, dtype) 
- for doc in documents: - doc["_doc_type"] = dtype # Tag with type - all_documents.extend(documents) + # Scroll through Qdrant to get all matching documents + qdrant_client = await get_qdrant_client() + collection = settings.qdrant_collection - logger.debug(f"Fetched {len(all_documents)} total documents for fuzzy search") + all_points = [] + offset = None - # Score and filter documents + # Scroll through all points matching filter + while True: + scroll_result, next_offset = await qdrant_client.scroll( + collection_name=collection, + scroll_filter=Filter(must=filter_conditions), + limit=100, # Batch size + offset=offset, + with_payload=["doc_id", "doc_type", "title", "excerpt", "chunk_index"], + with_vectors=False, # Don't need vectors + ) + + all_points.extend(scroll_result) + + if next_offset is None: + break + offset = next_offset + + logger.debug(f"Retrieved {len(all_points)} points from Qdrant for fuzzy search") + + # Deduplicate by (doc_id, doc_type) - keep first chunk + seen_docs = {} + for point in all_points: + doc_id = int(point.payload["doc_id"]) + dtype = point.payload.get("doc_type", "note") + doc_key = (doc_id, dtype) + + chunk_idx = point.payload.get("chunk_index", 0) + if doc_key not in seen_docs or chunk_idx == 0: + seen_docs[doc_key] = point + + logger.debug(f"Deduplicated to {len(seen_docs)} unique documents") + + # Score each document based on fuzzy matches scored_results = [] query_lower = query.lower() - for doc in all_documents: - dtype = doc.get("_doc_type", "note") - title = doc.get("title", "") - content = doc.get("content", "") + for doc_key, point in seen_docs.items(): + doc_id, dtype = doc_key + title = point.payload.get("title", "") + excerpt = point.payload.get("excerpt", "") # Check title match title_score = self._calculate_char_overlap(query_lower, title.lower()) - # Check content match - content_score = self._calculate_char_overlap(query_lower, content.lower()) + # Check excerpt match + excerpt_score = 
self._calculate_char_overlap(query_lower, excerpt.lower()) # Use best score - best_score = max(title_score, content_score) + best_score = max(title_score, excerpt_score) if best_score >= threshold: - # Extract excerpt based on which matched better - if title_score >= content_score: - excerpt = f"Title match: {title}" - else: - excerpt = self._extract_excerpt(content, max_length=200) - + match_location = "title" if title_score >= excerpt_score else "excerpt" scored_results.append( - SearchResult( - id=doc["id"], - doc_type=dtype, - title=title or "Untitled", - excerpt=excerpt, - score=best_score, - metadata={ - "category": doc.get("category", ""), - "modified": doc.get("modified"), - "match_location": "title" - if title_score >= content_score - else "content", - }, - ) + { + "doc_id": doc_id, + "doc_type": dtype, + "title": title, + "excerpt": excerpt + if excerpt_score >= title_score + else f"Title match: {title}", + "score": best_score, + "match_location": match_location, + } ) # Sort by score (descending) and limit - scored_results.sort(key=lambda x: x.score, reverse=True) - results = scored_results[:limit] + scored_results.sort(key=lambda x: x["score"], reverse=True) + top_results = scored_results[: limit * 2] # Get extra for access verification - logger.info(f"Fuzzy search returned {len(results)} matching notes") - if results: + # Verify access with Nextcloud (optional, for security) + final_results = [] + if nextcloud_client: + for result in top_results: + verified = await self._verify_access( + nextcloud_client, result["doc_id"], result["doc_type"] + ) + if verified: + final_results.append( + SearchResult( + id=result["doc_id"], + doc_type=result["doc_type"], + title=result["title"], + excerpt=result["excerpt"], + score=result["score"], + metadata={ + **verified.get("metadata", {}), + "match_location": result["match_location"], + }, + ) + ) + if len(final_results) >= limit: + break + else: + # No verification, return results directly + for result in 
top_results[:limit]: + final_results.append( + SearchResult( + id=result["doc_id"], + doc_type=result["doc_type"], + title=result["title"], + excerpt=result["excerpt"], + score=result["score"], + metadata={"match_location": result["match_location"]}, + ) + ) + + logger.info(f"Fuzzy search returned {len(final_results)} matching documents") + if final_results: result_details = [ - f"note_{r.id} (score={r.score:.3f}, title='{r.title}')" - for r in results[:5] + f"{r.doc_type}_{r.id} (score={r.score:.3f}, title='{r.title}')" + for r in final_results[:5] ] logger.debug(f"Top fuzzy results: {', '.join(result_details)}") - return results + return final_results - async def _fetch_documents( - self, nextcloud_client: NextcloudClientProtocol, doc_type: str - ) -> list[dict[str, Any]]: - """Fetch documents of a specific type from Nextcloud. + async def _verify_access( + self, nextcloud_client: NextcloudClientProtocol, doc_id: int, doc_type: str + ) -> dict[str, Any] | None: + """Verify user has access to a document via Nextcloud API. Args: nextcloud_client: Client for API access - doc_type: Document type to fetch ("note", "file", "calendar", etc.) 
+ doc_id: Document ID + doc_type: Document type Returns: - List of document dictionaries with at minimum: id, title, content + Dict with metadata if access verified, None otherwise """ - if doc_type == "note": - return await nextcloud_client.notes.get_notes() - elif doc_type == "file": - # Future: fetch files when indexed - logger.info("File documents not yet supported for fuzzy search") - return [] - elif doc_type == "calendar": - # Future: fetch calendar events when indexed - logger.info("Calendar documents not yet supported for fuzzy search") - return [] - else: - logger.warning(f"Unknown document type '{doc_type}' for fuzzy search") - return [] + try: + if doc_type == "note": + note = await nextcloud_client.notes.get_note(doc_id) + return { + "metadata": { + "category": note.get("category", ""), + "modified": note.get("modified"), + } + } + else: + logger.debug( + f"Skipping verification for {doc_type} {doc_id} (not implemented)" + ) + return {"metadata": {}} + except HTTPStatusError as e: + if e.response.status_code in (403, 404): + logger.debug( + f"Access denied for {doc_type} {doc_id}: {e.response.status_code}" + ) + return None + else: + logger.warning( + f"Error verifying {doc_type} {doc_id}: {e.response.status_code}" + ) + return None def _calculate_char_overlap(self, query: str, text: str) -> float: """Calculate character overlap ratio between query and text. 
diff --git a/nextcloud_mcp_server/search/keyword.py b/nextcloud_mcp_server/search/keyword.py index d4e8002..c98503b 100644 --- a/nextcloud_mcp_server/search/keyword.py +++ b/nextcloud_mcp_server/search/keyword.py @@ -1,14 +1,18 @@ -"""Keyword search algorithm using token-based matching (ADR-001).""" +"""Keyword search algorithm using token-based matching on Qdrant payload (ADR-001).""" import logging from typing import Any +from httpx import HTTPStatusError +from qdrant_client.models import FieldCondition, Filter, MatchValue + +from nextcloud_mcp_server.config import get_settings from nextcloud_mcp_server.search.algorithms import ( NextcloudClientProtocol, SearchAlgorithm, SearchResult, - get_indexed_doc_types, ) +from nextcloud_mcp_server.vector.qdrant_client import get_qdrant_client logger = logging.getLogger(__name__) @@ -39,24 +43,24 @@ class KeywordSearchAlgorithm(SearchAlgorithm): nextcloud_client: NextcloudClientProtocol | None = None, **kwargs: Any, ) -> list[SearchResult]: - """Execute keyword search using token matching. + """Execute keyword search using token matching on Qdrant payload. + + Queries Qdrant for all indexed documents, then scores based on token + matches in title and excerpt fields. Only verifies access with Nextcloud + at the end for security. 
Args: query: Search query to tokenize and match user_id: User ID for filtering limit: Maximum results to return - doc_type: Optional document type filter (currently only "note" supported) - nextcloud_client: NextcloudClient for fetching documents + doc_type: Optional document type filter (None = all types) + nextcloud_client: NextcloudClient for access verification (optional) **kwargs: Additional parameters (unused) Returns: List of SearchResult objects ranked by keyword match score - - Raises: - ValueError: If nextcloud_client not provided """ - if not nextcloud_client: - raise ValueError("KeywordSearch requires nextcloud_client parameter") + settings = get_settings() logger.info( f"Keyword search: query='{query}', user={user_id}, " @@ -67,102 +71,173 @@ class KeywordSearchAlgorithm(SearchAlgorithm): query_tokens = self._process_query(query) logger.debug(f"Query tokens: {query_tokens}") - # Get available document types from Qdrant - indexed_types = await get_indexed_doc_types(user_id) - logger.debug(f"Indexed document types for user: {indexed_types}") - - # Determine which types to search + # Build Qdrant filter + filter_conditions = [ + FieldCondition(key="user_id", match=MatchValue(value=user_id)) + ] if doc_type: - # Search specific type if requested - search_types = [doc_type] if doc_type in indexed_types else [] - if not search_types: - logger.info(f"Doc type '{doc_type}' not indexed for user {user_id}") - return [] - else: - # Search all indexed types - search_types = list(indexed_types) - - # Fetch documents for each type and score them - all_documents = [] - for dtype in search_types: - documents = await self._fetch_documents(nextcloud_client, dtype) - for doc in documents: - doc["_doc_type"] = dtype # Tag with type - all_documents.extend(documents) - - logger.debug(f"Fetched {len(all_documents)} total documents for keyword search") - - # Score and filter documents - scored_results = [] - for doc in all_documents: - dtype = doc.get("_doc_type", "note") - 
score = self._calculate_score( - query_tokens, - doc.get("title", ""), - doc.get("content", ""), + filter_conditions.append( + FieldCondition(key="doc_type", match=MatchValue(value=doc_type)) ) - if score > 0: # Only include matches - # Extract excerpt with context - excerpt = self._extract_excerpt( - doc.get("content", ""), - query_tokens, - max_length=200, - ) + # Scroll through Qdrant to get all matching documents + # We need title and excerpt from payload for token matching + qdrant_client = await get_qdrant_client() + collection = settings.qdrant_collection + all_points = [] + offset = None + + # Scroll through all points matching filter + while True: + scroll_result, next_offset = await qdrant_client.scroll( + collection_name=collection, + scroll_filter=Filter(must=filter_conditions), + limit=100, # Batch size + offset=offset, + with_payload=[ + "doc_id", + "doc_type", + "title", + "excerpt", + "chunk_index", + "total_chunks", + ], + with_vectors=False, # Don't need vectors for keyword search + ) + + all_points.extend(scroll_result) + + if next_offset is None: + break + offset = next_offset + + logger.debug( + f"Retrieved {len(all_points)} points from Qdrant for keyword search" + ) + + # Deduplicate by (doc_id, doc_type) - keep best chunk per document + seen_docs = {} + for point in all_points: + doc_id = int(point.payload["doc_id"]) + dtype = point.payload.get("doc_type", "note") + doc_key = (doc_id, dtype) + + # Keep first chunk (chunk_index=0) as it has the most relevant content + chunk_idx = point.payload.get("chunk_index", 0) + if doc_key not in seen_docs or chunk_idx == 0: + seen_docs[doc_key] = point + + logger.debug(f"Deduplicated to {len(seen_docs)} unique documents") + + # Score each document based on keyword matches + scored_results = [] + for doc_key, point in seen_docs.items(): + doc_id, dtype = doc_key + title = point.payload.get("title", "") + excerpt = point.payload.get("excerpt", "") + + # Calculate keyword match score + score = 
self._calculate_score(query_tokens, title, excerpt) + + if score > 0: # Only include matches scored_results.append( - SearchResult( - id=doc["id"], - doc_type=dtype, - title=doc.get("title", "Untitled"), - excerpt=excerpt, - score=score, - metadata={ - "category": doc.get("category", ""), - "modified": doc.get("modified"), - }, - ) + { + "doc_id": doc_id, + "doc_type": dtype, + "title": title, + "excerpt": excerpt, + "score": score, + } ) # Sort by score (descending) and limit - scored_results.sort(key=lambda x: x.score, reverse=True) - results = scored_results[:limit] + scored_results.sort(key=lambda x: x["score"], reverse=True) + top_results = scored_results[: limit * 2] # Get extra for access verification - logger.info(f"Keyword search returned {len(results)} matching notes") - if results: + # Verify access with Nextcloud (optional, for security) + final_results = [] + if nextcloud_client: + for result in top_results: + verified = await self._verify_access( + nextcloud_client, result["doc_id"], result["doc_type"] + ) + if verified: + final_results.append( + SearchResult( + id=result["doc_id"], + doc_type=result["doc_type"], + title=result["title"], + excerpt=result["excerpt"], + score=result["score"], + metadata=verified.get("metadata", {}), + ) + ) + if len(final_results) >= limit: + break + else: + # No verification, return results directly + for result in top_results[:limit]: + final_results.append( + SearchResult( + id=result["doc_id"], + doc_type=result["doc_type"], + title=result["title"], + excerpt=result["excerpt"], + score=result["score"], + metadata={}, + ) + ) + + logger.info(f"Keyword search returned {len(final_results)} matching documents") + if final_results: result_details = [ - f"note_{r.id} (score={r.score:.3f}, title='{r.title}')" - for r in results[:5] + f"{r.doc_type}_{r.id} (score={r.score:.3f}, title='{r.title}')" + for r in final_results[:5] ] logger.debug(f"Top keyword results: {', '.join(result_details)}") - return results + return 
final_results - async def _fetch_documents( - self, nextcloud_client: NextcloudClientProtocol, doc_type: str - ) -> list[dict[str, Any]]: - """Fetch documents of a specific type from Nextcloud. + async def _verify_access( + self, nextcloud_client: NextcloudClientProtocol, doc_id: int, doc_type: str + ) -> dict[str, Any] | None: + """Verify user has access to a document via Nextcloud API. Args: nextcloud_client: Client for API access - doc_type: Document type to fetch ("note", "file", "calendar", etc.) + doc_id: Document ID + doc_type: Document type Returns: - List of document dictionaries with at minimum: id, title, content + Dict with metadata if access verified, None otherwise """ - if doc_type == "note": - return await nextcloud_client.notes.get_notes() - elif doc_type == "file": - # Future: fetch files when indexed - logger.info("File documents not yet supported for keyword search") - return [] - elif doc_type == "calendar": - # Future: fetch calendar events when indexed - logger.info("Calendar documents not yet supported for keyword search") - return [] - else: - logger.warning(f"Unknown document type '{doc_type}' for keyword search") - return [] + try: + if doc_type == "note": + note = await nextcloud_client.notes.get_note(doc_id) + return { + "metadata": { + "category": note.get("category", ""), + "modified": note.get("modified"), + } + } + # Future: Add verification for other document types + else: + logger.debug( + f"Skipping verification for {doc_type} {doc_id} (not implemented)" + ) + return {"metadata": {}} + except HTTPStatusError as e: + if e.response.status_code in (403, 404): + logger.debug( + f"Access denied for {doc_type} {doc_id}: {e.response.status_code}" + ) + return None + else: + logger.warning( + f"Error verifying {doc_type} {doc_id}: {e.response.status_code}" + ) + return None def _process_query(self, query: str) -> list[str]: """Tokenize and normalize query. 
diff --git a/nextcloud_mcp_server/server/semantic.py b/nextcloud_mcp_server/server/semantic.py index d0a3109..d000b11 100644 --- a/nextcloud_mcp_server/server/semantic.py +++ b/nextcloud_mcp_server/server/semantic.py @@ -45,6 +45,7 @@ def configure_semantic_tools(mcp: FastMCP): query: str, ctx: Context, limit: int = 10, + doc_types: list[str] | None = None, score_threshold: float = 0.7, algorithm: Literal["semantic", "keyword", "fuzzy", "hybrid"] = "hybrid", semantic_weight: float = 0.5, @@ -52,7 +53,7 @@ def configure_semantic_tools(mcp: FastMCP): fuzzy_weight: float = 0.2, ) -> SemanticSearchResponse: """ - Search Nextcloud content using configurable algorithms. + Search Nextcloud content using configurable algorithms with cross-app support. Supports multiple search algorithms with client-configurable weighting: - semantic: Vector similarity search (requires VECTOR_SYNC_ENABLED=true) @@ -60,9 +61,13 @@ def configure_semantic_tools(mcp: FastMCP): - fuzzy: Character overlap matching (typo-tolerant) - hybrid: Combines all algorithms using Reciprocal Rank Fusion (default) + Document types are queried from the vector database to determine what's + actually indexed. Currently only "note" documents are fully supported. + Args: query: Natural language search query limit: Maximum number of results to return (default: 10) + doc_types: Document types to search (e.g., ["note", "file"]). 
None = search all indexed types (default) score_threshold: Minimum similarity score for semantic/hybrid (0-1, default: 0.7) algorithm: Search algorithm to use (default: "hybrid") semantic_weight: Weight for semantic results in hybrid mode (default: 0.5) @@ -116,15 +121,42 @@ def configure_semantic_tools(mcp: FastMCP): ErrorData(code=-1, message=f"Unknown algorithm: {algorithm}") ) - # Execute search (currently limited to notes doc_type) - search_results = await search_algo.search( - query=query, - user_id=username, - limit=limit, - doc_type="note", - nextcloud_client=client, - score_threshold=score_threshold, - ) + # Execute search across requested document types + # If doc_types is None, search all indexed types (cross-app search) + # If doc_types is a list, search only those types + all_results = [] + + if doc_types is None: + # Cross-app search: search all indexed types + # Pass None to search algorithm to let it query Qdrant for available types + search_results = await search_algo.search( + query=query, + user_id=username, + limit=limit, + doc_type=None, # Signal to search all types + nextcloud_client=client, + score_threshold=score_threshold, + ) + all_results.extend(search_results) + else: + # Search specific document types + # For each requested type, execute search and combine results + for dtype in doc_types: + search_results = await search_algo.search( + query=query, + user_id=username, + limit=limit * 2, # Get extra for combining + doc_type=dtype, + nextcloud_client=client, + score_threshold=score_threshold, + ) + all_results.extend(search_results) + + # Sort combined results by score and limit + all_results.sort(key=lambda r: r.score, reverse=True) + all_results = all_results[:limit] + + search_results = all_results # Convert SearchResult objects to SemanticSearchResult for response results = [] From 9a62c8478f8a01d202a2b5ae859eff7f987031e4 Mon Sep 17 00:00:00 2001 From: Chris Coutinho Date: Sat, 15 Nov 2025 02:02:57 +0100 Subject: [PATCH 10/17] feat: 
Implement custom PCA to remove sklearn dependency - Add custom PCA implementation using numpy eigendecomposition - Replace sklearn.decomposition.PCA with custom implementation - Maintains same API (fit, transform, fit_transform) - Supports explained_variance_ratio_ for variance analysis - Removes scikit-learn dependency from project - Add type hints and assertion for type safety --- nextcloud_mcp_server/auth/viz_routes.py | 581 ++++++++++++++++++++++++ nextcloud_mcp_server/vector/pca.py | 140 ++++++ 2 files changed, 721 insertions(+) create mode 100644 nextcloud_mcp_server/auth/viz_routes.py create mode 100644 nextcloud_mcp_server/vector/pca.py diff --git a/nextcloud_mcp_server/auth/viz_routes.py b/nextcloud_mcp_server/auth/viz_routes.py new file mode 100644 index 0000000..da2f0ab --- /dev/null +++ b/nextcloud_mcp_server/auth/viz_routes.py @@ -0,0 +1,581 @@ +"""Vector visualization routes for testing search algorithms. + +Provides a web UI for users to test different search algorithms on their own +indexed documents and visualize results in 2D space using PCA. 
+ +All processing happens server-side following ADR-012: +- Search execution via shared search/algorithms.py +- PCA dimensionality reduction (768-dim → 2D) +- Only 2D coordinates + metadata sent to client +- Bandwidth-efficient (2 floats per doc vs 768) +""" + +import logging + +import numpy as np +from starlette.authentication import requires +from starlette.requests import Request +from starlette.responses import HTMLResponse, JSONResponse + +from nextcloud_mcp_server.config import get_settings +from nextcloud_mcp_server.search import ( + FuzzySearchAlgorithm, + HybridSearchAlgorithm, + KeywordSearchAlgorithm, + SemanticSearchAlgorithm, +) +from nextcloud_mcp_server.vector.pca import PCA +from nextcloud_mcp_server.vector.qdrant_client import get_qdrant_client + +logger = logging.getLogger(__name__) + + +@requires("authenticated", redirect="oauth_login") +async def vector_visualization_html(request: Request) -> HTMLResponse: + """Vector visualization page with search controls and interactive plot. + + Provides UI for testing search algorithms with real-time visualization. + Requires vector sync to be enabled. + + Args: + request: Starlette request object + + Returns: + HTML page with search interface + """ + settings = get_settings() + + if not settings.vector_sync_enabled: + return HTMLResponse( + """ +
+

Vector Visualization

+
+ Vector sync is not enabled. Set VECTOR_SYNC_ENABLED=true to use this feature. +
+
+ """ + ) + + # Get user info from session + user_info = request.session.get("user_info", {}) + username = user_info.get("preferred_username", "unknown") + + html_content = f""" + + + + + + Vector Visualization - Nextcloud MCP + + + + + + +
+
+

Vector Visualization

+
+ Testing search algorithms on your indexed documents. User: {username} +
+ +
+
+
+
+ + +
+ +
+ + +
+ +
+ +
+ + + +
+
+ + + +
+
+ + + +
+
+
+ +
+
+ + +
+ +
+ + +
+ +
+ +
+
+
+
+
+ +
+
+ Executing search and computing PCA projection... +
+
+
+ +
+

Search Results ()

+ +
+
+ + + + + """ + + return HTMLResponse(content=html_content) + + +@requires("authenticated", redirect="oauth_login") +async def vector_visualization_search(request: Request) -> JSONResponse: + """Execute server-side search and return 2D coordinates + results. + + All processing happens server-side: + 1. Execute search via shared algorithm module + 2. Fetch matching vectors from Qdrant + 3. Apply PCA reduction (768-dim → 2D) + 4. Return coordinates + metadata only + + Args: + request: Starlette request with query parameters + + Returns: + JSON response with coordinates_2d and results + """ + settings = get_settings() + + if not settings.vector_sync_enabled: + return JSONResponse( + {"success": False, "error": "Vector sync not enabled"}, + status_code=400, + ) + + # Get user info + user_info = request.session.get("user_info", {}) + username = user_info.get("preferred_username") + + if not username: + return JSONResponse( + {"success": False, "error": "User not authenticated"}, + status_code=401, + ) + + # Parse query parameters + query = request.query_params.get("query", "") + algorithm = request.query_params.get("algorithm", "hybrid") + limit = int(request.query_params.get("limit", "50")) + score_threshold = float(request.query_params.get("score_threshold", "0.7")) + semantic_weight = float(request.query_params.get("semantic_weight", "0.5")) + keyword_weight = float(request.query_params.get("keyword_weight", "0.3")) + fuzzy_weight = float(request.query_params.get("fuzzy_weight", "0.2")) + + logger.info( + f"Viz search: user={username}, query='{query}', " + f"algorithm={algorithm}, limit={limit}" + ) + + try: + # Get authenticated HTTP client from session + # In BasicAuth mode: uses username/password from session + # In OAuth mode: uses access token from session + from nextcloud_mcp_server.auth.userinfo_routes import ( + _get_authenticated_client_for_userinfo, + ) + from nextcloud_mcp_server.client.notes import NotesClient + + async with await 
_get_authenticated_client_for_userinfo(request) as http_client: + # Create NotesClient directly with authenticated HTTP client + notes_client = NotesClient(http_client, username) + + # Wrap in a minimal client object for search algorithms + # This conforms to NextcloudClientProtocol but only implements notes + class MinimalNextcloudClient: + def __init__(self, notes_client, username): + self._notes = notes_client + self.username = username + + @property + def notes(self): + return self._notes + + @property + def webdav(self): + return None + + @property + def calendar(self): + return None + + @property + def contacts(self): + return None + + @property + def deck(self): + return None + + @property + def cookbook(self): + return None + + @property + def tables(self): + return None + + nextcloud_client = MinimalNextcloudClient(notes_client, username) + + # Create search algorithm + if algorithm == "semantic": + search_algo = SemanticSearchAlgorithm(score_threshold=score_threshold) + elif algorithm == "keyword": + search_algo = KeywordSearchAlgorithm() + elif algorithm == "fuzzy": + search_algo = FuzzySearchAlgorithm() + elif algorithm == "hybrid": + search_algo = HybridSearchAlgorithm( + semantic_weight=semantic_weight, + keyword_weight=keyword_weight, + fuzzy_weight=fuzzy_weight, + ) + else: + return JSONResponse( + {"success": False, "error": f"Unknown algorithm: {algorithm}"}, + status_code=400, + ) + + # Execute search + search_results = await search_algo.search( + query=query, + user_id=username, + limit=limit, + doc_type="note", + nextcloud_client=nextcloud_client, + score_threshold=score_threshold, + ) + + if not search_results: + return JSONResponse( + { + "success": True, + "results": [], + "coordinates_2d": [], + "message": "No results found", + } + ) + + # Fetch vectors for matching results from Qdrant + qdrant_client = await get_qdrant_client() + doc_ids = [r.id for r in search_results] + + # Retrieve vectors for the matching documents + from 
qdrant_client.models import FieldCondition, Filter, MatchAny + + points_response = await qdrant_client.scroll( + collection_name=settings.get_collection_name(), + scroll_filter=Filter( + must=[ + FieldCondition( + key="doc_id", + match=MatchAny(any=[str(doc_id) for doc_id in doc_ids]), + ), + FieldCondition( + key="user_id", + match={"value": username}, + ), + ] + ), + limit=len(doc_ids) * 2, # Account for multiple chunks per doc + with_vectors=True, + with_payload=False, + ) + + points = points_response[0] + + if not points: + return JSONResponse( + { + "success": True, + "results": [], + "coordinates_2d": [], + "message": "No vectors found for results", + } + ) + + # Extract vectors + vectors = np.array([p.vector for p in points if p.vector is not None]) + + if len(vectors) < 2: + # Not enough points for PCA + return JSONResponse( + { + "success": True, + "results": [ + { + "id": r.id, + "doc_type": r.doc_type, + "title": r.title, + "excerpt": r.excerpt, + "score": r.score, + } + for r in search_results + ], + "coordinates_2d": [[0, 0]] * len(search_results), + "message": "Not enough vectors for PCA", + } + ) + + # Apply PCA dimensionality reduction (768-dim → 2D) + pca = PCA(n_components=2) + coords_2d = pca.fit_transform(vectors) + + # After fit, these attributes are guaranteed to be set + assert pca.explained_variance_ratio_ is not None + + logger.info( + f"PCA explained variance: PC1={pca.explained_variance_ratio_[0]:.3f}, " + f"PC2={pca.explained_variance_ratio_[1]:.3f}" + ) + + # Map results to coordinates (use first chunk per document) + result_coords = [] + seen_doc_ids = set() + + for point, coord in zip(points, coords_2d): + if point.payload: + doc_id = int(point.payload.get("doc_id", 0)) + if doc_id not in seen_doc_ids and doc_id in doc_ids: + seen_doc_ids.add(doc_id) + result_coords.append(coord.tolist()) + + # Build response + response_results = [ + { + "id": r.id, + "doc_type": r.doc_type, + "title": r.title, + "excerpt": r.excerpt, + "score": 
r.score, + } + for r in search_results + ] + + return JSONResponse( + { + "success": True, + "results": response_results, + "coordinates_2d": result_coords[: len(search_results)], + "pca_variance": { + "pc1": float(pca.explained_variance_ratio_[0]), + "pc2": float(pca.explained_variance_ratio_[1]), + }, + } + ) + + except Exception as e: + logger.error(f"Viz search error: {e}", exc_info=True) + return JSONResponse( + {"success": False, "error": str(e)}, + status_code=500, + ) diff --git a/nextcloud_mcp_server/vector/pca.py b/nextcloud_mcp_server/vector/pca.py new file mode 100644 index 0000000..7f6b402 --- /dev/null +++ b/nextcloud_mcp_server/vector/pca.py @@ -0,0 +1,140 @@ +"""Custom PCA implementation for dimensionality reduction. + +Implements Principal Component Analysis without scikit-learn dependency. +Used for reducing high-dimensional embeddings (768-dim) to 2D for visualization. +""" + +import logging + +import numpy as np + +logger = logging.getLogger(__name__) + + +class PCA: + """Principal Component Analysis for dimensionality reduction. + + Simple implementation that finds principal components via eigendecomposition + of the covariance matrix. Suitable for small-to-medium datasets. + + Attributes: + n_components: Number of principal components to keep + mean_: Mean of training data (set during fit) + components_: Principal components (eigenvectors) + explained_variance_: Variance explained by each component + explained_variance_ratio_: Fraction of total variance explained + """ + + def __init__(self, n_components: int = 2): + """Initialize PCA. 
+ + Args: + n_components: Number of components to keep (default: 2) + """ + if n_components < 1: + raise ValueError(f"n_components must be >= 1, got {n_components}") + + self.n_components = n_components + self.mean_: np.ndarray | None = None + self.components_: np.ndarray | None = None + self.explained_variance_: np.ndarray | None = None + self.explained_variance_ratio_: np.ndarray | None = None + + def fit(self, X: np.ndarray) -> "PCA": + """Fit PCA model to data. + + Args: + X: Training data of shape (n_samples, n_features) + + Returns: + self (for method chaining) + + Raises: + ValueError: If X has fewer features than n_components + """ + X = np.asarray(X) + + if X.ndim != 2: + raise ValueError(f"X must be 2D array, got shape {X.shape}") + + n_samples, n_features = X.shape + + if n_features < self.n_components: + raise ValueError( + f"n_components={self.n_components} > n_features={n_features}" + ) + + # Center data + self.mean_ = np.mean(X, axis=0) + X_centered = X - self.mean_ + + # Compute covariance matrix + # Use (X^T X) / (n-1) for numerical stability with high-dim data + cov = np.cov(X_centered.T) + + # Eigendecomposition + eigenvalues, eigenvectors = np.linalg.eigh(cov) + + # Sort by eigenvalue (descending) + idx = np.argsort(eigenvalues)[::-1] + eigenvalues = eigenvalues[idx] + eigenvectors = eigenvectors[:, idx] + + # Keep top n_components + self.components_ = eigenvectors[:, : self.n_components].T + self.explained_variance_ = eigenvalues[: self.n_components] + + # Calculate explained variance ratio + total_variance = np.sum(eigenvalues) + if total_variance > 0: + self.explained_variance_ratio_ = self.explained_variance_ / total_variance + else: + self.explained_variance_ratio_ = np.zeros(self.n_components) + + logger.debug( + f"PCA fit: {n_samples} samples, {n_features} features β†’ " + f"{self.n_components} components, " + f"explained variance: {self.explained_variance_ratio_}" + ) + + return self + + def transform(self, X: np.ndarray) -> np.ndarray: 
+ """Transform data to principal component space. + + Args: + X: Data to transform of shape (n_samples, n_features) + + Returns: + Transformed data of shape (n_samples, n_components) + + Raises: + ValueError: If PCA not fitted yet + """ + if self.mean_ is None or self.components_ is None: + raise ValueError("PCA not fitted yet. Call fit() first.") + + X = np.asarray(X) + + if X.ndim != 2: + raise ValueError(f"X must be 2D array, got shape {X.shape}") + + # Center using training mean + X_centered = X - self.mean_ + + # Project onto principal components + X_transformed = np.dot(X_centered, self.components_.T) + + return X_transformed + + def fit_transform(self, X: np.ndarray) -> np.ndarray: + """Fit PCA model and transform data in one step. + + Args: + X: Training data of shape (n_samples, n_features) + + Returns: + Transformed data of shape (n_samples, n_components) + """ + self.fit(X) + return self.transform(X) From 916af1c8f309030a9ec1f7f6aa4f2914245e7dfe Mon Sep 17 00:00:00 2001 From: Chris Coutinho Date: Sat, 15 Nov 2025 02:32:10 +0100 Subject: [PATCH 11/17] feat: Add vector visualization pane with multi-select document types MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add /app/vector-viz endpoint for interactive search testing - Implement server-side PCA dimensionality reduction (768-dim → 2D) - Support multi-select document type filter for cross-app search - Support all search algorithms: semantic, keyword, fuzzy, hybrid - Display 2D scatter plot of vector embeddings using Plotly - Show search results with scores and document types - Register viz routes in app.py --- nextcloud_mcp_server/app.py | 13 +++++ nextcloud_mcp_server/auth/viz_routes.py | 75 +++++++++++++++++++++---- 2 files changed, 78 insertions(+), 10 deletions(-) diff --git a/nextcloud_mcp_server/app.py b/nextcloud_mcp_server/app.py index 93f0ea1..501bc16 100644 --- a/nextcloud_mcp_server/app.py +++ b/nextcloud_mcp_server/app.py @@ -1477,6 +1477,10 
@@ def get_app(transport: str = "sse", enabled_apps: list[str] | None = None): user_info_html, vector_sync_status_fragment, ) + from nextcloud_mcp_server.auth.viz_routes import ( + vector_visualization_html, + vector_visualization_search, + ) from nextcloud_mcp_server.auth.webhook_routes import ( disable_webhook_preset, enable_webhook_preset, @@ -1496,6 +1500,15 @@ def get_app(transport: str = "sse", enabled_apps: list[str] | None = None): vector_sync_status_fragment, methods=["GET"], ), # /app/vector-sync/status + # Vector visualization routes + Route( + "/vector-viz", vector_visualization_html, methods=["GET"] + ), # /app/vector-viz + Route( + "/vector-viz/search", + vector_visualization_search, + methods=["GET"], + ), # /app/vector-viz/search # Webhook management routes (admin-only) Route("/webhooks", webhook_management_pane, methods=["GET"]), # /app/webhooks Route( diff --git a/nextcloud_mcp_server/auth/viz_routes.py b/nextcloud_mcp_server/auth/viz_routes.py index da2f0ab..64137cd 100644 --- a/nextcloud_mcp_server/auth/viz_routes.py +++ b/nextcloud_mcp_server/auth/viz_routes.py @@ -183,6 +183,21 @@ async def vector_visualization_html(request: Request) -> HTMLResponse: +
+ + + + Hold Ctrl/Cmd to select multiple. Select "All Types" for cross-app search. + +
+
@@ -249,6 +264,7 @@ async def vector_visualization_html(request: Request) -> HTMLResponse: return {{ query: '', algorithm: 'hybrid', + docTypes: [''], // Default to "All Types" limit: 50, scoreThreshold: 0.7, semanticWeight: 0.5, @@ -276,6 +292,12 @@ async def vector_visualization_html(request: Request) -> HTMLResponse: fuzzy_weight: this.fuzzyWeight, }}); + // Add doc_types parameter (filter out empty string for "All Types") + const selectedTypes = this.docTypes.filter(t => t !== ''); + if (selectedTypes.length > 0) {{ + params.append('doc_types', selectedTypes.join(',')); + }} + const response = await fetch(`/app/vector-viz/search?${{params}}`); const data = await response.json(); @@ -371,9 +393,13 @@ async def vector_visualization_search(request: Request) -> JSONResponse: keyword_weight = float(request.query_params.get("keyword_weight", "0.3")) fuzzy_weight = float(request.query_params.get("fuzzy_weight", "0.2")) + # Parse doc_types (comma-separated list, None = all types) + doc_types_param = request.query_params.get("doc_types", "") + doc_types = doc_types_param.split(",") if doc_types_param else None + logger.info( f"Viz search: user={username}, query='{query}', " - f"algorithm={algorithm}, limit={limit}" + f"algorithm={algorithm}, limit={limit}, doc_types={doc_types}" ) try: @@ -445,15 +471,44 @@ async def vector_visualization_search(request: Request) -> JSONResponse: status_code=400, ) - # Execute search - search_results = await search_algo.search( - query=query, - user_id=username, - limit=limit, - doc_type="note", - nextcloud_client=nextcloud_client, - score_threshold=score_threshold, - ) + # Execute search (supports cross-app when doc_types=None) + if doc_types is None or len(doc_types) == 0: + # Cross-app search - search all indexed types + search_results = await search_algo.search( + query=query, + user_id=username, + limit=limit, + doc_type=None, # Search all types + nextcloud_client=nextcloud_client, + score_threshold=score_threshold, + ) + elif 
len(doc_types) == 1: + # Single document type + search_results = await search_algo.search( + query=query, + user_id=username, + limit=limit, + doc_type=doc_types[0], + nextcloud_client=nextcloud_client, + score_threshold=score_threshold, + ) + else: + # Multiple document types - search each and combine + all_results = [] + for doc_type in doc_types: + results = await search_algo.search( + query=query, + user_id=username, + limit=limit * 2, # Get extra per type + doc_type=doc_type, + nextcloud_client=nextcloud_client, + score_threshold=score_threshold, + ) + all_results.extend(results) + + # Sort by score and limit + all_results.sort(key=lambda r: r.score, reverse=True) + search_results = all_results[:limit] if not search_results: return JSONResponse( From eb32bbbc6bd39450ec2f222f0e19b916b86f02e6 Mon Sep 17 00:00:00 2001 From: Chris Coutinho Date: Sat, 15 Nov 2025 02:38:05 +0100 Subject: [PATCH 12/17] feat: Add Vector Viz tab to app home page - Add Vector Viz button to tab navigation - Embed viz pane in iframe for seamless integration - Only shown when vector sync is enabled --- nextcloud_mcp_server/auth/userinfo_routes.py | 23 ++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/nextcloud_mcp_server/auth/userinfo_routes.py b/nextcloud_mcp_server/auth/userinfo_routes.py index 9b9309e..a37d52b 100644 --- a/nextcloud_mcp_server/auth/userinfo_routes.py +++ b/nextcloud_mcp_server/auth/userinfo_routes.py @@ -858,6 +858,18 @@ async def user_info_html(request: Request) -> HTMLResponse: Webhooks ''' + } + { + "" + if not show_vector_sync_tab + else ''' + + ''' }
@@ -889,6 +901,17 @@ async def user_info_html(request: Request) -> HTMLResponse:
''' } + + { + "" + if not show_vector_sync_tab + else ''' + +
+ +
+ ''' + } { From 2b35dd729f249fadad3d94b7f5f5abcec6b14058 Mon Sep 17 00:00:00 2001 From: Chris Coutinho Date: Sat, 15 Nov 2025 02:41:42 +0100 Subject: [PATCH 13/17] fix: Reorder tabs and fix viz pane session access - Move Webhooks tab to the right (User Info | Vector Sync | Vector Viz | Webhooks) - Use request.user.display_name instead of session for viz routes - Fixes session middleware error when accessing via iframe --- nextcloud_mcp_server/auth/userinfo_routes.py | 44 ++++++++++---------- nextcloud_mcp_server/auth/viz_routes.py | 16 ++++--- 2 files changed, 32 insertions(+), 28 deletions(-) diff --git a/nextcloud_mcp_server/auth/userinfo_routes.py b/nextcloud_mcp_server/auth/userinfo_routes.py index a37d52b..50e17eb 100644 --- a/nextcloud_mcp_server/auth/userinfo_routes.py +++ b/nextcloud_mcp_server/auth/userinfo_routes.py @@ -846,18 +846,6 @@ async def user_info_html(request: Request) -> HTMLResponse: Vector Sync ''' - } - { - "" - if not show_webhooks_tab - else ''' - - ''' } { "" @@ -870,6 +858,18 @@ async def user_info_html(request: Request) -> HTMLResponse: Vector Viz ''' + } + { + "" + if not show_webhooks_tab + else ''' + + ''' } @@ -893,22 +893,22 @@ async def user_info_html(request: Request) -> HTMLResponse: { "" - if not show_webhooks_tab - else f''' - -
- {webhooks_tab_html} + if not show_vector_sync_tab + else ''' + +
+
''' } { "" - if not show_vector_sync_tab - else ''' - -
- + if not show_webhooks_tab + else f''' + +
+ {webhooks_tab_html}
''' } diff --git a/nextcloud_mcp_server/auth/viz_routes.py b/nextcloud_mcp_server/auth/viz_routes.py index 64137cd..e7ae535 100644 --- a/nextcloud_mcp_server/auth/viz_routes.py +++ b/nextcloud_mcp_server/auth/viz_routes.py @@ -57,9 +57,12 @@ async def vector_visualization_html(request: Request) -> HTMLResponse: """ ) - # Get user info from session - user_info = request.session.get("user_info", {}) - username = user_info.get("preferred_username", "unknown") + # Get user info from auth context + username = ( + request.user.display_name + if hasattr(request.user, "display_name") + else "unknown" + ) html_content = f""" @@ -374,9 +377,10 @@ async def vector_visualization_search(request: Request) -> JSONResponse: status_code=400, ) - # Get user info - user_info = request.session.get("user_info", {}) - username = user_info.get("preferred_username") + # Get user info from auth context + username = ( + request.user.display_name if hasattr(request.user, "display_name") else None + ) if not username: return JSONResponse( From e3153822f75ee8e737a492cd9c22fd1ca7911a2a Mon Sep 17 00:00:00 2001 From: Chris Coutinho Date: Sat, 15 Nov 2025 05:19:35 +0100 Subject: [PATCH 14/17] perf: Exclude vector-sync status polling from distributed tracing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Skip tracing for /app/vector-sync/status to reduce noise from HTMX polling. Metrics collection continues for this endpoint. 
🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- docker-compose.yml | 3 +- nextcloud_mcp_server/auth/viz_routes.py | 181 ++++++++++++------ .../observability/middleware.py | 8 +- nextcloud_mcp_server/search/algorithms.py | 2 +- nextcloud_mcp_server/search/fuzzy.py | 2 +- nextcloud_mcp_server/search/keyword.py | 2 +- 6 files changed, 136 insertions(+), 62 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index 38f72db..66a615c 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -34,7 +34,7 @@ services: - ./app-hooks:/docker-entrypoint-hooks.d:ro # Mount OIDC development directory outside /var/www/html to avoid rsync conflicts # The post-installation hook will register /opt/apps as an additional app directory - - ./third_party:/opt/apps:ro + #- ./third_party:/opt/apps:ro environment: - NEXTCLOUD_TRUSTED_DOMAINS=app - NEXTCLOUD_ADMIN_USER=admin @@ -82,6 +82,7 @@ services: - NEXTCLOUD_HOST=http://app:80 - NEXTCLOUD_USERNAME=admin - NEXTCLOUD_PASSWORD=admin + - NEXTCLOUD_PUBLIC_ISSUER_URL=http://localhost:8080 # Vector sync configuration (ADR-007) - VECTOR_SYNC_ENABLED=true diff --git a/nextcloud_mcp_server/auth/viz_routes.py b/nextcloud_mcp_server/auth/viz_routes.py index e7ae535..2292c8e 100644 --- a/nextcloud_mcp_server/auth/viz_routes.py +++ b/nextcloud_mcp_server/auth/viz_routes.py @@ -64,6 +64,13 @@ async def vector_visualization_html(request: Request) -> HTMLResponse: else "unknown" ) + # Get Nextcloud host for generating links to apps + # Use public issuer URL if available (for browser-accessible links), + # otherwise fall back to NEXTCLOUD_HOST + import os + + nextcloud_host = os.getenv("NEXTCLOUD_PUBLIC_ISSUER_URL") or settings.nextcloud_host + html_content = f""" @@ -93,11 +100,15 @@ async def vector_visualization_html(request: Request) -> HTMLResponse: box-shadow: 0 2px 4px rgba(0,0,0,0.1); }} .controls {{ - display: grid; - grid-template-columns: 1fr 1fr; - gap: 20px; margin-bottom: 20px; 
}} + .control-row {{ + display: grid; + grid-template-columns: 2fr 1fr auto; + gap: 12px; + margin-bottom: 12px; + align-items: end; + }} .control-group {{ margin-bottom: 15px; }} @@ -107,7 +118,7 @@ async def vector_visualization_html(request: Request) -> HTMLResponse: font-weight: 500; color: #333; }} - input[type="text"], select {{ + input[type="text"], input[type="number"], select {{ width: 100%; padding: 8px 12px; border: 1px solid #ddd; @@ -117,6 +128,9 @@ async def vector_visualization_html(request: Request) -> HTMLResponse: input[type="range"] {{ width: 100%; }} + select[multiple] {{ + min-height: 100px; + }} .weight-display {{ display: inline-block; min-width: 40px; @@ -136,6 +150,19 @@ async def vector_visualization_html(request: Request) -> HTMLResponse: .btn:hover {{ background: #0052a3; }} + .btn-secondary {{ + background: #6c757d; + color: white; + border: none; + padding: 6px 12px; + border-radius: 4px; + cursor: pointer; + font-size: 13px; + margin-bottom: 12px; + }} + .btn-secondary:hover {{ + background: #5a6268; + }} #plot {{ width: 100%; height: 600px; @@ -145,11 +172,17 @@ async def vector_visualization_html(request: Request) -> HTMLResponse: padding: 40px; color: #666; }} - .weight-controls {{ - display: none; + .advanced-section {{ + margin-top: 16px; + padding: 16px; + background: #f8f9fa; + border-radius: 4px; + border: 1px solid #dee2e6; }} - .weight-controls.active {{ - display: block; + .advanced-grid {{ + display: grid; + grid-template-columns: 1fr 1fr; + gap: 20px; }} .info-box {{ background: #e3f2fd; @@ -170,15 +203,16 @@ async def vector_visualization_html(request: Request) -> HTMLResponse:
-
-
- - -
+ +
+ + +
-
- - @@ -186,56 +220,71 @@ async def vector_visualization_html(request: Request) -> HTMLResponse:
-
- - - - Hold Ctrl/Cmd to select multiple. Select "All Types" for cross-app search. - +
+
-
- +
+ +
+
+ + +
+

Advanced Options

+ +
+
+ + + + Hold Ctrl/Cmd to select multiple + +
+ +
+
+ + +
+ +
+ + +
+
+
+ + +
+ +
- +
- +
- +
- -
-
- - -
- -
- - -
- -
- -
-
@@ -251,7 +300,9 @@ async def vector_visualization_html(request: Request) -> HTMLResponse:

Search Results ()