diff --git a/.dockerignore b/.dockerignore
index 88f7234..8e0ab28 100644
--- a/.dockerignore
+++ b/.dockerignore
@@ -5,3 +5,4 @@
!uv.lock
!nextcloud_mcp_server/**/*.py
+!nextcloud_mcp_server/**/*.html
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 7ef61c1..c920edf 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -85,4 +85,4 @@ jobs:
NEXTCLOUD_USERNAME: "admin"
NEXTCLOUD_PASSWORD: "admin"
run: |
- uv run pytest -v --log-cli-level=WARN -m smoke
+ uv run pytest -v --log-cli-level=WARN -m "unit or smoke"
diff --git a/docs/ADR-014-bm25-search.md b/docs/ADR-014-bm25-search.md
index d12c6c9..045be68 100644
--- a/docs/ADR-014-bm25-search.md
+++ b/docs/ADR-014-bm25-search.md
@@ -147,7 +147,95 @@ This decision consolidates our retrieval logic, eliminates the data consistency
**Benefits Realized:**
- ✅ Consolidated architecture (single Qdrant database for both dense + sparse)
-- ✅ Native RRF fusion (database-level, more efficient)
+- ✅ Native fusion algorithms (database-level, more efficient)
- ✅ Industry-standard BM25 (replaces custom keyword search)
- ✅ Simplified codebase (removed 736 lines of legacy code)
- ✅ Better relevance (handles both semantic and keyword queries)
+- ✅ Configurable fusion methods (RRF and DBSF)
+
+---
+
+### 7. Fusion Algorithm Options
+
+**Update: 2025-11-16**
+
+The BM25 hybrid search now supports two fusion algorithms for combining dense (semantic) and sparse (BM25) search results:
+
+#### Reciprocal Rank Fusion (RRF)
+
+**Default fusion method.** RRF is a widely-used, well-established algorithm that combines rankings from multiple retrieval systems using the reciprocal rank formula:
+
+```
+RRF(doc) = Σ 1/(k + rank_i(doc))
+```
+
+where `k` is a constant (typically 60) and `rank_i(doc)` is the rank of the document in retrieval system `i`.
+
+**Characteristics:**
+- ✅ **General-purpose**: Works well across diverse query types and document collections
+- ✅ **Rank-based**: Focuses on relative rankings rather than absolute scores
+- ✅ **Established**: Well-tested, documented, and understood in IR literature
+- ✅ **Robust**: Less sensitive to score distribution differences between systems
+
+**When to use RRF:**
+- Default choice for most use cases
+- When you have mixed query types (semantic + keyword)
+- When retrieval systems have very different score ranges
+- When you want predictable, well-understood behavior
+
+#### Distribution-Based Score Fusion (DBSF)
+
+**Alternative fusion method.** DBSF normalizes scores from each retrieval system using distribution statistics before combining them:
+
+1. **Normalization**: For each query, calculates mean (μ) and standard deviation (σ) of scores
+2. **Outlier handling**: Uses μ ± 3σ as normalization bounds
+3. **Fusion**: Sums normalized scores across systems
+
+**Characteristics:**
+- ✅ **Score-aware**: Uses actual relevance scores, not just rankings
+- ✅ **Statistical**: Normalizes based on score distribution properties
+- ⚠️ **Experimental**: Newer algorithm, less battle-tested than RRF
+- ⚠️ **Sensitive**: May behave differently depending on score distributions
+
+**When to use DBSF:**
+- When retrieval systems have vastly different score ranges that RRF doesn't balance well
+- When you want to experiment with score-based (vs rank-based) fusion
+- When statistical normalization better matches your use case
+- For A/B testing against RRF to measure retrieval quality improvements
+
+#### Configuration
+
+Both fusion algorithms are exposed via the `fusion` parameter in MCP tools:
+
+```python
+# Use RRF (default)
+response = await nc_semantic_search(
+ query="async programming",
+ fusion="rrf" # Can be omitted, RRF is default
+)
+
+# Use DBSF
+response = await nc_semantic_search(
+ query="async programming",
+ fusion="dbsf"
+)
+```
+
+The `nc_semantic_search_answer` tool also supports the `fusion` parameter and passes it through to the underlying search.
+
+#### Future: Configurable Weights
+
+**Current limitation**: Neither RRF nor DBSF currently supports per-system weights (e.g., 0.8 for semantic, 0.2 for BM25). This is a Qdrant platform limitation tracked in [qdrant/qdrant#6067](https://github.com/qdrant/qdrant/issues/6067).
+
+When Qdrant adds weight support, the `fusion` parameter can be extended to accept weight configurations:
+
+```python
+# Hypothetical future API
+response = await nc_semantic_search(
+ query="async programming",
+ fusion="rrf",
+ fusion_weights={"dense": 0.7, "sparse": 0.3} # Not yet implemented
+)
+```
+
+**Recommendation**: Start with RRF (default). If you encounter cases where keyword matches are under- or over-weighted, experiment with DBSF. Monitor [qdrant/qdrant#6067](https://github.com/qdrant/qdrant/issues/6067) for configurable weight support.
diff --git a/nextcloud_mcp_server/app.py b/nextcloud_mcp_server/app.py
index acdf2c6..acdb23d 100644
--- a/nextcloud_mcp_server/app.py
+++ b/nextcloud_mcp_server/app.py
@@ -1478,6 +1478,7 @@ def get_app(transport: str = "sse", enabled_apps: list[str] | None = None):
vector_sync_status_fragment,
)
from nextcloud_mcp_server.auth.viz_routes import (
+ chunk_context_endpoint,
vector_visualization_html,
vector_visualization_search,
)
@@ -1509,6 +1510,11 @@ def get_app(transport: str = "sse", enabled_apps: list[str] | None = None):
vector_visualization_search,
methods=["GET"],
), # /app/vector-viz/search
+ Route(
+ "/chunk-context",
+ chunk_context_endpoint,
+ methods=["GET"],
+ ), # /app/chunk-context
# Webhook management routes (admin-only)
Route("/webhooks", webhook_management_pane, methods=["GET"]), # /app/webhooks
Route(
diff --git a/nextcloud_mcp_server/auth/templates/vector_viz.html b/nextcloud_mcp_server/auth/templates/vector_viz.html
new file mode 100644
index 0000000..7756b74
--- /dev/null
+++ b/nextcloud_mcp_server/auth/templates/vector_viz.html
@@ -0,0 +1,323 @@
+
+
+
+
+
Vector Visualization
+
+ Testing search algorithms on your indexed documents. User: {{ username }}
+
+
+
+
+
+
+
+
+ Executing search and computing PCA projection...
+
+
+
+
+
+
+
Search Results ( )
+
+
+ Loading results...
+
+
+
+ No results found. Try a different query or adjust your search parameters.
+
+
+
+
+
+
+
+
+
+
+
+ Score: |
+ Type:
+
+
+
+
+
+
+
+
+
+
+
+ Loading chunk...
+
+
+
+
+ ...
+
+
+ ...
+
+
+
+
+
+
+
+
+
+
+
diff --git a/nextcloud_mcp_server/auth/userinfo_routes.py b/nextcloud_mcp_server/auth/userinfo_routes.py
index 3566148..335f995 100644
--- a/nextcloud_mcp_server/auth/userinfo_routes.py
+++ b/nextcloud_mcp_server/auth/userinfo_routes.py
@@ -677,12 +677,15 @@ async def user_info_html(request: Request) -> HTMLResponse:
return {{
query: '',
algorithm: 'bm25_hybrid',
+ fusion: 'rrf', // Default fusion method for BM25 Hybrid
showAdvanced: false,
docTypes: [''], // Default to "All Types"
limit: 50,
scoreThreshold: 0.0,
loading: false,
results: [],
+ expandedChunks: {{}}, // Track which chunks are expanded (result_id -> chunk data)
+ chunkLoading: {{}}, // Track loading state per result
async executeSearch() {{
this.loading = true;
@@ -696,6 +699,11 @@ async def user_info_html(request: Request) -> HTMLResponse:
score_threshold: this.scoreThreshold,
}});
+ // Add fusion parameter for BM25 Hybrid
+ if (this.algorithm === 'bm25_hybrid') {{
+ params.append('fusion', this.fusion);
+ }}
+
// Add doc_types parameter (filter out empty string for "All Types")
const selectedTypes = this.docTypes.filter(t => t !== '');
if (selectedTypes.length > 0) {{
@@ -778,6 +786,51 @@ async def user_info_html(request: Request) -> HTMLResponse:
default:
return `${{baseUrl}}`;
}}
+ }},
+
+ hasChunkPosition(result) {{
+ // Check if result has position metadata
+ return result.chunk_start_offset != null && result.chunk_end_offset != null;
+ }},
+
+ isChunkExpanded(resultKey) {{
+ return this.expandedChunks[resultKey] !== undefined;
+ }},
+
+ async toggleChunk(result) {{
+ const resultKey = `${{result.doc_type}}_${{result.id}}`;
+
+ // If already expanded, collapse
+ if (this.isChunkExpanded(resultKey)) {{
+ delete this.expandedChunks[resultKey];
+ return;
+ }}
+
+ // Otherwise, fetch and expand
+ this.chunkLoading[resultKey] = true;
+
+ try {{
+ const params = new URLSearchParams({{
+ doc_type: result.doc_type,
+ doc_id: result.id,
+ start: result.chunk_start_offset,
+ end: result.chunk_end_offset,
+ context: 500 // 500 chars before/after
+ }});
+
+ const response = await fetch(`/app/chunk-context?${{params}}`);
+ const data = await response.json();
+
+ if (data.success) {{
+ this.expandedChunks[resultKey] = data;
+ }} else {{
+ alert('Failed to load chunk: ' + data.error);
+ }}
+ }} catch (error) {{
+ alert('Error loading chunk: ' + error.message);
+ }} finally {{
+ delete this.chunkLoading[resultKey];
+ }}
}}
}}
}}
diff --git a/nextcloud_mcp_server/auth/viz_routes.py b/nextcloud_mcp_server/auth/viz_routes.py
index a92a7e7..98925d0 100644
--- a/nextcloud_mcp_server/auth/viz_routes.py
+++ b/nextcloud_mcp_server/auth/viz_routes.py
@@ -12,8 +12,10 @@ All processing happens server-side following ADR-012:
import logging
import time
+from pathlib import Path
import numpy as np
+from jinja2 import Environment, FileSystemLoader
from starlette.authentication import requires
from starlette.requests import Request
from starlette.responses import HTMLResponse, JSONResponse
@@ -28,6 +30,10 @@ from nextcloud_mcp_server.vector.qdrant_client import get_qdrant_client
logger = logging.getLogger(__name__)
+# Setup Jinja2 environment for templates; autoescape guards {{ username }} in HTML
+_template_dir = Path(__file__).parent / "templates"
+_jinja_env = Environment(loader=FileSystemLoader(_template_dir), autoescape=True)
+
@requires("authenticated", redirect="oauth_login")
async def vector_visualization_html(request: Request) -> HTMLResponse:
@@ -63,252 +69,9 @@ async def vector_visualization_html(request: Request) -> HTMLResponse:
else "unknown"
)
- html_content = f"""
-
-
-
-
-
Vector Visualization
-
- Testing search algorithms on your indexed documents. User: {username}
-
-
-
-
-
-
-
-
- Executing search and computing PCA projection...
-
-
-
-
-
-
-
Search Results ( )
-
-
- Loading results...
-
-
-
- No results found. Try a different query or adjust your search parameters.
-
-
-
-
-
-
-
-
-
-
-
- Score: |
- Type:
-
-
-
-
-
-
-
- """
-
+ # Load and render template
+ template = _jinja_env.get_template("vector_viz.html")
+ html_content = template.render(username=username)
return HTMLResponse(content=html_content)
@@ -352,6 +115,7 @@ async def vector_visualization_search(request: Request) -> JSONResponse:
algorithm = request.query_params.get("algorithm", "bm25_hybrid")
limit = int(request.query_params.get("limit", "50"))
score_threshold = float(request.query_params.get("score_threshold", "0.0"))
+ fusion = request.query_params.get("fusion", "rrf") # Default to RRF
# Parse doc_types (comma-separated list, None = all types)
doc_types_param = request.query_params.get("doc_types", "")
@@ -359,7 +123,7 @@ async def vector_visualization_search(request: Request) -> JSONResponse:
logger.info(
f"Viz search: user={username}, query='{query}', "
- f"algorithm={algorithm}, limit={limit}, doc_types={doc_types}"
+ f"algorithm={algorithm}, fusion={fusion}, limit={limit}, doc_types={doc_types}"
)
try:
@@ -377,7 +141,9 @@ async def vector_visualization_search(request: Request) -> JSONResponse:
if algorithm == "semantic":
search_algo = SemanticSearchAlgorithm(score_threshold=score_threshold)
elif algorithm == "bm25_hybrid":
- search_algo = BM25HybridSearchAlgorithm(score_threshold=score_threshold)
+ search_algo = BM25HybridSearchAlgorithm(
+ score_threshold=score_threshold, fusion=fusion
+ )
else:
return JSONResponse(
{"success": False, "error": f"Unknown algorithm: {algorithm}"},
@@ -552,6 +318,8 @@ async def vector_visualization_search(request: Request) -> JSONResponse:
"title": r.title,
"excerpt": r.excerpt,
"score": r.score,
+ "chunk_start_offset": r.chunk_start_offset,
+ "chunk_end_offset": r.chunk_end_offset,
}
for r in search_results
]
@@ -594,3 +362,125 @@ async def vector_visualization_search(request: Request) -> JSONResponse:
{"success": False, "error": str(e)},
status_code=500,
)
+
+
+@requires("authenticated", redirect="oauth_login")
+async def chunk_context_endpoint(request: Request) -> JSONResponse:
+ """Fetch chunk text with surrounding context for visualization.
+
+ This endpoint retrieves the matched chunk along with surrounding text
+ to provide context for the search result. Used by the viz pane to
+ display chunks inline.
+
+ Query parameters:
+ doc_type: Document type (e.g., "note")
+ doc_id: Document ID
+ start: Chunk start offset (character position)
+ end: Chunk end offset (character position)
+ context: Characters of context before/after (default: 500)
+
+ Returns:
+ JSON with chunk_text, before_context, after_context, and flags
+ """
+ try:
+ # Get query parameters
+ doc_type = request.query_params.get("doc_type")
+ doc_id = request.query_params.get("doc_id")
+ start_str = request.query_params.get("start")
+ end_str = request.query_params.get("end")
+ context_chars = int(request.query_params.get("context", "500"))
+
+ # Validate required parameters
+ if not all([doc_type, doc_id, start_str, end_str]):
+ return JSONResponse(
+ {
+ "success": False,
+ "error": "Missing required parameters: doc_type, doc_id, start, end",
+ },
+ status_code=400,
+ )
+
+ start = int(start_str)
+ end = int(end_str)
+
+ # Currently only support notes
+ if doc_type != "note":
+ return JSONResponse(
+ {"success": False, "error": f"Unsupported doc_type: {doc_type}"},
+ status_code=400,
+ )
+
+ # Get authenticated HTTP client and fetch note
+ from nextcloud_mcp_server.auth.userinfo_routes import (
+ _get_authenticated_client_for_userinfo,
+ )
+ from nextcloud_mcp_server.client.notes import NotesClient
+
+ # Get username from request auth
+ username = (
+ request.user.display_name
+ if hasattr(request.user, "display_name")
+ else "unknown"
+ )
+
+ # Create notes client with authenticated HTTP client
+ http_client = await _get_authenticated_client_for_userinfo(request)
+ notes_client = NotesClient(http_client, username)
+
+ # Fetch full note content
+ note = await notes_client.get_note(int(doc_id))
+ full_content = f"{note['title']}\n\n{note['content']}"
+
+ # Validate offsets
+ if start < 0 or end > len(full_content) or start >= end:
+ return JSONResponse(
+ {
+ "success": False,
+ "error": f"Invalid offsets: start={start}, end={end}, content_length={len(full_content)}",
+ },
+ status_code=400,
+ )
+
+ # Extract chunk
+ chunk_text = full_content[start:end]
+
+ # Extract context before and after
+ before_start = max(0, start - context_chars)
+ before_context = full_content[before_start:start]
+
+ after_end = min(len(full_content), end + context_chars)
+ after_context = full_content[end:after_end]
+
+ # Determine if there's more content
+ has_more_before = before_start > 0
+ has_more_after = after_end < len(full_content)
+
+ logger.info(
+ f"Fetched chunk context for {doc_type}_{doc_id}: "
+ f"chunk_len={len(chunk_text)}, before_len={len(before_context)}, "
+ f"after_len={len(after_context)}"
+ )
+
+ return JSONResponse(
+ {
+ "success": True,
+ "chunk_text": chunk_text,
+ "before_context": before_context,
+ "after_context": after_context,
+ "has_more_before": has_more_before,
+ "has_more_after": has_more_after,
+ }
+ )
+
+ except ValueError as e:
+ logger.error(f"Invalid parameter format: {e}")
+ return JSONResponse(
+ {"success": False, "error": f"Invalid parameter format: {e}"},
+ status_code=400,
+ )
+ except Exception as e:
+ logger.error(f"Chunk context error: {e}", exc_info=True)
+ return JSONResponse(
+ {"success": False, "error": str(e)},
+ status_code=500,
+ )
diff --git a/nextcloud_mcp_server/models/semantic.py b/nextcloud_mcp_server/models/semantic.py
index b8233f0..2586195 100644
--- a/nextcloud_mcp_server/models/semantic.py
+++ b/nextcloud_mcp_server/models/semantic.py
@@ -19,9 +19,22 @@ class SemanticSearchResult(BaseModel):
default="", description="Document category (notes) or location (calendar)"
)
excerpt: str = Field(description="Excerpt from matching chunk")
- score: float = Field(description="Semantic similarity score (0-1)")
+ score: float = Field(
+ description=(
+ "Relevance score (≥ 0.0, higher is better). "
+ "Score range depends on fusion method: "
+ "RRF produces scores in [0.0, 1.0], "
+ "DBSF can exceed 1.0 (sum of normalized scores from multiple systems)"
+ )
+ )
chunk_index: int = Field(description="Index of matching chunk in document")
total_chunks: int = Field(description="Total number of chunks in document")
+ chunk_start_offset: Optional[int] = Field(
+ default=None, description="Character position where chunk starts in document"
+ )
+ chunk_end_offset: Optional[int] = Field(
+ default=None, description="Character position where chunk ends in document"
+ )
class SemanticSearchResponse(BaseResponse):
diff --git a/nextcloud_mcp_server/search/algorithms.py b/nextcloud_mcp_server/search/algorithms.py
index 49bec40..c859960 100644
--- a/nextcloud_mcp_server/search/algorithms.py
+++ b/nextcloud_mcp_server/search/algorithms.py
@@ -127,8 +127,12 @@ class SearchResult:
doc_type: Document type (note, file, calendar, contact, etc.)
title: Document title
excerpt: Content excerpt showing match context
- score: Relevance score (0.0-1.0, higher is better)
+ score: Relevance score (≥ 0.0, higher is better)
+ - RRF fusion: scores in [0.0, 1.0]
+ - DBSF fusion: scores can exceed 1.0 (sum of normalized scores)
metadata: Additional algorithm-specific metadata
+ chunk_start_offset: Character position where chunk starts (None if not available)
+ chunk_end_offset: Character position where chunk ends (None if not available)
"""
id: int
@@ -137,11 +141,20 @@ class SearchResult:
excerpt: str
score: float
metadata: dict[str, Any] | None = None
+ chunk_start_offset: int | None = None
+ chunk_end_offset: int | None = None
def __post_init__(self):
- """Validate score is in valid range."""
- if not 0.0 <= self.score <= 1.0:
- raise ValueError(f"Score must be between 0.0 and 1.0, got {self.score}")
+ """Validate score is non-negative.
+
+ Note: Different fusion methods produce different score ranges:
+ - RRF (Reciprocal Rank Fusion): Bounded to [0.0, 1.0]
+ - DBSF (Distribution-Based Score Fusion): Unbounded (can exceed 1.0)
+ DBSF sums normalized scores from multiple systems, so scores can be
+ 1.5, 2.0, etc. when multiple systems agree a document is highly relevant.
+ """
+ if self.score < 0.0:
+ raise ValueError(f"Score must be non-negative, got {self.score}")
class SearchAlgorithm(ABC):
diff --git a/nextcloud_mcp_server/search/bm25_hybrid.py b/nextcloud_mcp_server/search/bm25_hybrid.py
index d8d5975..bdd3446 100644
--- a/nextcloud_mcp_server/search/bm25_hybrid.py
+++ b/nextcloud_mcp_server/search/bm25_hybrid.py
@@ -28,15 +28,27 @@ class BM25HybridSearchAlgorithm(SearchAlgorithm):
eliminating the need for application-layer result merging.
"""
- def __init__(self, score_threshold: float = 0.0):
+ def __init__(self, score_threshold: float = 0.0, fusion: str = "rrf"):
"""
Initialize BM25 hybrid search algorithm.
Args:
- score_threshold: Minimum RRF score (0-1, default: 0.0 to allow RRF scoring)
- Note: RRF produces normalized scores, so threshold is typically lower
+ score_threshold: Minimum fusion score (>= 0.0, default: 0.0 to allow fusion scoring)
+ Note: RRF scores fall in [0.0, 1.0]; DBSF sums normalized scores and can exceed 1.0
+ fusion: Fusion algorithm to use: "rrf" (Reciprocal Rank Fusion, default)
+ or "dbsf" (Distribution-Based Score Fusion)
+
+ Raises:
+ ValueError: If fusion is not "rrf" or "dbsf"
"""
+ if fusion not in ("rrf", "dbsf"):
+ raise ValueError(
+ f"Invalid fusion algorithm '{fusion}'. Must be 'rrf' or 'dbsf'"
+ )
+
self.score_threshold = score_threshold
+ self.fusion = models.Fusion.RRF if fusion == "rrf" else models.Fusion.DBSF
+ self.fusion_name = fusion
@property
def name(self) -> str:
@@ -78,7 +90,8 @@ class BM25HybridSearchAlgorithm(SearchAlgorithm):
logger.info(
f"BM25 hybrid search: query='{query}', user={user_id}, "
- f"limit={limit}, score_threshold={score_threshold}, doc_type={doc_type}"
+ f"limit={limit}, score_threshold={score_threshold}, doc_type={doc_type}, "
+ f"fusion={self.fusion_name}"
)
# Generate dense embedding for semantic search
@@ -139,8 +152,8 @@ class BM25HybridSearchAlgorithm(SearchAlgorithm):
filter=query_filter,
),
],
- # RRF fusion query (no additional query needed, just fusion)
- query=models.FusionQuery(fusion=models.Fusion.RRF),
+ # Fusion query (RRF or DBSF based on initialization)
+ query=models.FusionQuery(fusion=self.fusion),
limit=limit * 2, # Get extra for deduplication
score_threshold=score_threshold,
with_payload=True,
@@ -152,14 +165,16 @@ class BM25HybridSearchAlgorithm(SearchAlgorithm):
raise
logger.info(
- f"Qdrant RRF fusion returned {len(search_response.points)} results "
+ f"Qdrant {self.fusion_name.upper()} fusion returned {len(search_response.points)} results "
f"(before deduplication)"
)
if search_response.points:
- # Log top 3 RRF scores to help with threshold tuning
+ # Log top 3 fusion scores to help with threshold tuning
top_scores = [p.score for p in search_response.points[:3]]
- logger.debug(f"Top 3 RRF fusion scores: {top_scores}")
+ logger.debug(
+ f"Top 3 {self.fusion_name.upper()} fusion scores: {top_scores}"
+ )
# Deduplicate by (doc_id, doc_type) - multiple chunks per document
seen_docs = set()
@@ -183,12 +198,14 @@ class BM25HybridSearchAlgorithm(SearchAlgorithm):
doc_type=doc_type,
title=result.payload.get("title", "Untitled"),
excerpt=result.payload.get("excerpt", ""),
- score=result.score, # RRF fusion score
+ score=result.score, # Fusion score (RRF or DBSF)
metadata={
"chunk_index": result.payload.get("chunk_index"),
"total_chunks": result.payload.get("total_chunks"),
- "search_method": "bm25_hybrid_rrf",
+ "search_method": f"bm25_hybrid_{self.fusion_name}",
},
+ chunk_start_offset=result.payload.get("chunk_start_offset"),
+ chunk_end_offset=result.payload.get("chunk_end_offset"),
)
)
diff --git a/nextcloud_mcp_server/search/semantic.py b/nextcloud_mcp_server/search/semantic.py
index 90236ac..89e9921 100644
--- a/nextcloud_mcp_server/search/semantic.py
+++ b/nextcloud_mcp_server/search/semantic.py
@@ -150,6 +150,8 @@ class SemanticSearchAlgorithm(SearchAlgorithm):
"chunk_index": result.payload.get("chunk_index"),
"total_chunks": result.payload.get("total_chunks"),
},
+ chunk_start_offset=result.payload.get("chunk_start_offset"),
+ chunk_end_offset=result.payload.get("chunk_end_offset"),
)
)
diff --git a/nextcloud_mcp_server/server/semantic.py b/nextcloud_mcp_server/server/semantic.py
index 2f8fde6..0ff76da 100644
--- a/nextcloud_mcp_server/server/semantic.py
+++ b/nextcloud_mcp_server/server/semantic.py
@@ -42,6 +42,7 @@ def configure_semantic_tools(mcp: FastMCP):
limit: int = 10,
doc_types: list[str] | None = None,
score_threshold: float = 0.0,
+ fusion: str = "rrf",
) -> SemanticSearchResponse:
"""
Search Nextcloud content using BM25 hybrid search with cross-app support.
@@ -50,7 +51,7 @@ def configure_semantic_tools(mcp: FastMCP):
- Dense semantic vectors: For conceptual similarity and natural language queries
- BM25 sparse vectors: For precise keyword matching, acronyms, and specific terms
- Results are automatically fused using Reciprocal Rank Fusion (RRF) in the
+ Results are automatically fused using the selected fusion algorithm in the
database for optimal relevance. This provides the best of both semantic
understanding and keyword precision.
@@ -61,10 +62,13 @@ def configure_semantic_tools(mcp: FastMCP):
query: Natural language or keyword search query
limit: Maximum number of results to return (default: 10)
doc_types: Document types to search (e.g., ["note", "file"]). None = search all indexed types (default)
- score_threshold: Minimum RRF fusion score (0-1, default: 0.0 for RRF scoring)
+ score_threshold: Minimum fusion score (0-1, default: 0.0)
+ fusion: Fusion algorithm: "rrf" (Reciprocal Rank Fusion, default) or "dbsf" (Distribution-Based Score Fusion)
+ RRF: Good general-purpose fusion using reciprocal ranks
+ DBSF: Uses distribution-based normalization, may better balance different score ranges
Returns:
- SemanticSearchResponse with matching documents ranked by RRF fusion scores
+ SemanticSearchResponse with matching documents ranked by fusion scores
"""
from nextcloud_mcp_server.config import get_settings
@@ -74,7 +78,7 @@ def configure_semantic_tools(mcp: FastMCP):
logger.info(
f"BM25 hybrid search: query='{query}', user={username}, "
- f"limit={limit}, score_threshold={score_threshold}"
+ f"limit={limit}, score_threshold={score_threshold}, fusion={fusion}"
)
# Check that vector sync is enabled
@@ -87,8 +91,10 @@ def configure_semantic_tools(mcp: FastMCP):
)
try:
- # Create BM25 hybrid search algorithm
- search_algo = BM25HybridSearchAlgorithm(score_threshold=score_threshold)
+ # Create BM25 hybrid search algorithm with specified fusion
+ search_algo = BM25HybridSearchAlgorithm(
+ score_threshold=score_threshold, fusion=fusion
+ )
# Execute search across requested document types
# If doc_types is None, search all indexed types (cross-app search)
@@ -152,6 +158,8 @@ def configure_semantic_tools(mcp: FastMCP):
total_chunks=r.metadata.get("total_chunks", 1)
if r.metadata
else 1,
+ chunk_start_offset=r.chunk_start_offset,
+ chunk_end_offset=r.chunk_end_offset,
)
)
@@ -161,7 +169,7 @@ def configure_semantic_tools(mcp: FastMCP):
results=results,
query=query,
total_found=len(results),
- search_method="bm25_hybrid",
+ search_method=f"bm25_hybrid_{fusion}",
)
except ValueError as e:
@@ -193,6 +201,7 @@ def configure_semantic_tools(mcp: FastMCP):
limit: int = 5,
score_threshold: float = 0.7,
max_answer_tokens: int = 500,
+ fusion: str = "rrf",
) -> SamplingSearchResponse:
"""
Semantic search with LLM-generated answer using MCP sampling.
@@ -217,6 +226,7 @@ def configure_semantic_tools(mcp: FastMCP):
limit: Maximum number of documents to retrieve (default: 5)
score_threshold: Minimum similarity score 0-1 (default: 0.7)
max_answer_tokens: Maximum tokens for generated answer (default: 500)
+ fusion: Fusion algorithm: "rrf" (Reciprocal Rank Fusion, default) or "dbsf" (Distribution-Based Score Fusion)
Returns:
SamplingSearchResponse containing:
@@ -256,6 +266,7 @@ def configure_semantic_tools(mcp: FastMCP):
ctx=ctx,
limit=limit,
score_threshold=score_threshold,
+ fusion=fusion,
)
# 2. Handle no results case - don't waste a sampling call
diff --git a/nextcloud_mcp_server/vector/document_chunker.py b/nextcloud_mcp_server/vector/document_chunker.py
index 5855154..7c18987 100644
--- a/nextcloud_mcp_server/vector/document_chunker.py
+++ b/nextcloud_mcp_server/vector/document_chunker.py
@@ -1,10 +1,21 @@
"""Document chunking for large texts."""
import logging
+import re
+from dataclasses import dataclass
logger = logging.getLogger(__name__)
+@dataclass
+class ChunkWithPosition:
+ """A text chunk with its character position in the original document."""
+
+ text: str
+ start_offset: int # Character position where chunk starts
+ end_offset: int # Character position where chunk ends (exclusive)
+
+
class DocumentChunker:
"""Chunk large documents for optimal embedding."""
@@ -19,33 +30,66 @@ class DocumentChunker:
self.chunk_size = chunk_size
self.overlap = overlap
- def chunk_text(self, content: str) -> list[str]:
+ def chunk_text(self, content: str) -> list[ChunkWithPosition]:
"""
- Split text into overlapping chunks.
+ Split text into overlapping chunks with position tracking.
Uses simple word-based chunking with configurable overlap to preserve
- context across chunk boundaries.
+ context across chunk boundaries. Tracks character positions for each chunk.
Args:
content: Text content to chunk
Returns:
- List of text chunks (may be single item if content is small)
+ List of chunks with their character positions in the original content
"""
- # Simple word-based chunking
- words = content.split()
+ # Use regex to find all words and their positions
+ # This preserves the original spacing and allows accurate position tracking
+ word_pattern = re.compile(r"\S+")
+ word_matches = list(word_pattern.finditer(content))
- if len(words) <= self.chunk_size:
- return [content]
+ if len(word_matches) <= self.chunk_size:
+ # Single chunk - use entire content
+ return [
+ ChunkWithPosition(text=content, start_offset=0, end_offset=len(content))
+ ]
chunks = []
- start = 0
+ start_idx = 0
- while start < len(words):
- end = start + self.chunk_size
- chunk_words = words[start:end]
- chunks.append(" ".join(chunk_words))
- start = end - self.overlap
+ while start_idx < len(word_matches):
+ end_idx = min(start_idx + self.chunk_size, len(word_matches))
- logger.debug(f"Chunked document into {len(chunks)} chunks ({len(words)} words)")
+ # Get the first and last word positions
+ first_word = word_matches[start_idx]
+ last_word = word_matches[end_idx - 1]
+
+ # Extract chunk using character positions
+ start_offset = first_word.start()
+ end_offset = last_word.end()
+ chunk_text = content[start_offset:end_offset]
+
+ chunks.append(
+ ChunkWithPosition(
+ text=chunk_text, start_offset=start_offset, end_offset=end_offset
+ )
+ )
+
+ # If we've reached the end, break
+ if end_idx >= len(word_matches):
+ break
+
+ # Move to next chunk with overlap
+ next_start_idx = end_idx - self.overlap
+
+ # Safety check: ensure we're making forward progress
+ # If overlap >= chunk_size, continue without overlap rather than dropping the remaining text
+ if next_start_idx <= start_idx:
+ next_start_idx = end_idx
+
+ start_idx = next_start_idx
+
+ logger.debug(
+ f"Chunked document into {len(chunks)} chunks ({len(word_matches)} words)"
+ )
return chunks
diff --git a/nextcloud_mcp_server/vector/processor.py b/nextcloud_mcp_server/vector/processor.py
index 12481f2..ba32135 100644
--- a/nextcloud_mcp_server/vector/processor.py
+++ b/nextcloud_mcp_server/vector/processor.py
@@ -233,13 +233,16 @@ async def _index_document(
)
chunks = chunker.chunk_text(content)
+ # Extract chunk texts for embedding
+ chunk_texts = [chunk.text for chunk in chunks]
+
# Generate dense embeddings (I/O bound - external API call)
embedding_service = get_embedding_service()
- dense_embeddings = await embedding_service.embed_batch(chunks)
+ dense_embeddings = await embedding_service.embed_batch(chunk_texts)
# Generate sparse embeddings (BM25 for keyword matching)
bm25_service = get_bm25_service()
- sparse_embeddings = bm25_service.encode_batch(chunks)
+ sparse_embeddings = bm25_service.encode_batch(chunk_texts)
# Prepare Qdrant points
indexed_at = int(time.time())
@@ -265,12 +268,15 @@ async def _index_document(
"doc_id": doc_task.doc_id,
"doc_type": doc_task.doc_type,
"title": title,
- "excerpt": chunk[:200],
+ "excerpt": chunk.text[:200],
"indexed_at": indexed_at,
"modified_at": doc_task.modified_at,
"etag": etag,
"chunk_index": i,
"total_chunks": len(chunks),
+ "chunk_start_offset": chunk.start_offset,
+ "chunk_end_offset": chunk.end_offset,
+ "metadata_version": 2, # v2 includes position metadata
},
)
)
diff --git a/pyproject.toml b/pyproject.toml
index 4760719..f7783b4 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -12,7 +12,7 @@ keywords = ["nextcloud", "mcp", "model-context-protocol", "llm", "ai", "claude",
dependencies = [
"mcp[cli] (>=1.21,<1.22)",
"httpx (>=0.28.1,<0.29.0)",
- "pillow (>=10.3.0,<12.0.0)", # Compatible with fastembed
+ "pillow (>=10.3.0,<12.0.0)", # Compatible with fastembed
"icalendar (>=6.0.0,<7.0.0)",
"pythonvcard4>=0.2.0",
"pydantic>=2.11.4",
@@ -22,7 +22,9 @@ dependencies = [
"aiosqlite>=0.20.0", # Async SQLite for refresh token storage
"authlib>=1.6.5",
"qdrant-client>=1.7.0",
- "fastembed>=0.4.2", # BM25 sparse vector embeddings for hybrid search
+ "fastembed>=0.4.2", # BM25 sparse vector embeddings for hybrid search
+ "anthropic>=0.42.0", # For RAG evaluation with Anthropic LLMs
+ "boto3>=1.35.0", # For Amazon Bedrock provider (optional)
# Observability dependencies
"prometheus-client>=0.21.0", # Prometheus metrics
"opentelemetry-api>=1.28.2", # OpenTelemetry API
@@ -32,6 +34,7 @@ dependencies = [
"opentelemetry-instrumentation-logging>=0.49b2", # Logging integration
"opentelemetry-exporter-otlp-proto-grpc>=1.28.2", # OTLP gRPC exporter
"python-json-logger>=3.2.0", # Structured JSON logging
+ "jinja2>=3.1.6",
]
classifiers = [
"Development Status :: 4 - Beta",
@@ -103,8 +106,6 @@ module-root = ""
[dependency-groups]
dev = [
- "anthropic>=0.42.0", # For RAG evaluation with Anthropic LLMs
- "boto3>=1.35.0", # For Amazon Bedrock provider (optional)
"commitizen>=4.8.2",
"datasets>=3.3.0", # For BeIR nfcorpus dataset loading
"ipython>=9.2.0",
diff --git a/tests/unit/search/__init__.py b/tests/unit/search/__init__.py
new file mode 100644
index 0000000..3ac51e6
--- /dev/null
+++ b/tests/unit/search/__init__.py
@@ -0,0 +1 @@
+"""Unit tests for search algorithms."""
diff --git a/tests/unit/search/test_bm25_hybrid.py b/tests/unit/search/test_bm25_hybrid.py
new file mode 100644
index 0000000..a80b57b
--- /dev/null
+++ b/tests/unit/search/test_bm25_hybrid.py
@@ -0,0 +1,54 @@
+"""Unit tests for BM25 hybrid search algorithm."""
+
+import pytest
+from qdrant_client import models
+
+from nextcloud_mcp_server.search.bm25_hybrid import BM25HybridSearchAlgorithm
+
+
+@pytest.mark.unit
+def test_bm25_hybrid_initialization_default():
+    """With no arguments the algorithm uses RRF fusion and a 0.0 threshold."""
+    default_algorithm = BM25HybridSearchAlgorithm()
+
+    assert default_algorithm.name == "bm25_hybrid"
+    assert default_algorithm.fusion_name == "rrf"
+    assert default_algorithm.fusion == models.Fusion.RRF
+    assert default_algorithm.score_threshold == 0.0
+
+
+@pytest.mark.unit
+def test_bm25_hybrid_initialization_with_rrf():
+    """Explicit fusion="rrf" selects RRF and keeps the given threshold."""
+    rrf_algorithm = BM25HybridSearchAlgorithm(fusion="rrf", score_threshold=0.5)
+
+    assert rrf_algorithm.fusion_name == "rrf"
+    assert rrf_algorithm.fusion == models.Fusion.RRF
+    assert rrf_algorithm.score_threshold == 0.5
+
+
+@pytest.mark.unit
+def test_bm25_hybrid_initialization_with_dbsf():
+    """Explicit fusion="dbsf" selects DBSF and keeps the given threshold."""
+    dbsf_algorithm = BM25HybridSearchAlgorithm(fusion="dbsf", score_threshold=0.7)
+
+    assert dbsf_algorithm.fusion_name == "dbsf"
+    assert dbsf_algorithm.fusion == models.Fusion.DBSF
+    assert dbsf_algorithm.score_threshold == 0.7
+
+
+@pytest.mark.unit
+def test_bm25_hybrid_invalid_fusion_raises_error():
+    """An unknown fusion name is rejected with a descriptive ValueError."""
+    with pytest.raises(ValueError) as raised:
+        BM25HybridSearchAlgorithm(fusion="invalid")
+
+    for expected in ("Invalid fusion algorithm 'invalid'", "Must be 'rrf' or 'dbsf'"):
+        assert expected in str(raised.value)
+
+
+@pytest.mark.unit
+def test_bm25_hybrid_requires_vector_db():
+    """The hybrid algorithm advertises its dependency on the vector database."""
+    requires_vector_db = BM25HybridSearchAlgorithm().requires_vector_db
+    assert requires_vector_db is True
diff --git a/tests/unit/search/test_search_result.py b/tests/unit/search/test_search_result.py
new file mode 100644
index 0000000..c9dbf0b
--- /dev/null
+++ b/tests/unit/search/test_search_result.py
@@ -0,0 +1,135 @@
+"""Unit tests for SearchResult validation."""
+
+import pytest
+
+from nextcloud_mcp_server.search.algorithms import SearchResult
+
+
+@pytest.mark.unit
+def test_search_result_rrf_score_in_range():
+    """A mid-range RRF score inside [0.0, 1.0] is stored unchanged."""
+    fields = {
+        "id": 1,
+        "doc_type": "note",
+        "title": "Test Note",
+        "excerpt": "Test excerpt",
+        "score": 0.85,
+    }
+
+    assert SearchResult(**fields).score == 0.85
+
+
+@pytest.mark.unit
+def test_search_result_rrf_score_at_lower_bound():
+    """The lower bound of the RRF range, 0.0, is an accepted score."""
+    zero_scored = SearchResult(
+        score=0.0,
+        excerpt="Test excerpt",
+        title="Test Note",
+        doc_type="note",
+        id=1,
+    )
+
+    assert zero_scored.score == 0.0
+
+
+@pytest.mark.unit
+def test_search_result_rrf_score_at_upper_bound():
+    """The upper bound of the RRF range, 1.0, is an accepted score."""
+    upper_bound = 1.0
+    top_scored = SearchResult(
+        id=1,
+        doc_type="note",
+        title="Test Note",
+        excerpt="Test excerpt",
+        score=upper_bound,
+    )
+    assert top_scored.score == upper_bound
+
+
+@pytest.mark.unit
+def test_search_result_dbsf_score_above_one():
+    """A DBSF score greater than 1.0 is accepted and stored unchanged.
+
+    Distribution-Based Score Fusion adds up the normalized scores of the
+    dense (semantic) and sparse (BM25) retrievers, so the fused score can
+    exceed 1.0 when both retrievers rank a document highly.
+    """
+    # Representative fused score when both retrievers agree
+    fused_score = 1.55
+    hit = SearchResult(
+        id=1,
+        doc_type="note",
+        title="Highly Relevant Note",
+        excerpt="Contains keywords and is semantically similar",
+        score=fused_score,
+    )
+    assert hit.score == fused_score
+
+
+@pytest.mark.unit
+def test_search_result_dbsf_score_edge_case():
+    """The largest two-retriever DBSF score, 2.0, is accepted.
+
+    With two retrievers each contributing at most 1.0, the sum peaks at 2.0.
+    """
+    fields = {
+        "id": 1,
+        "doc_type": "note",
+        "title": "Perfect Match",
+        "excerpt": "Perfect semantic and keyword match",
+        "score": 2.0,
+    }
+
+    assert SearchResult(**fields).score == 2.0
+
+
+@pytest.mark.unit
+def test_search_result_negative_score_raises_error():
+    """A negative score is rejected with a ValueError naming the bad value."""
+    with pytest.raises(ValueError) as raised:
+        SearchResult(
+            id=1,
+            doc_type="note",
+            title="Test Note",
+            excerpt="Test excerpt",
+            score=-0.1,
+        )
+    message = str(raised.value)
+    assert "Score must be non-negative" in message
+    assert "got -0.1" in message
+
+
+@pytest.mark.unit
+def test_search_result_with_metadata():
+    """The optional metadata mapping is kept and readable key by key."""
+    hit = SearchResult(
+        id=1,
+        doc_type="note",
+        title="Test Note",
+        excerpt="Test excerpt",
+        score=1.25,
+        metadata={"fusion_method": "dbsf", "dense_score": 0.8, "sparse_score": 0.45},
+    )
+
+    assert hit.metadata["sparse_score"] == 0.45
+    assert hit.metadata["dense_score"] == 0.8
+    assert hit.metadata["fusion_method"] == "dbsf"
+    assert hit.score == 1.25
+
+
+@pytest.mark.unit
+def test_search_result_with_chunk_offsets():
+    """Chunk start and end offsets passed to SearchResult are preserved."""
+    hit = SearchResult(
+        id=1,
+        doc_type="note",
+        title="Test Note",
+        excerpt="matching chunk text",
+        score=0.9,
+        chunk_start_offset=100,
+        chunk_end_offset=500,
+    )
+
+    offsets = (hit.chunk_start_offset, hit.chunk_end_offset)
+    assert offsets == (100, 500)
diff --git a/tests/unit/test_document_chunker.py b/tests/unit/test_document_chunker.py
new file mode 100644
index 0000000..3b46ab5
--- /dev/null
+++ b/tests/unit/test_document_chunker.py
@@ -0,0 +1,193 @@
+"""Unit tests for DocumentChunker with position tracking."""
+
+import pytest
+
+from nextcloud_mcp_server.vector.document_chunker import (
+    ChunkWithPosition,
+    DocumentChunker,
+)
+
+
+@pytest.mark.unit
+class TestDocumentChunkerPositions:
+    """Test suite for DocumentChunker position tracking functionality."""
+
+    def test_single_chunk_simple_text(self):
+        """Test that single-chunk documents return correct positions."""
+        chunker = DocumentChunker(chunk_size=512, overlap=50)
+        content = "This is a short document."
+
+        chunks = chunker.chunk_text(content)
+
+        assert len(chunks) == 1
+        assert isinstance(chunks[0], ChunkWithPosition)
+        assert chunks[0].text == content
+        assert chunks[0].start_offset == 0
+        assert chunks[0].end_offset == len(content)
+
+    def test_multiple_chunks_positions(self):
+        """Test that multi-chunk documents have correct positions."""
+        chunker = DocumentChunker(chunk_size=10, overlap=2)  # Small chunks for testing
+        # Create content with exactly 30 words
+        words = [f"word{i:02d}" for i in range(30)]
+        content = " ".join(words)
+
+        chunks = chunker.chunk_text(content)
+
+        # Verify we got multiple chunks (30 words, 10 per chunk, 2 overlap = 4 chunks)
+        assert len(chunks) == 4
+
+        # Verify all chunks are ChunkWithPosition
+        for chunk in chunks:
+            assert isinstance(chunk, ChunkWithPosition)
+
+        # Verify first chunk starts at 0
+        assert chunks[0].start_offset == 0
+
+        # Verify last chunk ends at content length
+        assert chunks[-1].end_offset == len(content)
+
+        # Verify chunks are contiguous or overlap (no gaps)
+        for current_chunk, next_chunk in zip(chunks, chunks[1:]):
+            # Next chunk should start at or before current chunk ends
+            assert next_chunk.start_offset <= current_chunk.end_offset
+
+        # Verify we can reconstruct the content using positions
+        for chunk in chunks:
+            extracted = content[chunk.start_offset : chunk.end_offset]
+            assert extracted == chunk.text
+
+    def test_chunk_positions_with_whitespace(self):
+        """Test position tracking with various whitespace."""
+        chunker = DocumentChunker(chunk_size=5, overlap=1)
+        content = "word1 word2\n\nword3\tword4 word5 word6"
+
+        chunks = chunker.chunk_text(content)
+
+        # Verify positions correctly handle whitespace
+        for chunk in chunks:
+            extracted = content[chunk.start_offset : chunk.end_offset]
+            assert extracted == chunk.text
+            # Verify no leading/trailing whitespace unless in original
+            if chunk != chunks[0] and chunk != chunks[-1]:
+                # Middle chunks should be extracted correctly
+                assert len(chunk.text.strip()) > 0
+
+    def test_empty_content(self):
+        """Test that empty content returns empty chunk."""
+        chunker = DocumentChunker(chunk_size=512, overlap=50)
+        content = ""
+
+        chunks = chunker.chunk_text(content)
+
+        assert len(chunks) == 1
+        assert chunks[0].text == ""
+        assert chunks[0].start_offset == 0
+        assert chunks[0].end_offset == 0
+
+    def test_chunk_overlap_positions(self):
+        """Test that consecutive chunks overlap in character positions."""
+        chunker = DocumentChunker(chunk_size=10, overlap=3)
+        words = [f"word{i:02d}" for i in range(25)]
+        content = " ".join(words)
+
+        chunks = chunker.chunk_text(content)
+
+        # Overlap can only be observed when there is more than one chunk
+        assert len(chunks) > 1
+
+        for current_chunk, next_chunk in zip(chunks, chunks[1:]):
+            # Chunks must advance through the document...
+            assert current_chunk.start_offset < next_chunk.start_offset
+            # ...but the next chunk starts before the current one ends, because
+            # the chunker steps back by `overlap` words (the exact character
+            # overlap depends on word lengths)
+            assert next_chunk.start_offset < current_chunk.end_offset
+
+    def test_unicode_content_positions(self):
+        """Test position tracking with Unicode characters."""
+        chunker = DocumentChunker(chunk_size=10, overlap=2)
+        content = "Hello 世界 こんにちは мир Привет שלום مرحبا 你好"
+
+        chunks = chunker.chunk_text(content)
+
+        # Verify all chunks extract correctly
+        for chunk in chunks:
+            extracted = content[chunk.start_offset : chunk.end_offset]
+            assert extracted == chunk.text
+
+        # Verify full coverage
+        if len(chunks) == 1:
+            assert chunks[0].start_offset == 0
+            assert chunks[0].end_offset == len(content)
+
+    def test_single_word_chunks(self):
+        """Test position tracking with single-word chunks."""
+        chunker = DocumentChunker(chunk_size=1, overlap=0)
+        content = "one two three"
+
+        chunks = chunker.chunk_text(content)
+
+        assert len(chunks) == 3
+        assert chunks[0].text == "one"
+        assert chunks[1].text == "two"
+        assert chunks[2].text == "three"
+
+        # Verify positions
+        assert content[chunks[0].start_offset : chunks[0].end_offset] == "one"
+        assert content[chunks[1].start_offset : chunks[1].end_offset] == "two"
+        assert content[chunks[2].start_offset : chunks[2].end_offset] == "three"
+
+    def test_realistic_note_content(self):
+        """Test with realistic note content similar to Nextcloud Notes."""
+        chunker = DocumentChunker(chunk_size=50, overlap=10)
+        content = """My Project Notes
+
+This is a note about my project. It contains several paragraphs of text
+that should be chunked appropriately for embedding.
+
+## Key Points
+
+- First important point with some details
+- Second point that needs to be remembered
+- Third point for future reference
+
+The document continues with more content here. We want to make sure that
+the chunking preserves context across boundaries while maintaining proper
+position tracking for each chunk.
+
+This allows us to highlight the exact chunk that matched a search query,
+which builds trust in the RAG system."""
+
+        chunks = chunker.chunk_text(content)
+
+        # Should have multiple chunks
+        assert len(chunks) > 1
+
+        # Verify all chunks
+        for chunk in chunks:
+            assert isinstance(chunk, ChunkWithPosition)
+            # Verify extraction
+            extracted = content[chunk.start_offset : chunk.end_offset]
+            assert extracted == chunk.text
+            # Verify positions are valid
+            assert chunk.start_offset >= 0
+            assert chunk.end_offset <= len(content)
+            assert chunk.start_offset < chunk.end_offset
+
+    def test_chunk_boundaries(self):
+        """Test that chunk boundaries are word-aligned."""
+        chunker = DocumentChunker(chunk_size=10, overlap=2)
+        words = [f"word{i:02d}" for i in range(30)]
+        content = " ".join(words)
+
+        chunks = chunker.chunk_text(content)
+
+        for chunk in chunks:
+            # Verify chunk text starts and ends with word characters (no split words)
+            # Unless it's the full content
+            if len(chunks) > 1:
+                # Each chunk should start with a word (not whitespace)
+                assert chunk.text[0].strip() != ""
+                # Each chunk should end with a word (not whitespace)
+                assert chunk.text[-1].strip() != ""
diff --git a/third_party/oidc b/third_party/oidc
index 9616294..5670bc7 160000
--- a/third_party/oidc
+++ b/third_party/oidc
@@ -1 +1 @@
-Subproject commit 96162949117d9325e45e06acd3bbdd0fdb20450c
+Subproject commit 5670bc7e30bd475924245086c8e29b90fbdf5454
diff --git a/uv.lock b/uv.lock
index 3e0000c..1b8225f 100644
--- a/uv.lock
+++ b/uv.lock
@@ -1861,12 +1861,15 @@ version = "0.40.0"
source = { editable = "." }
dependencies = [
{ name = "aiosqlite" },
+ { name = "anthropic" },
{ name = "authlib" },
+ { name = "boto3" },
{ name = "caldav" },
{ name = "click" },
{ name = "fastembed" },
{ name = "httpx" },
{ name = "icalendar" },
+ { name = "jinja2" },
{ name = "mcp", extra = ["cli"] },
{ name = "opentelemetry-api" },
{ name = "opentelemetry-exporter-otlp-proto-grpc" },
@@ -1885,8 +1888,6 @@ dependencies = [
[package.dev-dependencies]
dev = [
- { name = "anthropic" },
- { name = "boto3" },
{ name = "commitizen" },
{ name = "datasets" },
{ name = "ipython" },
@@ -1904,12 +1905,15 @@ dev = [
[package.metadata]
requires-dist = [
{ name = "aiosqlite", specifier = ">=0.20.0" },
+ { name = "anthropic", specifier = ">=0.42.0" },
{ name = "authlib", specifier = ">=1.6.5" },
+ { name = "boto3", specifier = ">=1.35.0" },
{ name = "caldav", git = "https://github.com/cbcoutinho/caldav?branch=feature%2Fhttpx" },
{ name = "click", specifier = ">=8.1.8" },
{ name = "fastembed", specifier = ">=0.4.2" },
{ name = "httpx", specifier = ">=0.28.1,<0.29.0" },
{ name = "icalendar", specifier = ">=6.0.0,<7.0.0" },
+ { name = "jinja2", specifier = ">=3.1.6" },
{ name = "mcp", extras = ["cli"], specifier = ">=1.21,<1.22" },
{ name = "opentelemetry-api", specifier = ">=1.28.2" },
{ name = "opentelemetry-exporter-otlp-proto-grpc", specifier = ">=1.28.2" },
@@ -1928,8 +1932,6 @@ requires-dist = [
[package.metadata.requires-dev]
dev = [
- { name = "anthropic", specifier = ">=0.42.0" },
- { name = "boto3", specifier = ">=1.35.0" },
{ name = "commitizen", specifier = ">=4.8.2" },
{ name = "datasets", specifier = ">=3.3.0" },
{ name = "ipython", specifier = ">=9.2.0" },