fix: Increase MCP sampling timeout to 5 minutes for slower LLMs

- Increase sampling timeout from 30s to 300s in semantic.py to accommodate slower local LLMs like Ollama
- Refactor RAG integration tests to support multiple providers (ollama, openai, anthropic, bedrock)
- Remove unnecessary embedding_provider fixture since MCP server handles embeddings internally
- Add --provider flag via tests/integration/conftest.py
- Add provider_fixtures.py with factory functions for generation providers

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-23 05:43:15 +01:00
parent f5764c01fc
commit 5c73b85f65
7 changed files with 416 additions and 124 deletions
@@ -17,18 +17,20 @@ class AnthropicProvider(Provider):
    Note: Anthropic doesn't provide embedding models, only text generation.
    """

-    def __init__(self, api_key: str, model: str = "claude-3-5-sonnet-20241022"):
+    def __init__(
+        self, api_key: str, generation_model: str = "claude-3-5-sonnet-20241022"
+    ):
        """
        Initialize Anthropic provider.

        Args:
            api_key: Anthropic API key
-            model: Model name (e.g., "claude-3-5-sonnet-20241022")
+            generation_model: Model name (e.g., "claude-3-5-sonnet-20241022")
        """
        self.client = AsyncAnthropic(api_key=api_key)
-        self.model = model
+        self.model = generation_model

-        logger.info(f"Initialized Anthropic provider (model={model})")
+        logger.info(f"Initialized Anthropic provider (model={self.model})")

    @property
    def supports_embeddings(self) -> bool:
@@ -499,9 +499,11 @@ def configure_semantic_tools(mcp: FastMCP):
        )

        # 6. Request LLM completion via MCP sampling with timeout
+        # Note: 5 minute timeout to accommodate slower local LLMs (e.g., Ollama)
+        sampling_timeout_seconds = 300

        try:
-            with anyio.fail_after(30):
+            with anyio.fail_after(sampling_timeout_seconds):
                sampling_result = await ctx.session.create_message(
                    messages=[
                        SamplingMessage(
@@ -548,14 +550,14 @@ def configure_semantic_tools(mcp: FastMCP):

        except TimeoutError:
            logger.warning(
-                f"Sampling request timed out after 30 seconds for query: '{query}', "
+                f"Sampling request timed out after {sampling_timeout_seconds} seconds for query: '{query}', "
                f"returning search results only"
            )
            return SamplingSearchResponse(
                query=query,
                generated_answer=(
                    f"[Sampling request timed out]\n\n"
-                    f"The answer generation took too long (>30s). "
+                    f"The answer generation took too long (>{sampling_timeout_seconds}s). "
                    f"Found {len(accessible_results)} relevant documents. "
                    f"Please review the sources below or try a simpler query."
                ),
@@ -675,15 +677,22 @@ def configure_semantic_tools(mcp: FastMCP):
            # Get Qdrant client and query indexed count
            indexed_count = 0
            try:
+                from qdrant_client.models import Filter
+
                from nextcloud_mcp_server.config import get_settings
+                from nextcloud_mcp_server.vector.placeholder import (
+                    get_placeholder_filter,
+                )
                from nextcloud_mcp_server.vector.qdrant_client import get_qdrant_client

                settings = get_settings()
                qdrant_client = await get_qdrant_client()

-                # Count documents in collection
+                # Count documents in collection, excluding placeholders
+                # Placeholders are zero-vector points used to track processing state
                count_result = await qdrant_client.count(
-                    collection_name=settings.get_collection_name()
+                    collection_name=settings.get_collection_name(),
+                    count_filter=Filter(must=[get_placeholder_filter()]),
                )
                indexed_count = count_result.count