"""Integration tests for RAG pipeline with OpenAI/GitHub Models API. These tests validate the complete semantic search and MCP sampling flow using: 1. OpenAI embeddings for semantic search 2. MCP sampling for answer generation 3. Pre-indexed Nextcloud User Manual as the knowledge base Environment Variables: OPENAI_API_KEY: OpenAI API key or GitHub token for models.github.ai OPENAI_BASE_URL: Base URL override (e.g., "https://models.github.ai/inference") OPENAI_EMBEDDING_MODEL: Embedding model (default: "text-embedding-3-small") OPENAI_GENERATION_MODEL: Generation model for sampling (default: "gpt-4o-mini") For GitHub CI, set: OPENAI_API_KEY: ${{ secrets.GITHUB_TOKEN }} OPENAI_BASE_URL: https://models.github.ai/inference OPENAI_EMBEDDING_MODEL: openai/text-embedding-3-small OPENAI_GENERATION_MODEL: openai/gpt-4o-mini Prerequisites: - Nextcloud User Manual indexed in Qdrant (via vector sync) - VECTOR_SYNC_ENABLED=true on the MCP server """ import json import os from pathlib import Path from typing import Any, AsyncGenerator import pytest from mcp import ClientSession from nextcloud_mcp_server.providers.openai import OpenAIProvider from tests.conftest import create_mcp_client_session from tests.integration.sampling_support import create_openai_sampling_callback # Skip all tests if OpenAI API key not configured pytestmark = [ pytest.mark.integration, pytest.mark.skipif( not os.getenv("OPENAI_API_KEY"), reason="OPENAI_API_KEY not set - skipping OpenAI RAG tests", ), ] # Ground truth fixture path FIXTURES_DIR = Path(__file__).parent / "fixtures" GROUND_TRUTH_FILE = FIXTURES_DIR / "nextcloud_manual_ground_truth.json" @pytest.fixture(scope="module") def ground_truth_qa(): """Load ground truth Q&A pairs for the Nextcloud manual.""" if not GROUND_TRUTH_FILE.exists(): pytest.skip(f"Ground truth file not found: {GROUND_TRUTH_FILE}") with open(GROUND_TRUTH_FILE) as f: return json.load(f) @pytest.fixture(scope="module") async def openai_provider(): """OpenAI provider configured from environment (embeddings only).""" api_key = os.getenv("OPENAI_API_KEY") base_url = os.getenv("OPENAI_BASE_URL") embedding_model = os.getenv("OPENAI_EMBEDDING_MODEL", "text-embedding-3-small") provider = OpenAIProvider( api_key=api_key, base_url=base_url, embedding_model=embedding_model, generation_model=None, # Embeddings only ) yield provider await provider.close() @pytest.fixture(scope="module") async def openai_generation_provider(): """OpenAI provider configured for text generation (for sampling callback).""" api_key = os.getenv("OPENAI_API_KEY") base_url = os.getenv("OPENAI_BASE_URL") generation_model = os.getenv("OPENAI_GENERATION_MODEL", "gpt-4o-mini") # For GitHub Models API, use the prefixed model name if base_url and "models.github.ai" in base_url: if not generation_model.startswith("openai/"): generation_model = f"openai/{generation_model}" provider = OpenAIProvider( api_key=api_key, base_url=base_url, embedding_model=None, # Generation only generation_model=generation_model, ) yield provider await provider.close() @pytest.fixture(scope="module") async def nc_mcp_client_with_sampling( anyio_backend, openai_generation_provider ) -> AsyncGenerator[ClientSession, Any]: """MCP client with OpenAI-based sampling support. This fixture creates an MCP client that can handle sampling requests from the server using OpenAI for text generation. """ sampling_callback = create_openai_sampling_callback(openai_generation_provider) async for session in create_mcp_client_session( url="http://localhost:8000/mcp", client_name="OpenAI Sampling MCP", sampling_callback=sampling_callback, ): yield session async def test_openai_embeddings_work(openai_provider: OpenAIProvider): """Test that OpenAI embeddings can be generated.""" embedding = await openai_provider.embed("test query about Nextcloud") assert isinstance(embedding, list) assert len(embedding) > 0 assert all(isinstance(x, float) for x in embedding) # OpenAI embedding dimensions: 1536 (small) or 3072 (large) assert len(embedding) in [1536, 3072] async def test_semantic_search_retrieval(nc_mcp_client, ground_truth_qa): """Test that semantic search retrieves relevant documents from the manual. This tests the retrieval component of RAG - ensuring that queries return relevant chunks from the indexed Nextcloud User Manual. """ # Use first query from ground truth test_case = ground_truth_qa[0] # 2FA question query = test_case["query"] expected_topics = test_case["expected_topics"] # Perform semantic search via MCP tool result = await nc_mcp_client.call_tool( "nc_semantic_search", arguments={ "query": query, "limit": 5, "score_threshold": 0.0, }, ) assert result.isError is False, f"Tool call failed: {result}" data = result.structuredContent # Verify we got results assert data["success"] is True assert data["total_found"] > 0, f"No results for query: {query}" assert len(data["results"]) > 0 # Check that at least one result contains expected topic keywords all_excerpts = " ".join([r["excerpt"].lower() for r in data["results"]]) topic_found = any(topic.lower() in all_excerpts for topic in expected_topics) assert topic_found, ( f"Expected topics {expected_topics} not found in results for query: {query}" ) async def test_semantic_search_answer_with_sampling( nc_mcp_client_with_sampling, ground_truth_qa ): """Test semantic search with MCP sampling for answer generation. This tests the full RAG pipeline: 1. Semantic search retrieves relevant documents 2. MCP sampling generates an answer from the retrieved context 3. OpenAI generates the answer via the sampling callback Uses nc_mcp_client_with_sampling which has OpenAI-based sampling enabled. """ # Use the 2FA question - has clear expected answer test_case = ground_truth_qa[0] query = test_case["query"] result = await nc_mcp_client_with_sampling.call_tool( "nc_semantic_search_answer", arguments={ "query": query, "limit": 5, "score_threshold": 0.0, "max_answer_tokens": 300, }, ) assert result.isError is False, f"Tool call failed: {result}" data = result.structuredContent # Verify response structure assert data["success"] is True assert "query" in data assert "generated_answer" in data assert "sources" in data assert "search_method" in data # Check for either successful sampling or graceful fallback fallback_methods = { "semantic_sampling_unsupported", "semantic_sampling_user_declined", "semantic_sampling_timeout", "semantic_sampling_mcp_error", "semantic_sampling_fallback", } if data["search_method"] in fallback_methods: # Fallback mode - verify sources still returned assert len(data["sources"]) > 0, "Expected sources even in fallback mode" pytest.skip( f"MCP sampling not available (method: {data['search_method']}), " f"but retrieval succeeded with {len(data['sources'])} sources" ) else: # Successful sampling - verify answer quality assert data["search_method"] == "semantic_sampling" assert data["generated_answer"] is not None assert len(data["generated_answer"]) > 50 # Non-trivial answer # Check answer contains relevant content answer_lower = data["generated_answer"].lower() assert any( keyword in answer_lower for keyword in ["two-factor", "2fa", "authentication", "password"] ), f"Answer doesn't seem relevant to query: {data['generated_answer'][:200]}" @pytest.mark.parametrize( "qa_index,min_expected_results", [ (0, 1), # 2FA question (1, 1), # File quotas question (2, 1), # Linux installation question (3, 1), # Windows requirements question (4, 1), # Client apps with 2FA question ], ) async def test_retrieval_quality_all_queries( nc_mcp_client, ground_truth_qa, qa_index, min_expected_results ): """Test retrieval quality for all ground truth queries. Validates that each query returns at least the minimum expected number of relevant results from the Nextcloud manual. """ if qa_index >= len(ground_truth_qa): pytest.skip(f"Ground truth index {qa_index} not available") test_case = ground_truth_qa[qa_index] query = test_case["query"] result = await nc_mcp_client.call_tool( "nc_semantic_search", arguments={ "query": query, "limit": 5, "score_threshold": 0.0, }, ) assert result.isError is False data = result.structuredContent assert data["total_found"] >= min_expected_results, ( f"Query '{query}' returned {data['total_found']} results, " f"expected at least {min_expected_results}" ) async def test_no_results_for_unrelated_query(nc_mcp_client): """Test that completely unrelated queries return low/no scores. The Nextcloud manual shouldn't have relevant content for quantum physics queries. """ result = await nc_mcp_client.call_tool( "nc_semantic_search", arguments={ "query": "quantum entanglement hadron collider particle physics", "limit": 5, "score_threshold": 0.5, # Higher threshold to filter irrelevant }, ) assert result.isError is False data = result.structuredContent # Should have few or no high-scoring results # Low score threshold means we might get some results, but they should be low quality if data["total_found"] > 0: # If results exist, they should have low scores max_score = max(r["score"] for r in data["results"]) assert max_score < 0.8, f"Unexpected high score {max_score} for unrelated query"