Files
nextcloud-mcp-server/tests/integration/test_rag_openai.py
T
Chris Coutinho 208365cd3d feat: Add OpenAI provider support for embeddings and generation
Adds OpenAI provider to the unified provider architecture (ADR-015),
supporting:
- OpenAI API (api.openai.com)
- GitHub Models API (models.github.ai/inference)
- OpenAI-compatible endpoints (Fireworks, Together, etc.)

Features:
- Embedding support with text-embedding-3-small/large models
- Text generation via chat completions API
- Automatic retry with exponential backoff for rate limits
- Provider auto-detection in registry (priority after Bedrock)

Environment variables:
- OPENAI_API_KEY: API key (required)
- OPENAI_BASE_URL: Base URL override (optional)
- OPENAI_EMBEDDING_MODEL: Embedding model (default: text-embedding-3-small)
- OPENAI_GENERATION_MODEL: Generation model (default: gpt-4o-mini)

Also adds:
- Integration tests for RAG pipeline with MCP sampling
- MCP client sampling support for integration tests
- Ground truth Q&A pairs for Nextcloud User Manual

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-23 00:33:32 +01:00

301 lines
10 KiB
Python

"""Integration tests for RAG pipeline with OpenAI/GitHub Models API.
These tests validate the complete semantic search and MCP sampling flow using:
1. OpenAI embeddings for semantic search
2. MCP sampling for answer generation
3. Pre-indexed Nextcloud User Manual as the knowledge base
Environment Variables:
OPENAI_API_KEY: OpenAI API key or GitHub token for models.github.ai
OPENAI_BASE_URL: Base URL override (e.g., "https://models.github.ai/inference")
OPENAI_EMBEDDING_MODEL: Embedding model (default: "text-embedding-3-small")
OPENAI_GENERATION_MODEL: Generation model for sampling (default: "gpt-4o-mini")
For GitHub CI, set:
OPENAI_API_KEY: ${{ secrets.GITHUB_TOKEN }}
OPENAI_BASE_URL: https://models.github.ai/inference
OPENAI_EMBEDDING_MODEL: openai/text-embedding-3-small
OPENAI_GENERATION_MODEL: openai/gpt-4o-mini
Prerequisites:
- Nextcloud User Manual indexed in Qdrant (via vector sync)
- VECTOR_SYNC_ENABLED=true on the MCP server
"""
import json
import os
from pathlib import Path
from typing import Any, AsyncGenerator
import pytest
from mcp import ClientSession
from nextcloud_mcp_server.providers.openai import OpenAIProvider
from tests.conftest import create_mcp_client_session
from tests.integration.sampling_support import create_openai_sampling_callback
# Skip all tests if OpenAI API key not configured
pytestmark = [
pytest.mark.integration,
pytest.mark.skipif(
not os.getenv("OPENAI_API_KEY"),
reason="OPENAI_API_KEY not set - skipping OpenAI RAG tests",
),
]
# Ground truth fixture path
FIXTURES_DIR = Path(__file__).parent / "fixtures"
GROUND_TRUTH_FILE = FIXTURES_DIR / "nextcloud_manual_ground_truth.json"
@pytest.fixture(scope="module")
def ground_truth_qa():
"""Load ground truth Q&A pairs for the Nextcloud manual."""
if not GROUND_TRUTH_FILE.exists():
pytest.skip(f"Ground truth file not found: {GROUND_TRUTH_FILE}")
with open(GROUND_TRUTH_FILE) as f:
return json.load(f)
@pytest.fixture(scope="module")
async def openai_provider():
"""OpenAI provider configured from environment (embeddings only)."""
api_key = os.getenv("OPENAI_API_KEY")
base_url = os.getenv("OPENAI_BASE_URL")
embedding_model = os.getenv("OPENAI_EMBEDDING_MODEL", "text-embedding-3-small")
provider = OpenAIProvider(
api_key=api_key,
base_url=base_url,
embedding_model=embedding_model,
generation_model=None, # Embeddings only
)
yield provider
await provider.close()
@pytest.fixture(scope="module")
async def openai_generation_provider():
"""OpenAI provider configured for text generation (for sampling callback)."""
api_key = os.getenv("OPENAI_API_KEY")
base_url = os.getenv("OPENAI_BASE_URL")
generation_model = os.getenv("OPENAI_GENERATION_MODEL", "gpt-4o-mini")
# For GitHub Models API, use the prefixed model name
if base_url and "models.github.ai" in base_url:
if not generation_model.startswith("openai/"):
generation_model = f"openai/{generation_model}"
provider = OpenAIProvider(
api_key=api_key,
base_url=base_url,
embedding_model=None, # Generation only
generation_model=generation_model,
)
yield provider
await provider.close()
@pytest.fixture(scope="module")
async def nc_mcp_client_with_sampling(
anyio_backend, openai_generation_provider
) -> AsyncGenerator[ClientSession, Any]:
"""MCP client with OpenAI-based sampling support.
This fixture creates an MCP client that can handle sampling requests
from the server using OpenAI for text generation.
"""
sampling_callback = create_openai_sampling_callback(openai_generation_provider)
async for session in create_mcp_client_session(
url="http://localhost:8000/mcp",
client_name="OpenAI Sampling MCP",
sampling_callback=sampling_callback,
):
yield session
async def test_openai_embeddings_work(openai_provider: OpenAIProvider):
"""Test that OpenAI embeddings can be generated."""
embedding = await openai_provider.embed("test query about Nextcloud")
assert isinstance(embedding, list)
assert len(embedding) > 0
assert all(isinstance(x, float) for x in embedding)
# OpenAI embedding dimensions: 1536 (small) or 3072 (large)
assert len(embedding) in [1536, 3072]
async def test_semantic_search_retrieval(nc_mcp_client, ground_truth_qa):
"""Test that semantic search retrieves relevant documents from the manual.
This tests the retrieval component of RAG - ensuring that queries
return relevant chunks from the indexed Nextcloud User Manual.
"""
# Use first query from ground truth
test_case = ground_truth_qa[0] # 2FA question
query = test_case["query"]
expected_topics = test_case["expected_topics"]
# Perform semantic search via MCP tool
result = await nc_mcp_client.call_tool(
"nc_semantic_search",
arguments={
"query": query,
"limit": 5,
"score_threshold": 0.0,
},
)
assert result.isError is False, f"Tool call failed: {result}"
data = result.structuredContent
# Verify we got results
assert data["success"] is True
assert data["total_found"] > 0, f"No results for query: {query}"
assert len(data["results"]) > 0
# Check that at least one result contains expected topic keywords
all_excerpts = " ".join([r["excerpt"].lower() for r in data["results"]])
topic_found = any(topic.lower() in all_excerpts for topic in expected_topics)
assert topic_found, (
f"Expected topics {expected_topics} not found in results for query: {query}"
)
async def test_semantic_search_answer_with_sampling(
nc_mcp_client_with_sampling, ground_truth_qa
):
"""Test semantic search with MCP sampling for answer generation.
This tests the full RAG pipeline:
1. Semantic search retrieves relevant documents
2. MCP sampling generates an answer from the retrieved context
3. OpenAI generates the answer via the sampling callback
Uses nc_mcp_client_with_sampling which has OpenAI-based sampling enabled.
"""
# Use the 2FA question - has clear expected answer
test_case = ground_truth_qa[0]
query = test_case["query"]
result = await nc_mcp_client_with_sampling.call_tool(
"nc_semantic_search_answer",
arguments={
"query": query,
"limit": 5,
"score_threshold": 0.0,
"max_answer_tokens": 300,
},
)
assert result.isError is False, f"Tool call failed: {result}"
data = result.structuredContent
# Verify response structure
assert data["success"] is True
assert "query" in data
assert "generated_answer" in data
assert "sources" in data
assert "search_method" in data
# Check for either successful sampling or graceful fallback
fallback_methods = {
"semantic_sampling_unsupported",
"semantic_sampling_user_declined",
"semantic_sampling_timeout",
"semantic_sampling_mcp_error",
"semantic_sampling_fallback",
}
if data["search_method"] in fallback_methods:
# Fallback mode - verify sources still returned
assert len(data["sources"]) > 0, "Expected sources even in fallback mode"
pytest.skip(
f"MCP sampling not available (method: {data['search_method']}), "
f"but retrieval succeeded with {len(data['sources'])} sources"
)
else:
# Successful sampling - verify answer quality
assert data["search_method"] == "semantic_sampling"
assert data["generated_answer"] is not None
assert len(data["generated_answer"]) > 50 # Non-trivial answer
# Check answer contains relevant content
answer_lower = data["generated_answer"].lower()
assert any(
keyword in answer_lower
for keyword in ["two-factor", "2fa", "authentication", "password"]
), f"Answer doesn't seem relevant to query: {data['generated_answer'][:200]}"
@pytest.mark.parametrize(
"qa_index,min_expected_results",
[
(0, 1), # 2FA question
(1, 1), # File quotas question
(2, 1), # Linux installation question
(3, 1), # Windows requirements question
(4, 1), # Client apps with 2FA question
],
)
async def test_retrieval_quality_all_queries(
nc_mcp_client, ground_truth_qa, qa_index, min_expected_results
):
"""Test retrieval quality for all ground truth queries.
Validates that each query returns at least the minimum expected
number of relevant results from the Nextcloud manual.
"""
if qa_index >= len(ground_truth_qa):
pytest.skip(f"Ground truth index {qa_index} not available")
test_case = ground_truth_qa[qa_index]
query = test_case["query"]
result = await nc_mcp_client.call_tool(
"nc_semantic_search",
arguments={
"query": query,
"limit": 5,
"score_threshold": 0.0,
},
)
assert result.isError is False
data = result.structuredContent
assert data["total_found"] >= min_expected_results, (
f"Query '{query}' returned {data['total_found']} results, "
f"expected at least {min_expected_results}"
)
async def test_no_results_for_unrelated_query(nc_mcp_client):
"""Test that completely unrelated queries return low/no scores.
The Nextcloud manual shouldn't have relevant content for
quantum physics queries.
"""
result = await nc_mcp_client.call_tool(
"nc_semantic_search",
arguments={
"query": "quantum entanglement hadron collider particle physics",
"limit": 5,
"score_threshold": 0.5, # Higher threshold to filter irrelevant
},
)
assert result.isError is False
data = result.structuredContent
# Should have few or no high-scoring results
# Low score threshold means we might get some results, but they should be low quality
if data["total_found"] > 0:
# If results exist, they should have low scores
max_score = max(r["score"] for r in data["results"])
assert max_score < 0.8, f"Unexpected high score {max_score} for unrelated query"