208365cd3d
Adds OpenAI provider to the unified provider architecture (ADR-015), supporting: - OpenAI API (api.openai.com) - GitHub Models API (models.github.ai/inference) - OpenAI-compatible endpoints (Fireworks, Together, etc.) Features: - Embedding support with text-embedding-3-small/large models - Text generation via chat completions API - Automatic retry with exponential backoff for rate limits - Provider auto-detection in registry (priority after Bedrock) Environment variables: - OPENAI_API_KEY: API key (required) - OPENAI_BASE_URL: Base URL override (optional) - OPENAI_EMBEDDING_MODEL: Embedding model (default: text-embedding-3-small) - OPENAI_GENERATION_MODEL: Generation model (default: gpt-4o-mini) Also adds: - Integration tests for RAG pipeline with MCP sampling - MCP client sampling support for integration tests - Ground truth Q&A pairs for Nextcloud User Manual 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
301 lines
10 KiB
Python
301 lines
10 KiB
Python
"""Integration tests for RAG pipeline with OpenAI/GitHub Models API.
|
|
|
|
These tests validate the complete semantic search and MCP sampling flow using:
|
|
1. OpenAI embeddings for semantic search
|
|
2. MCP sampling for answer generation
|
|
3. Pre-indexed Nextcloud User Manual as the knowledge base
|
|
|
|
Environment Variables:
|
|
OPENAI_API_KEY: OpenAI API key or GitHub token for models.github.ai
|
|
OPENAI_BASE_URL: Base URL override (e.g., "https://models.github.ai/inference")
|
|
OPENAI_EMBEDDING_MODEL: Embedding model (default: "text-embedding-3-small")
|
|
OPENAI_GENERATION_MODEL: Generation model for sampling (default: "gpt-4o-mini")
|
|
|
|
For GitHub CI, set:
|
|
OPENAI_API_KEY: ${{ secrets.GITHUB_TOKEN }}
|
|
OPENAI_BASE_URL: https://models.github.ai/inference
|
|
OPENAI_EMBEDDING_MODEL: openai/text-embedding-3-small
|
|
OPENAI_GENERATION_MODEL: openai/gpt-4o-mini
|
|
|
|
Prerequisites:
|
|
- Nextcloud User Manual indexed in Qdrant (via vector sync)
|
|
- VECTOR_SYNC_ENABLED=true on the MCP server
|
|
"""
|
|
|
|
import json
|
|
import os
|
|
from pathlib import Path
|
|
from typing import Any, AsyncGenerator
|
|
|
|
import pytest
|
|
from mcp import ClientSession
|
|
|
|
from nextcloud_mcp_server.providers.openai import OpenAIProvider
|
|
from tests.conftest import create_mcp_client_session
|
|
from tests.integration.sampling_support import create_openai_sampling_callback
|
|
|
|
# Module-wide marks: everything in this file is an integration test, and all
# of it is skipped when no OpenAI-compatible API key is present.
_requires_openai_key = pytest.mark.skipif(
    not os.getenv("OPENAI_API_KEY"),
    reason="OPENAI_API_KEY not set - skipping OpenAI RAG tests",
)

pytestmark = [pytest.mark.integration, _requires_openai_key]
|
|
|
|
# Ground truth Q&A fixture, resolved relative to this test module
FIXTURES_DIR = Path(__file__).parent.joinpath("fixtures")
GROUND_TRUTH_FILE = FIXTURES_DIR.joinpath("nextcloud_manual_ground_truth.json")
|
|
|
|
|
|
@pytest.fixture(scope="module")
def ground_truth_qa() -> list[dict[str, Any]]:
    """Load ground truth Q&A pairs for the Nextcloud manual.

    The requesting tests are skipped (not failed) when the fixture file is
    missing.

    Returns:
        The decoded JSON document from ``GROUND_TRUTH_FILE``. The tests in
        this module index it by position and read the ``"query"`` and
        ``"expected_topics"`` keys of each entry.
    """
    if not GROUND_TRUTH_FILE.exists():
        pytest.skip(f"Ground truth file not found: {GROUND_TRUTH_FILE}")

    # Decode explicitly as UTF-8: the default text encoding is locale
    # dependent, so a bare open() can mis-decode the fixture on some hosts.
    return json.loads(GROUND_TRUTH_FILE.read_text(encoding="utf-8"))
|
|
|
|
|
|
@pytest.fixture(scope="module")
async def openai_provider() -> AsyncGenerator[OpenAIProvider, Any]:
    """OpenAI provider configured from environment (embeddings only).

    Reads ``OPENAI_API_KEY``, ``OPENAI_BASE_URL`` and
    ``OPENAI_EMBEDDING_MODEL`` (default ``"text-embedding-3-small"``).

    Yields:
        An ``OpenAIProvider`` built without a generation model. It is closed
        once the module's tests are done.
    """
    api_key = os.getenv("OPENAI_API_KEY")
    base_url = os.getenv("OPENAI_BASE_URL")
    embedding_model = os.getenv("OPENAI_EMBEDDING_MODEL", "text-embedding-3-small")

    # For GitHub Models API, use the publisher-qualified model name
    # ("openai/<model>"). Names that already carry a publisher are left
    # untouched so other publishers' models are not double-prefixed.
    if base_url and "models.github.ai" in base_url and "/" not in embedding_model:
        embedding_model = f"openai/{embedding_model}"

    provider = OpenAIProvider(
        api_key=api_key,
        base_url=base_url,
        embedding_model=embedding_model,
        generation_model=None,  # Embeddings only
    )

    yield provider
    await provider.close()
|
|
|
|
|
|
@pytest.fixture(scope="module")
async def openai_generation_provider() -> AsyncGenerator[OpenAIProvider, Any]:
    """OpenAI provider configured for text generation (for sampling callback).

    Reads ``OPENAI_API_KEY``, ``OPENAI_BASE_URL`` and
    ``OPENAI_GENERATION_MODEL`` (default ``"gpt-4o-mini"``).

    Yields:
        An ``OpenAIProvider`` built without an embedding model. It is closed
        once the module's tests are done.
    """
    api_key = os.getenv("OPENAI_API_KEY")
    base_url = os.getenv("OPENAI_BASE_URL")
    generation_model = os.getenv("OPENAI_GENERATION_MODEL", "gpt-4o-mini")

    # For GitHub Models API, use the publisher-qualified model name.
    # Only bare names get the "openai/" prefix: a name that already contains
    # a publisher (e.g. "meta/...") must not become "openai/meta/...".
    if base_url and "models.github.ai" in base_url and "/" not in generation_model:
        generation_model = f"openai/{generation_model}"

    provider = OpenAIProvider(
        api_key=api_key,
        base_url=base_url,
        embedding_model=None,  # Generation only
        generation_model=generation_model,
    )

    yield provider
    await provider.close()
|
|
|
|
|
|
@pytest.fixture(scope="module")
async def nc_mcp_client_with_sampling(
    anyio_backend, openai_generation_provider
) -> AsyncGenerator[ClientSession, Any]:
    """Yield an MCP client session that answers sampling requests via OpenAI.

    The sampling callback is built from ``openai_generation_provider`` and
    handed to ``create_mcp_client_session`` together with the server URL and
    client name. ``anyio_backend`` is requested but not used in the body
    (presumably to fix the async backend for this fixture - confirm against
    conftest).
    """
    session_options = {
        "url": "http://localhost:8000/mcp",
        "client_name": "OpenAI Sampling MCP",
        "sampling_callback": create_openai_sampling_callback(
            openai_generation_provider
        ),
    }

    async for client in create_mcp_client_session(**session_options):
        yield client
|
|
|
|
|
|
async def test_openai_embeddings_work(openai_provider: OpenAIProvider):
    """Test that OpenAI embeddings can be generated.

    Checks that ``embed`` returns a non-empty list of floats. When the
    configured embedding model is one of the known OpenAI models, the vector
    length must match that model's default dimension. For any other model
    (e.g. an OpenAI-compatible endpoint) the dimension is not checked.
    """
    embedding = await openai_provider.embed("test query about Nextcloud")

    assert isinstance(embedding, list)
    assert embedding, "Embedding vector is empty"
    assert all(isinstance(x, float) for x in embedding)

    # Default output dimensions of the OpenAI embedding models
    known_dimensions = {
        "text-embedding-3-small": 1536,
        "text-embedding-3-large": 3072,
        "text-embedding-ada-002": 1536,
    }

    # Same env var and default as the openai_provider fixture; drop an
    # optional "publisher/" prefix such as the "openai/" used on GitHub Models.
    configured_model = os.getenv("OPENAI_EMBEDDING_MODEL", "text-embedding-3-small")
    model_name = configured_model.rsplit("/", 1)[-1]

    expected_dimension = known_dimensions.get(model_name)
    if expected_dimension is not None:
        assert len(embedding) == expected_dimension, (
            f"Model {configured_model} returned {len(embedding)} dimensions, "
            f"expected {expected_dimension}"
        )
|
|
|
|
|
|
async def test_semantic_search_retrieval(nc_mcp_client, ground_truth_qa):
    """Check the retrieval half of RAG against the indexed manual.

    Runs the first ground truth query through the ``nc_semantic_search`` tool
    and requires that at least one of the expected topic keywords shows up in
    the returned excerpts (case-insensitive).
    """
    first_case = ground_truth_qa[0]  # 2FA question
    question, topics = first_case["query"], first_case["expected_topics"]

    # Run the search through the MCP tool, with no score filtering
    search_args = {"query": question, "limit": 5, "score_threshold": 0.0}
    response = await nc_mcp_client.call_tool(
        "nc_semantic_search", arguments=search_args
    )

    assert response.isError is False, f"Tool call failed: {response}"
    payload = response.structuredContent

    # The search must succeed and return at least one hit
    assert payload["success"] is True
    assert payload["total_found"] > 0, f"No results for query: {question}"
    hits = payload["results"]
    assert len(hits) > 0

    # Look for any expected topic across the combined, lower-cased excerpts
    haystack = " ".join(hit["excerpt"].lower() for hit in hits)
    matched = False
    for topic in topics:
        if topic.lower() in haystack:
            matched = True
            break

    assert matched, (
        f"Expected topics {topics} not found in results for query: {question}"
    )
|
|
|
|
|
|
async def test_semantic_search_answer_with_sampling(
    nc_mcp_client_with_sampling, ground_truth_qa
):
    """Exercise the full RAG pipeline through ``nc_semantic_search_answer``.

    Steps covered:
    1. Semantic search retrieves relevant documents
    2. MCP sampling generates an answer from the retrieved context
    3. OpenAI produces that answer via the client's sampling callback

    If the server reports one of the fallback search methods, the test only
    checks that sources were returned and is then skipped.
    """
    # The 2FA question has a clear expected answer
    question = ground_truth_qa[0]["query"]

    tool_args = {
        "query": question,
        "limit": 5,
        "score_threshold": 0.0,
        "max_answer_tokens": 300,
    }
    response = await nc_mcp_client_with_sampling.call_tool(
        "nc_semantic_search_answer", arguments=tool_args
    )

    assert response.isError is False, f"Tool call failed: {response}"
    payload = response.structuredContent

    # Response structure
    assert payload["success"] is True
    for required_key in ("query", "generated_answer", "sources", "search_method"):
        assert required_key in payload

    # Methods that mean sampling did not happen but the tool degraded gracefully
    fallback_methods = {
        "semantic_sampling_unsupported",
        "semantic_sampling_user_declined",
        "semantic_sampling_timeout",
        "semantic_sampling_mcp_error",
        "semantic_sampling_fallback",
    }

    method = payload["search_method"]
    if method in fallback_methods:
        # Fallback mode - sources must still be present; skip the rest
        source_count = len(payload["sources"])
        assert source_count > 0, "Expected sources even in fallback mode"
        pytest.skip(
            f"MCP sampling not available (method: {method}), "
            f"but retrieval succeeded with {source_count} sources"
        )

    # Sampling succeeded - check the answer itself
    assert method == "semantic_sampling"
    answer = payload["generated_answer"]
    assert answer is not None
    assert len(answer) > 50  # Non-trivial answer

    # The answer should mention at least one topic-related keyword
    answer_lower = answer.lower()
    relevant_keywords = ["two-factor", "2fa", "authentication", "password"]
    assert any(
        keyword in answer_lower for keyword in relevant_keywords
    ), f"Answer doesn't seem relevant to query: {answer[:200]}"
|
|
|
|
|
|
@pytest.mark.parametrize(
    "qa_index,min_expected_results",
    [
        (0, 1),  # 2FA question
        (1, 1),  # File quotas question
        (2, 1),  # Linux installation question
        (3, 1),  # Windows requirements question
        (4, 1),  # Client apps with 2FA question
    ],
)
async def test_retrieval_quality_all_queries(
    nc_mcp_client, ground_truth_qa, qa_index, min_expected_results
):
    """Check that every ground truth query retrieves enough results.

    Each parametrized case runs one ground truth query through
    ``nc_semantic_search`` and requires ``total_found`` to reach the given
    minimum. A case is skipped when the ground truth data has no entry at
    its index.
    """
    if not qa_index < len(ground_truth_qa):
        pytest.skip(f"Ground truth index {qa_index} not available")

    question = ground_truth_qa[qa_index]["query"]

    search_args = {"query": question, "limit": 5, "score_threshold": 0.0}
    response = await nc_mcp_client.call_tool(
        "nc_semantic_search", arguments=search_args
    )

    assert response.isError is False
    payload = response.structuredContent

    assert payload["total_found"] >= min_expected_results, (
        f"Query '{question}' returned {payload['total_found']} results, "
        f"expected at least {min_expected_results}"
    )
|
|
|
|
|
|
async def test_no_results_for_unrelated_query(nc_mcp_client):
    """Check that an off-topic query does not produce high-scoring hits.

    A particle-physics query is searched with a 0.5 score threshold. The
    search may return nothing; if it does return hits, the best score must
    stay below 0.8.
    """
    search_args = {
        "query": "quantum entanglement hadron collider particle physics",
        "limit": 5,
        "score_threshold": 0.5,  # Higher threshold to filter irrelevant
    }
    response = await nc_mcp_client.call_tool(
        "nc_semantic_search", arguments=search_args
    )

    assert response.isError is False
    payload = response.structuredContent

    # No hits at all is an acceptable outcome for an unrelated query
    if payload["total_found"] <= 0:
        return

    # Hits that passed the threshold must still not look like strong matches
    scores = [hit["score"] for hit in payload["results"]]
    max_score = max(scores)
    assert max_score < 0.8, f"Unexpected high score {max_score} for unrelated query"
|