c272ddd82d
- Add ADR-013 documenting RAG evaluation architecture - Implement two-part evaluation: Context Recall (retrieval) + Answer Correctness (generation) - Create Click CLI for ground truth generation and corpus upload - Add pytest fixtures and tests for retrieval/generation quality - Use BeIR/nfcorpus dataset with 5 selected test queries - Support Ollama and Anthropic LLM providers - Generate synthetic ground truth answers offline - Add comprehensive documentation in tests/rag_evaluation/README.md The framework separates one-time setup (generate/upload) from test execution, making tests much faster (~6-12 min vs ~15-25 min per run). Tests are manual only (not in CI) and require external LLM access. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
146 lines
4.5 KiB
Python
146 lines
4.5 KiB
Python
"""Pytest fixtures for RAG evaluation tests.
|
|
|
|
IMPORTANT: Before running these tests, you must:
|
|
1. Generate ground truth: uv run python tools/rag_eval_cli.py generate
|
|
2. Upload corpus: uv run python tools/rag_eval_cli.py upload --nextcloud-url http://localhost:8000 --username admin --password admin
|
|
|
|
This ensures that the ground truth and note mappings are available.
|
|
"""
|
|
|
|
import json
|
|
from pathlib import Path
|
|
from typing import Any
|
|
|
|
import pytest
|
|
|
|
from tests.rag_evaluation.llm_providers import create_llm_provider
|
|
|
|
# Locations of the pre-generated evaluation artifacts, relative to this module
FIXTURES_DIR = Path(__file__).parent.joinpath("fixtures")
GROUND_TRUTH_FILE = FIXTURES_DIR.joinpath("ground_truth.json")
NOTE_MAPPING_FILE = FIXTURES_DIR.joinpath("note_mapping.json")
|
|
|
|
|
|
@pytest.fixture(scope="session")
def ground_truth_data() -> list[dict[str, Any]]:
    """Load pre-generated ground truth data.

    Reads and parses ``GROUND_TRUTH_FILE`` once per test session.

    Returns:
        The parsed JSON content of ground_truth.json (expected to be a list
        of test cases with query, ground truth answer, and expected doc IDs).

    Raises:
        FileNotFoundError: If ground_truth.json doesn't exist. The message
            includes the CLI command that generates the file.
    """
    if not GROUND_TRUTH_FILE.exists():
        raise FileNotFoundError(
            f"Ground truth file not found: {GROUND_TRUTH_FILE}\n"
            "Run: uv run python tools/rag_eval_cli.py generate"
        )

    # JSON text is UTF-8; pin the encoding so decoding does not silently
    # depend on the platform's locale default.
    with open(GROUND_TRUTH_FILE, encoding="utf-8") as f:
        return json.load(f)
|
|
|
|
|
|
@pytest.fixture(scope="session")
def note_mapping() -> dict[str, int]:
    """Load document ID → note ID mapping.

    Reads and parses ``NOTE_MAPPING_FILE`` once per test session.

    Returns:
        The parsed JSON content of note_mapping.json (expected to be a dict
        mapping nfcorpus document ID to Nextcloud note ID).

    Raises:
        FileNotFoundError: If note_mapping.json doesn't exist. The message
            includes the CLI command that uploads the corpus and writes
            the file.
    """
    if not NOTE_MAPPING_FILE.exists():
        raise FileNotFoundError(
            f"Note mapping file not found: {NOTE_MAPPING_FILE}\n"
            "Run: uv run python tools/rag_eval_cli.py upload --nextcloud-url ... --username ... --password ..."
        )

    # JSON text is UTF-8; pin the encoding so decoding does not silently
    # depend on the platform's locale default.
    with open(NOTE_MAPPING_FILE, encoding="utf-8") as f:
        return json.load(f)
|
|
|
|
|
|
@pytest.fixture(scope="session")
def nfcorpus_test_data(
    ground_truth_data: list[dict[str, Any]],
    note_mapping: dict[str, int],
):
    """Join ground truth entries with their uploaded note IDs.

    Each ground truth entry is copied into a test case and extended with
    ``expected_note_ids``: the note IDs of those expected documents that
    are present in ``note_mapping`` with a non-None value.

    Args:
        ground_truth_data: Pre-generated ground truth answers
        note_mapping: Document ID → note ID mapping

    Returns:
        List of test cases with query, ground truth, expected doc IDs, and note IDs
    """

    def _build_case(entry: dict[str, Any]) -> dict[str, Any]:
        # Resolve note IDs first; documents that were never uploaded (absent
        # from the mapping, or mapped to None) are dropped.
        resolved_ids = [
            note_id
            for document_id in entry["expected_document_ids"]
            if (note_id := note_mapping.get(document_id)) is not None
        ]
        return {
            "query_id": entry["query_id"],
            "query_text": entry["query_text"],
            "ground_truth_answer": entry["ground_truth_answer"],
            "expected_document_ids": entry["expected_document_ids"],
            "expected_note_ids": resolved_ids,
            "highly_relevant_count": entry["highly_relevant_count"],
        }

    return [_build_case(entry) for entry in ground_truth_data]
|
|
|
|
|
|
@pytest.fixture(scope="session")
async def evaluation_llm():
    """Create LLM provider for evaluation (separate from MCP client).

    The provider is built by ``create_llm_provider()``, yielded to the tests,
    and closed during fixture teardown.

    Environment variables (consumed by ``create_llm_provider``; see
    tests/rag_evaluation/llm_providers for the authoritative list):
        RAG_EVAL_PROVIDER: Provider type (ollama or anthropic)
        RAG_EVAL_OLLAMA_BASE_URL: Ollama base URL (or OLLAMA_HOST)
        RAG_EVAL_OLLAMA_MODEL: Ollama model name
        RAG_EVAL_ANTHROPIC_API_KEY: Anthropic API key
        RAG_EVAL_ANTHROPIC_MODEL: Anthropic model name

    Yields:
        LLM provider instance (OllamaProvider or AnthropicProvider)
    """
    llm = create_llm_provider()
    try:
        yield llm
    finally:
        # Release the provider even when the generator is finalized with an
        # exception or aclose() at the yield point, not only on normal
        # teardown.
        await llm.close()
|
|
|
|
|
|
@pytest.fixture(scope="session")
async def mcp_sampling_client():
    """Placeholder for an MCP client with sampling support.

    Sampling support is required for testing the nc_semantic_search_answer
    tool. The client is not implemented yet, so requesting this fixture
    skips the dependent test instead of failing it.

    TODO: Implement MCP client with sampling support. The intended result
    is an MCP client instance with sampling enabled.
    """
    # TODO: Remaining work before a real client can be provided here:
    #   1. Create an MCP client configured for sampling
    #   2. Authenticate with Nextcloud
    #   3. Ensure sampling is enabled
    pytest.skip("MCP sampling client not yet implemented")
|