Files
Chris Coutinho c272ddd82d feat: implement RAG evaluation framework with CLI tooling
- Add ADR-013 documenting RAG evaluation architecture
- Implement two-part evaluation: Context Recall (retrieval) + Answer Correctness (generation)
- Create Click CLI for ground truth generation and corpus upload
- Add pytest fixtures and tests for retrieval/generation quality
- Use BeIR/nfcorpus dataset with 5 selected test queries
- Support Ollama and Anthropic LLM providers
- Generate synthetic ground truth answers offline
- Add comprehensive documentation in tests/rag_evaluation/README.md

The framework separates one-time setup (generate/upload) from test execution,
making tests much faster (~6-12 min vs ~15-25 min per run).

Tests are manual only (not in CI) and require external LLM access.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-15 23:11:21 +01:00

140 lines
4.8 KiB
Python

"""Tests for RAG generation quality (Answer Correctness metric).
These tests evaluate whether the MCP client LLM generates factually correct
answers from retrieved context using the nc_semantic_search_answer tool.
Metric: Answer Correctness
- Measures: Is the generated answer factually correct?
- Method: LLM-as-judge - Compare RAG answer vs ground truth (binary true/false)
- Evaluation: External LLM evaluates semantic equivalence
"""
import pytest
@pytest.mark.integration
async def test_answer_correctness(
    mcp_sampling_client,
    evaluation_llm,
    nfcorpus_test_data,
):
    """Test that RAG system generates factually correct answers.

    For each test query:
    1. Execute full RAG pipeline via nc_semantic_search_answer MCP tool
    2. Extract generated answer from RAG response
    3. Use LLM-as-judge to compare against ground truth (binary true/false)
    4. Record whether the judge considers the answers semantically equivalent

    Every query is evaluated before the test asserts, so the printed summary
    always reflects the full run and the failure message lists every
    mismatching query rather than only the first one.

    This tests the quality of the generation component (MCP client LLM).

    Args:
        mcp_sampling_client: Fixture whose awaitable ``call_tool(name,
            arguments=...)`` result is indexed with ``"generated_answer"``.
        evaluation_llm: Fixture whose awaitable ``generate(prompt,
            max_tokens=...)`` returns the judge's text reply.
        nfcorpus_test_data: Iterable of test cases, each providing
            ``query_id``, ``query_text`` and ``ground_truth_answer``.
    """
    results_summary = []
    failures = []

    for test_case in nfcorpus_test_data:
        query = test_case["query_text"]
        ground_truth = test_case["ground_truth_answer"]

        print(f"\n{'=' * 80}")
        print(f"Query: {query}")

        # Execute full RAG pipeline
        print("Executing RAG pipeline...")
        rag_result = await mcp_sampling_client.call_tool(
            "nc_semantic_search_answer",
            arguments={"query": query, "limit": 5},
        )
        rag_answer = rag_result["generated_answer"]

        print(f"RAG Answer preview: {rag_answer[:200]}...")
        print(f"Ground Truth preview: {ground_truth[:200]}...")

        # LLM-as-judge evaluation
        evaluation_prompt = f"""Compare these two answers and respond with only TRUE or FALSE.
Question: {query}
Generated Answer: {rag_answer}
Ground Truth Answer: {ground_truth}
Are these answers semantically equivalent (do they convey the same factual information)?
Respond with only: TRUE or FALSE"""

        print("Evaluating answer correctness...")
        evaluation_result = await evaluation_llm.generate(
            evaluation_prompt,
            max_tokens=10,
        )
        verdict_text = evaluation_result.strip()

        # Judges frequently decorate the verdict ("TRUE.", "**True**",
        # "True\n..."), so compare only the first token with surrounding
        # punctuation removed. Anything that is not TRUE counts as incorrect.
        verdict_tokens = verdict_text.upper().split()
        first_token = verdict_tokens[0].strip(".,:;!*\"'`()[]") if verdict_tokens else ""
        is_correct = first_token == "TRUE"

        results_summary.append(
            {
                "query_id": test_case["query_id"],
                "query": query,
                "rag_answer_length": len(rag_answer),
                "ground_truth_length": len(ground_truth),
                "is_correct": is_correct,
                "evaluation_result": verdict_text,
            }
        )

        print(f" Evaluation: {verdict_text}")
        print(f" Status: {'✓ CORRECT' if is_correct else '✗ INCORRECT'}")

        # Defer the assertion so one bad answer does not hide the others.
        if not is_correct:
            failures.append(
                f"Answer mismatch for query: {query}\n\n"
                f"Generated Answer:\n{rag_answer}\n\n"
                f"Ground Truth:\n{ground_truth}\n\n"
                f"Evaluation: {verdict_text}"
            )

    # An empty dataset would otherwise surface as a ZeroDivisionError below.
    assert results_summary, "No test cases were evaluated (nfcorpus_test_data is empty)"

    # Print summary
    total = len(results_summary)
    correct_count = sum(r["is_correct"] for r in results_summary)
    accuracy = correct_count / total

    print(f"\n{'=' * 80}")
    print("Answer Correctness Summary:")
    print(f" Total queries: {total}")
    print(f" Correct: {correct_count}")
    print(f" Incorrect: {total - correct_count}")
    print(f" Accuracy: {accuracy:.2%}")
    print(f"{'=' * 80}")

    # Assert answer correctness for the whole run
    separator = "\n\n" + "-" * 80 + "\n\n"
    assert not failures, (
        f"{len(failures)} of {total} answers judged incorrect:\n\n"
        + separator.join(failures)
    )
@pytest.mark.integration
async def test_answer_contains_sources(mcp_sampling_client, nfcorpus_test_data):
    """Verify that each RAG response carries an answer together with its sources.

    A structural sanity check on the nc_semantic_search_answer tool: for every
    test query the response must contain ``generated_answer`` and a non-empty
    ``sources`` collection, and each source must expose some identifier key
    and some content key.
    """
    id_keys = ("document_id", "id")
    content_keys = ("excerpt", "content", "text")

    for case in nfcorpus_test_data:
        query = case["query_text"]

        # Run the full RAG pipeline for this query.
        response = await mcp_sampling_client.call_tool(
            "nc_semantic_search_answer",
            arguments={"query": query, "limit": 5},
        )

        # Top-level response shape.
        assert "generated_answer" in response, "Response missing 'generated_answer'"
        assert "sources" in response, "Response missing 'sources'"

        # At least one source must back the answer.
        cited = response["sources"]
        assert len(cited) > 0, f"No sources returned for query: {query}"

        # Per-source shape: an identifier and some textual content.
        for position, entry in enumerate(cited):
            has_identifier = any(key in entry for key in id_keys)
            assert has_identifier, f"Source {position} missing document ID"

            has_content = any(key in entry for key in content_keys)
            assert has_content, f"Source {position} missing content"

        print(f"Query: {query}")
        print(f" Sources provided: {len(cited)}")
        print(" Status: ✓ PASS")