c272ddd82d
- Add ADR-013 documenting RAG evaluation architecture
- Implement two-part evaluation: Context Recall (retrieval) + Answer Correctness (generation)
- Create Click CLI for ground truth generation and corpus upload
- Add pytest fixtures and tests for retrieval/generation quality
- Use BeIR/nfcorpus dataset with 5 selected test queries
- Support Ollama and Anthropic LLM providers
- Generate synthetic ground truth answers offline
- Add comprehensive documentation in tests/rag_evaluation/README.md

The framework separates one-time setup (generate/upload) from test execution, making tests much faster (~6-12 min vs ~15-25 min per run). Tests are manual only (not in CI) and require external LLM access.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
140 lines
4.8 KiB
Python
140 lines
4.8 KiB
Python
"""Tests for RAG generation quality (Answer Correctness metric).
|
|
|
|
These tests evaluate whether the MCP client LLM generates factually correct
|
|
answers from retrieved context using the nc_semantic_search_answer tool.
|
|
|
|
Metric: Answer Correctness
|
|
- Measures: Is the generated answer factually correct?
|
|
- Method: LLM-as-judge - Compare RAG answer vs ground truth (binary true/false)
|
|
- Evaluation: External LLM evaluates semantic equivalence
|
|
"""
|
|
|
|
import pytest
|
|
|
|
|
|
@pytest.mark.integration
|
|
async def test_answer_correctness(
    mcp_sampling_client,
    evaluation_llm,
    nfcorpus_test_data,
):
    """Test that the RAG system generates factually correct answers.

    For each test query:
    1. Execute the full RAG pipeline via the nc_semantic_search_answer MCP tool
    2. Extract the generated answer from the RAG response
    3. Use LLM-as-judge to compare against ground truth (binary true/false)
    4. Record whether the judge considered the answers semantically equivalent

    All queries are evaluated before the final assertion, so the printed
    summary always reflects the whole data set and a single failing query
    does not hide the results of the remaining ones.

    Args:
        mcp_sampling_client: Fixture exposing an awaitable
            ``call_tool(name, arguments=...)``; the result is indexed with
            ``["generated_answer"]``.
        evaluation_llm: Fixture exposing an awaitable
            ``generate(prompt, max_tokens=...)`` whose result is treated as
            text.
        nfcorpus_test_data: Iterable of test cases providing the keys
            ``query_id``, ``query_text`` and ``ground_truth_answer``.

    Raises:
        AssertionError: If no test cases are supplied, or if the judge did
            not confirm at least one generated answer.
    """
    # Fail with a clear message instead of a ZeroDivisionError in the summary.
    assert nfcorpus_test_data, "nfcorpus_test_data provided no test cases"

    separator = "=" * 80
    results_summary = []
    failures = []

    for test_case in nfcorpus_test_data:
        query = test_case["query_text"]
        ground_truth = test_case["ground_truth_answer"]

        print(f"\n{separator}")
        print(f"Query: {query}")

        # Execute full RAG pipeline
        print("Executing RAG pipeline...")
        rag_result = await mcp_sampling_client.call_tool(
            "nc_semantic_search_answer",
            arguments={"query": query, "limit": 5},
        )
        rag_answer = rag_result["generated_answer"]

        print(f"RAG Answer preview: {rag_answer[:200]}...")
        print(f"Ground Truth preview: {ground_truth[:200]}...")

        # LLM-as-judge evaluation
        print("Evaluating answer correctness...")
        evaluation_result = await evaluation_llm.generate(
            _build_evaluation_prompt(query, rag_answer, ground_truth),
            max_tokens=10,
        )
        verdict_text = evaluation_result.strip()
        is_correct = _parse_judge_verdict(verdict_text)

        results_summary.append(
            {
                "query_id": test_case["query_id"],
                "query": query,
                "rag_answer_length": len(rag_answer),
                "ground_truth_length": len(ground_truth),
                "is_correct": is_correct,
                "evaluation_result": verdict_text,
            }
        )

        print(f" Evaluation: {verdict_text}")
        print(f" Status: {'✓ CORRECT' if is_correct else '✗ INCORRECT'}")

        if not is_correct:
            # Defer the failure so the remaining queries are still evaluated.
            failures.append(
                f"Answer mismatch for query: {query}\n\n"
                f"Generated Answer:\n{rag_answer}\n\n"
                f"Ground Truth:\n{ground_truth}\n\n"
                f"Evaluation: {verdict_text}"
            )

    # Print summary
    total = len(results_summary)
    correct = sum(r["is_correct"] for r in results_summary)
    print(f"\n{separator}")
    print("Answer Correctness Summary:")
    print(f" Total queries: {total}")
    print(f" Correct: {correct}")
    print(f" Incorrect: {total - correct}")
    print(f" Accuracy: {correct / total:.2%}")
    print(separator)

    # Assert answer correctness for the whole run
    assert not failures, (
        f"{len(failures)} of {total} answers judged incorrect\n\n"
        + f"\n\n{separator}\n\n".join(failures)
    )


def _build_evaluation_prompt(query, rag_answer, ground_truth):
    """Return the LLM-as-judge prompt comparing a RAG answer to ground truth."""
    return (
        "Compare these two answers and respond with only TRUE or FALSE.\n"
        "\n"
        f"Question: {query}\n"
        "\n"
        f"Generated Answer: {rag_answer}\n"
        "\n"
        f"Ground Truth Answer: {ground_truth}\n"
        "\n"
        "Are these answers semantically equivalent "
        "(do they convey the same factual information)?\n"
        "Respond with only: TRUE or FALSE"
    )


def _parse_judge_verdict(raw_verdict):
    """Return True only when the first TRUE/FALSE token in the reply is TRUE.

    Judges frequently decorate the verdict ("TRUE.", "**True**",
    "Answer: TRUE"), so the first standalone TRUE/FALSE word is used rather
    than requiring the whole reply to equal "TRUE". A reply containing
    neither word counts as incorrect.
    """
    import re  # local import keeps this block self-contained

    match = re.search(r"\b(TRUE|FALSE)\b", raw_verdict, flags=re.IGNORECASE)
    return match is not None and match.group(1).upper() == "TRUE"
|
|
|
|
|
|
@pytest.mark.integration
|
|
async def test_answer_contains_sources(mcp_sampling_client, nfcorpus_test_data):
    """Check that every RAG response pairs its answer with source documents.

    A structural sanity check on the nc_semantic_search_answer tool: for each
    test query the response must contain ``generated_answer`` and ``sources``,
    the source list must be non-empty, and each source must carry an
    identifier key and a content key.
    """
    # Any one key of each group satisfies the corresponding requirement.
    id_keys = ("document_id", "id")
    content_keys = ("excerpt", "content", "text")

    for case in nfcorpus_test_data:
        query = case["query_text"]

        # Run the RAG tool for this query
        response = await mcp_sampling_client.call_tool(
            "nc_semantic_search_answer",
            arguments={"query": query, "limit": 5},
        )

        # Top-level response fields
        for required_field in ("generated_answer", "sources"):
            assert required_field in response, f"Response missing '{required_field}'"

        # At least one source must come back
        found_sources = response["sources"]
        assert len(found_sources) >= 1, f"No sources returned for query: {query}"

        # Per-source required fields
        for position, entry in enumerate(found_sources):
            has_identifier = any(key in entry for key in id_keys)
            assert has_identifier, f"Source {position} missing document ID"

            has_content = any(key in entry for key in content_keys)
            assert has_content, f"Source {position} missing content"

        print(f"Query: {query}")
        print(f" Sources provided: {len(found_sources)}")
        print(" Status: ✓ PASS")
|