nextcloud-mcp-server/tests/rag_evaluation/test_generation_quality.py

"""Tests for RAG generation quality (Answer Correctness metric).

These tests evaluate whether the MCP client LLM generates factually correct
answers from retrieved context using the nc_semantic_search_answer tool.

Metric: Answer Correctness
- Measures: Is the generated answer factually correct?
- Method: LLM-as-judge - Compare RAG answer vs ground truth (binary true/false)
- Evaluation: External LLM evaluates semantic equivalence
"""

import pytest


@pytest.mark.integration
async def test_answer_correctness(
    mcp_sampling_client,
    evaluation_llm,
    nfcorpus_test_data,
):
    """Test that RAG system generates factually correct answers.

    For each test query:
    1. Execute full RAG pipeline via nc_semantic_search_answer MCP tool
    2. Extract generated answer from RAG response
    3. Use LLM-as-judge to compare against ground truth (binary true/false)
    4. Assert answer is semantically equivalent to ground truth

    This tests the quality of the generation component (MCP client LLM).
    """
    results_summary = []

    for test_case in nfcorpus_test_data:
        query = test_case["query_text"]
        ground_truth = test_case["ground_truth_answer"]

        print(f"\n{'=' * 80}")
        print(f"Query: {query}")

        # Execute full RAG pipeline
        print("Executing RAG pipeline...")
        rag_result = await mcp_sampling_client.call_tool(
            "nc_semantic_search_answer",
            arguments={"query": query, "limit": 5},
        )

        rag_answer = rag_result["generated_answer"]

        print(f"RAG Answer preview: {rag_answer[:200]}...")
        print(f"Ground Truth preview: {ground_truth[:200]}...")

        # LLM-as-judge evaluation
        evaluation_prompt = f"""Compare these two answers and respond with only TRUE or FALSE.

Question: {query}

Generated Answer: {rag_answer}

Ground Truth Answer: {ground_truth}

Are these answers semantically equivalent (do they convey the same factual information)?
Respond with only: TRUE or FALSE"""

        print("Evaluating answer correctness...")
        evaluation_result = await evaluation_llm.generate(
            evaluation_prompt,
            max_tokens=10,
        )

        is_correct = evaluation_result.strip().upper() == "TRUE"

        result = {
            "query_id": test_case["query_id"],
            "query": query,
            "rag_answer_length": len(rag_answer),
            "ground_truth_length": len(ground_truth),
            "is_correct": is_correct,
            "evaluation_result": evaluation_result.strip(),
        }
        results_summary.append(result)

        print(f"  Evaluation: {evaluation_result.strip()}")
        print(f"  Status: {'✓ CORRECT' if is_correct else '✗ INCORRECT'}")

        # Assert answer correctness
        assert is_correct, (
            f"Answer mismatch for query: {query}\n\n"
            f"Generated Answer:\n{rag_answer}\n\n"
            f"Ground Truth:\n{ground_truth}\n\n"
            f"Evaluation: {evaluation_result.strip()}"
        )

    # Print summary
    print(f"\n{'=' * 80}")
    print("Answer Correctness Summary:")
    print(f"  Total queries: {len(results_summary)}")
    print(f"  Correct: {sum(r['is_correct'] for r in results_summary)}")
    print(f"  Incorrect: {sum(not r['is_correct'] for r in results_summary)}")
    accuracy = sum(r["is_correct"] for r in results_summary) / len(results_summary)
    print(f"  Accuracy: {accuracy:.2%}")
    print(f"{'=' * 80}")


@pytest.mark.integration
async def test_answer_contains_sources(mcp_sampling_client, nfcorpus_test_data):
    """Test that RAG answers include source citations.

    This is a basic quality check - we verify that the nc_semantic_search_answer
    tool returns both a generated answer and source documents.
    """
    for test_case in nfcorpus_test_data:
        query = test_case["query_text"]

        # Execute RAG pipeline
        rag_result = await mcp_sampling_client.call_tool(
            "nc_semantic_search_answer",
            arguments={"query": query, "limit": 5},
        )

        # Check response structure
        assert "generated_answer" in rag_result, "Response missing 'generated_answer'"
        assert "sources" in rag_result, "Response missing 'sources'"

        # Check sources are provided
        sources = rag_result["sources"]
        assert len(sources) > 0, f"No sources returned for query: {query}"

        # Check each source has required fields
        for i, source in enumerate(sources):
            assert "document_id" in source or "id" in source, (
                f"Source {i} missing document ID"
            )
            assert "excerpt" in source or "content" in source or "text" in source, (
                f"Source {i} missing content"
            )

        print(f"Query: {query}")
        print(f"  Sources provided: {len(sources)}")
        print("  Status: ✓ PASS")