fix: Use WebDAV for tag creation and add LLM-as-a-judge for RAG tests

- Change create_tag() to use a WebDAV POST instead of the OCS API, which returned 404 in some Nextcloud versions
- Add an llm_judge() helper that evaluates system output against ground truth with a simple TRUE/FALSE prompt
- Replace keyword-based assertions in RAG tests with the LLM judge for more flexible semantic evaluation

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-23 01:56:17 +01:00
parent bf2fdac2d0
commit 2ab8dad6a5
2 changed files with 64 additions and 27 deletions
@@ -1398,7 +1398,7 @@ class WebDAVClient(BaseNextcloudClient):
        user_visible: bool = True,
        user_assignable: bool = True,
    ) -> dict[str, Any]:
-        """Create a system tag via OCS API.
+        """Create a system tag via WebDAV.

        Args:
            name: Name of the tag to create
@@ -1411,12 +1411,10 @@ class WebDAVClient(BaseNextcloudClient):
        Raises:
            HTTPStatusError: If tag creation fails (409 if already exists)
        """
+        # Use WebDAV POST with JSON body to create tag
        response = await self._client.post(
-            "/ocs/v2.php/apps/systemtags/api/v1/tags",
-            headers={
-                "OCS-APIRequest": "true",
-                "Content-Type": "application/json",
-            },
+            "/remote.php/dav/systemtags/",
+            headers={"Content-Type": "application/json"},
            json={
                "name": name,
                "userVisible": user_visible,
@@ -1425,15 +1423,21 @@ class WebDAVClient(BaseNextcloudClient):
        )
        response.raise_for_status()

-        # Parse OCS response
-        data = response.json()
-        ocs_data = data.get("ocs", {}).get("data", {})
+        # Extract tag ID from Content-Location header (e.g., /remote.php/dav/systemtags/42)
+        content_location = response.headers.get("Content-Location", "")
+        tag_id = None
+        if content_location:
+            # Extract the numeric ID from the path
+            try:
+                tag_id = int(content_location.rstrip("/").split("/")[-1])
+            except (ValueError, IndexError):
+                pass

        tag_info = {
-            "id": ocs_data.get("id"),
-            "name": ocs_data.get("name", name),
-            "userVisible": ocs_data.get("userVisible", user_visible),
-            "userAssignable": ocs_data.get("userAssignable", user_assignable),
+            "id": tag_id,
+            "name": name,
+            "userVisible": user_visible,
+            "userAssignable": user_assignable,
        }

        logger.info(f"Created tag '{name}' with ID {tag_info['id']}")
@@ -42,6 +42,34 @@ logger = logging.getLogger(__name__)
 # Default path to the Nextcloud User Manual PDF
 DEFAULT_MANUAL_PATH = "Nextcloud Manual.pdf"

+
+async def llm_judge(
+    provider: "OpenAIProvider",
+    ground_truth: str,
+    system_output: str,
+) -> bool:
+    """Use LLM to judge if system output aligns with ground truth.
+
+    Args:
+        provider: OpenAI provider with generation capability
+        ground_truth: The expected/reference answer
+        system_output: The system's actual output to evaluate
+
+    Returns:
+        True if output aligns with ground truth, False otherwise
+    """
+    prompt = f"""GROUND TRUTH: {ground_truth}
+
+SYSTEM OUTPUT: {system_output}
+
+Does the system output contain the key facts from the ground truth?
+
+Answer: TRUE or FALSE"""
+
+    response = await provider.generate(prompt, max_tokens=10)
+    return "TRUE" in response.upper()
+
+
 # Skip all tests if OpenAI API key not configured
 pytestmark = [
    pytest.mark.integration,
@@ -218,7 +246,7 @@ async def test_openai_embeddings_work(openai_provider: OpenAIProvider):


 async def test_semantic_search_retrieval(
-    nc_mcp_client, ground_truth_qa, indexed_manual_pdf
+    nc_mcp_client, ground_truth_qa, indexed_manual_pdf, openai_generation_provider
 ):
    """Test that semantic search retrieves relevant documents from the manual.

@@ -228,7 +256,6 @@ async def test_semantic_search_retrieval(
    # Use first query from ground truth
    test_case = ground_truth_qa[0]  # 2FA question
    query = test_case["query"]
-    expected_topics = test_case["expected_topics"]

    # Perform semantic search via MCP tool
    result = await nc_mcp_client.call_tool(
@@ -248,16 +275,21 @@ async def test_semantic_search_retrieval(
    assert data["total_found"] > 0, f"No results for query: {query}"
    assert len(data["results"]) > 0

-    # Check that at least one result contains expected topic keywords
-    all_excerpts = " ".join([r["excerpt"].lower() for r in data["results"]])
-    topic_found = any(topic.lower() in all_excerpts for topic in expected_topics)
-    assert topic_found, (
-        f"Expected topics {expected_topics} not found in results for query: {query}"
+    # Use LLM judge to evaluate if excerpts are relevant to ground truth
+    all_excerpts = " ".join([r["excerpt"] for r in data["results"]])
+    is_relevant = await llm_judge(
+        openai_generation_provider,
+        test_case["ground_truth"],
+        all_excerpts,
    )
+    assert is_relevant, f"LLM judge: excerpts not relevant to query: {query}"


 async def test_semantic_search_answer_with_sampling(
-    nc_mcp_client_with_sampling, ground_truth_qa, indexed_manual_pdf
+    nc_mcp_client_with_sampling,
+    ground_truth_qa,
+    indexed_manual_pdf,
+    openai_generation_provider,
 ):
    """Test semantic search with MCP sampling for answer generation.

@@ -314,12 +346,13 @@ async def test_semantic_search_answer_with_sampling(
        assert data["generated_answer"] is not None
        assert len(data["generated_answer"]) > 50  # Non-trivial answer

-        # Check answer contains relevant content
-        answer_lower = data["generated_answer"].lower()
-        assert any(
-            keyword in answer_lower
-            for keyword in ["two-factor", "2fa", "authentication", "password"]
-        ), f"Answer doesn't seem relevant to query: {data['generated_answer'][:200]}"
+        # Use LLM judge to evaluate answer relevance
+        is_relevant = await llm_judge(
+            openai_generation_provider,
+            test_case["ground_truth"],
+            data["generated_answer"],
+        )
+        assert is_relevant, f"LLM judge: answer not relevant to query: {query}"


@pytest.mark.parametrize(