fix: Use WebDAV for tag creation and add LLM-as-a-judge for RAG tests

- Change create_tag() to use WebDAV POST instead of OCS API which
  returned 404 in some Nextcloud versions
- Add llm_judge() helper that evaluates system output against ground
  truth with simple TRUE/FALSE prompt
- Replace keyword-based assertions in RAG tests with LLM judge for
  more flexible semantic evaluation

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
Chris Coutinho
2025-11-23 01:56:17 +01:00
parent bf2fdac2d0
commit 2ab8dad6a5
2 changed files with 64 additions and 27 deletions
+17 -13
View File
@@ -1398,7 +1398,7 @@ class WebDAVClient(BaseNextcloudClient):
user_visible: bool = True,
user_assignable: bool = True,
) -> dict[str, Any]:
"""Create a system tag via OCS API.
"""Create a system tag via WebDAV.
Args:
name: Name of the tag to create
@@ -1411,12 +1411,10 @@ class WebDAVClient(BaseNextcloudClient):
Raises:
HTTPStatusError: If tag creation fails (409 if already exists)
"""
# Use WebDAV POST with JSON body to create tag
response = await self._client.post(
"/ocs/v2.php/apps/systemtags/api/v1/tags",
headers={
"OCS-APIRequest": "true",
"Content-Type": "application/json",
},
"/remote.php/dav/systemtags/",
headers={"Content-Type": "application/json"},
json={
"name": name,
"userVisible": user_visible,
@@ -1425,15 +1423,21 @@ class WebDAVClient(BaseNextcloudClient):
)
response.raise_for_status()
# Parse OCS response
data = response.json()
ocs_data = data.get("ocs", {}).get("data", {})
# Extract tag ID from Content-Location header (e.g., /remote.php/dav/systemtags/42)
content_location = response.headers.get("Content-Location", "")
tag_id = None
if content_location:
# Extract the numeric ID from the path
try:
tag_id = int(content_location.rstrip("/").split("/")[-1])
except (ValueError, IndexError):
pass
tag_info = {
"id": ocs_data.get("id"),
"name": ocs_data.get("name", name),
"userVisible": ocs_data.get("userVisible", user_visible),
"userAssignable": ocs_data.get("userAssignable", user_assignable),
"id": tag_id,
"name": name,
"userVisible": user_visible,
"userAssignable": user_assignable,
}
logger.info(f"Created tag '{name}' with ID {tag_info['id']}")
+47 -14
View File
@@ -42,6 +42,34 @@ logger = logging.getLogger(__name__)
# Default path to the Nextcloud User Manual PDF
DEFAULT_MANUAL_PATH = "Nextcloud Manual.pdf"
async def llm_judge(
provider: "OpenAIProvider",
ground_truth: str,
system_output: str,
) -> bool:
"""Use LLM to judge if system output aligns with ground truth.
Args:
provider: OpenAI provider with generation capability
ground_truth: The expected/reference answer
system_output: The system's actual output to evaluate
Returns:
True if output aligns with ground truth, False otherwise
"""
prompt = f"""GROUND TRUTH: {ground_truth}
SYSTEM OUTPUT: {system_output}
Does the system output contain the key facts from the ground truth?
Answer: TRUE or FALSE"""
response = await provider.generate(prompt, max_tokens=10)
return "TRUE" in response.upper()
# Skip all tests if OpenAI API key not configured
pytestmark = [
pytest.mark.integration,
@@ -218,7 +246,7 @@ async def test_openai_embeddings_work(openai_provider: OpenAIProvider):
async def test_semantic_search_retrieval(
nc_mcp_client, ground_truth_qa, indexed_manual_pdf
nc_mcp_client, ground_truth_qa, indexed_manual_pdf, openai_generation_provider
):
"""Test that semantic search retrieves relevant documents from the manual.
@@ -228,7 +256,6 @@ async def test_semantic_search_retrieval(
# Use first query from ground truth
test_case = ground_truth_qa[0] # 2FA question
query = test_case["query"]
expected_topics = test_case["expected_topics"]
# Perform semantic search via MCP tool
result = await nc_mcp_client.call_tool(
@@ -248,16 +275,21 @@ async def test_semantic_search_retrieval(
assert data["total_found"] > 0, f"No results for query: {query}"
assert len(data["results"]) > 0
# Check that at least one result contains expected topic keywords
all_excerpts = " ".join([r["excerpt"].lower() for r in data["results"]])
topic_found = any(topic.lower() in all_excerpts for topic in expected_topics)
assert topic_found, (
f"Expected topics {expected_topics} not found in results for query: {query}"
# Use LLM judge to evaluate if excerpts are relevant to ground truth
all_excerpts = " ".join([r["excerpt"] for r in data["results"]])
is_relevant = await llm_judge(
openai_generation_provider,
test_case["ground_truth"],
all_excerpts,
)
assert is_relevant, f"LLM judge: excerpts not relevant to query: {query}"
async def test_semantic_search_answer_with_sampling(
nc_mcp_client_with_sampling, ground_truth_qa, indexed_manual_pdf
nc_mcp_client_with_sampling,
ground_truth_qa,
indexed_manual_pdf,
openai_generation_provider,
):
"""Test semantic search with MCP sampling for answer generation.
@@ -314,12 +346,13 @@ async def test_semantic_search_answer_with_sampling(
assert data["generated_answer"] is not None
assert len(data["generated_answer"]) > 50 # Non-trivial answer
# Check answer contains relevant content
answer_lower = data["generated_answer"].lower()
assert any(
keyword in answer_lower
for keyword in ["two-factor", "2fa", "authentication", "password"]
), f"Answer doesn't seem relevant to query: {data['generated_answer'][:200]}"
# Use LLM judge to evaluate answer relevance
is_relevant = await llm_judge(
openai_generation_provider,
test_case["ground_truth"],
data["generated_answer"],
)
assert is_relevant, f"LLM judge: answer not relevant to query: {query}"
@pytest.mark.parametrize(