fix: Use WebDAV for tag creation and add LLM-as-a-judge for RAG tests
- Change create_tag() to use a WebDAV POST instead of the OCS API, which returned 404 in some Nextcloud versions
- Add an llm_judge() helper that evaluates system output against ground truth with a simple TRUE/FALSE prompt
- Replace keyword-based assertions in RAG tests with the LLM judge for more flexible semantic evaluation

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -1398,7 +1398,7 @@ class WebDAVClient(BaseNextcloudClient):
|
||||
user_visible: bool = True,
|
||||
user_assignable: bool = True,
|
||||
) -> dict[str, Any]:
|
||||
"""Create a system tag via OCS API.
|
||||
"""Create a system tag via WebDAV.
|
||||
|
||||
Args:
|
||||
name: Name of the tag to create
|
||||
@@ -1411,12 +1411,10 @@ class WebDAVClient(BaseNextcloudClient):
|
||||
Raises:
|
||||
HTTPStatusError: If tag creation fails (409 if already exists)
|
||||
"""
|
||||
# Use WebDAV POST with JSON body to create tag
|
||||
response = await self._client.post(
|
||||
"/ocs/v2.php/apps/systemtags/api/v1/tags",
|
||||
headers={
|
||||
"OCS-APIRequest": "true",
|
||||
"Content-Type": "application/json",
|
||||
},
|
||||
"/remote.php/dav/systemtags/",
|
||||
headers={"Content-Type": "application/json"},
|
||||
json={
|
||||
"name": name,
|
||||
"userVisible": user_visible,
|
||||
@@ -1425,15 +1423,21 @@ class WebDAVClient(BaseNextcloudClient):
|
||||
)
|
||||
response.raise_for_status()
|
||||
|
||||
# Parse OCS response
|
||||
data = response.json()
|
||||
ocs_data = data.get("ocs", {}).get("data", {})
|
||||
# Extract tag ID from Content-Location header (e.g., /remote.php/dav/systemtags/42)
|
||||
content_location = response.headers.get("Content-Location", "")
|
||||
tag_id = None
|
||||
if content_location:
|
||||
# Extract the numeric ID from the path
|
||||
try:
|
||||
tag_id = int(content_location.rstrip("/").split("/")[-1])
|
||||
except (ValueError, IndexError):
|
||||
pass
|
||||
|
||||
tag_info = {
|
||||
"id": ocs_data.get("id"),
|
||||
"name": ocs_data.get("name", name),
|
||||
"userVisible": ocs_data.get("userVisible", user_visible),
|
||||
"userAssignable": ocs_data.get("userAssignable", user_assignable),
|
||||
"id": tag_id,
|
||||
"name": name,
|
||||
"userVisible": user_visible,
|
||||
"userAssignable": user_assignable,
|
||||
}
|
||||
|
||||
logger.info(f"Created tag '{name}' with ID {tag_info['id']}")
|
||||
|
||||
@@ -42,6 +42,34 @@ logger = logging.getLogger(__name__)
|
||||
# Default path to the Nextcloud User Manual PDF
|
||||
DEFAULT_MANUAL_PATH = "Nextcloud Manual.pdf"
|
||||
|
||||
|
||||
async def llm_judge(
    provider: "OpenAIProvider",
    ground_truth: str,
    system_output: str,
) -> bool:
    """Use an LLM to judge whether system output aligns with ground truth.

    The provider is asked a single TRUE/FALSE question. The verdict is the
    first standalone ``TRUE`` or ``FALSE`` word in the reply
    (case-insensitive). A reply containing neither word is treated as
    ``False``.

    Args:
        provider: Object exposing an awaitable ``generate(prompt, max_tokens=...)``
            that returns the model reply as text.
        ground_truth: The expected/reference answer.
        system_output: The system's actual output to evaluate.

    Returns:
        True if the judge's verdict is TRUE, False otherwise.
    """
    import re

    prompt = f"""GROUND TRUTH: {ground_truth}

SYSTEM OUTPUT: {system_output}

Does the system output contain the key facts from the ground truth?

Answer: TRUE or FALSE"""

    response = await provider.generate(prompt, max_tokens=10)

    # A plain substring test would accept replies such as "UNTRUE" or
    # "FALSE - not true". Only the first whole-word verdict counts.
    verdict = re.search(r"\b(TRUE|FALSE)\b", response.upper())
    return verdict is not None and verdict.group(1) == "TRUE"
|
||||
|
||||
|
||||
# Skip all tests if OpenAI API key not configured
|
||||
pytestmark = [
|
||||
pytest.mark.integration,
|
||||
@@ -218,7 +246,7 @@ async def test_openai_embeddings_work(openai_provider: OpenAIProvider):
|
||||
|
||||
|
||||
async def test_semantic_search_retrieval(
|
||||
nc_mcp_client, ground_truth_qa, indexed_manual_pdf
|
||||
nc_mcp_client, ground_truth_qa, indexed_manual_pdf, openai_generation_provider
|
||||
):
|
||||
"""Test that semantic search retrieves relevant documents from the manual.
|
||||
|
||||
@@ -228,7 +256,6 @@ async def test_semantic_search_retrieval(
|
||||
# Use first query from ground truth
|
||||
test_case = ground_truth_qa[0] # 2FA question
|
||||
query = test_case["query"]
|
||||
expected_topics = test_case["expected_topics"]
|
||||
|
||||
# Perform semantic search via MCP tool
|
||||
result = await nc_mcp_client.call_tool(
|
||||
@@ -248,16 +275,21 @@ async def test_semantic_search_retrieval(
|
||||
assert data["total_found"] > 0, f"No results for query: {query}"
|
||||
assert len(data["results"]) > 0
|
||||
|
||||
# Check that at least one result contains expected topic keywords
|
||||
all_excerpts = " ".join([r["excerpt"].lower() for r in data["results"]])
|
||||
topic_found = any(topic.lower() in all_excerpts for topic in expected_topics)
|
||||
assert topic_found, (
|
||||
f"Expected topics {expected_topics} not found in results for query: {query}"
|
||||
# Use LLM judge to evaluate if excerpts are relevant to ground truth
|
||||
all_excerpts = " ".join([r["excerpt"] for r in data["results"]])
|
||||
is_relevant = await llm_judge(
|
||||
openai_generation_provider,
|
||||
test_case["ground_truth"],
|
||||
all_excerpts,
|
||||
)
|
||||
assert is_relevant, f"LLM judge: excerpts not relevant to query: {query}"
|
||||
|
||||
|
||||
async def test_semantic_search_answer_with_sampling(
|
||||
nc_mcp_client_with_sampling, ground_truth_qa, indexed_manual_pdf
|
||||
nc_mcp_client_with_sampling,
|
||||
ground_truth_qa,
|
||||
indexed_manual_pdf,
|
||||
openai_generation_provider,
|
||||
):
|
||||
"""Test semantic search with MCP sampling for answer generation.
|
||||
|
||||
@@ -314,12 +346,13 @@ async def test_semantic_search_answer_with_sampling(
|
||||
assert data["generated_answer"] is not None
|
||||
assert len(data["generated_answer"]) > 50 # Non-trivial answer
|
||||
|
||||
# Check answer contains relevant content
|
||||
answer_lower = data["generated_answer"].lower()
|
||||
assert any(
|
||||
keyword in answer_lower
|
||||
for keyword in ["two-factor", "2fa", "authentication", "password"]
|
||||
), f"Answer doesn't seem relevant to query: {data['generated_answer'][:200]}"
|
||||
# Use LLM judge to evaluate answer relevance
|
||||
is_relevant = await llm_judge(
|
||||
openai_generation_provider,
|
||||
test_case["ground_truth"],
|
||||
data["generated_answer"],
|
||||
)
|
||||
assert is_relevant, f"LLM judge: answer not relevant to query: {query}"
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
|
||||
Reference in New Issue
Block a user