diff --git a/nextcloud_mcp_server/client/webdav.py b/nextcloud_mcp_server/client/webdav.py
index 5a5f0cd..e8b3f6f 100644
--- a/nextcloud_mcp_server/client/webdav.py
+++ b/nextcloud_mcp_server/client/webdav.py
@@ -1398,7 +1398,7 @@ class WebDAVClient(BaseNextcloudClient):
         user_visible: bool = True,
         user_assignable: bool = True,
     ) -> dict[str, Any]:
-        """Create a system tag via OCS API.
+        """Create a system tag via WebDAV.
 
         Args:
             name: Name of the tag to create
@@ -1411,12 +1411,10 @@ class WebDAVClient(BaseNextcloudClient):
         Raises:
             HTTPStatusError: If tag creation fails (409 if already exists)
         """
+        # Use WebDAV POST with JSON body to create tag
         response = await self._client.post(
-            "/ocs/v2.php/apps/systemtags/api/v1/tags",
-            headers={
-                "OCS-APIRequest": "true",
-                "Content-Type": "application/json",
-            },
+            "/remote.php/dav/systemtags/",
+            headers={"Content-Type": "application/json"},
             json={
                 "name": name,
                 "userVisible": user_visible,
@@ -1425,15 +1423,21 @@ class WebDAVClient(BaseNextcloudClient):
         )
         response.raise_for_status()
 
-        # Parse OCS response
-        data = response.json()
-        ocs_data = data.get("ocs", {}).get("data", {})
+        # Extract tag ID from Content-Location header (e.g., /remote.php/dav/systemtags/42)
+        content_location = response.headers.get("Content-Location", "")
+        tag_id = None
+        if content_location:
+            # Extract the numeric ID from the path
+            try:
+                tag_id = int(content_location.rstrip("/").split("/")[-1])
+            except (ValueError, IndexError):
+                pass
 
         tag_info = {
-            "id": ocs_data.get("id"),
-            "name": ocs_data.get("name", name),
-            "userVisible": ocs_data.get("userVisible", user_visible),
-            "userAssignable": ocs_data.get("userAssignable", user_assignable),
+            "id": tag_id,
+            "name": name,
+            "userVisible": user_visible,
+            "userAssignable": user_assignable,
         }
 
         logger.info(f"Created tag '{name}' with ID {tag_info['id']}")
diff --git a/tests/integration/test_rag_openai.py b/tests/integration/test_rag_openai.py
index 8f56495..1f750fc 100644
--- a/tests/integration/test_rag_openai.py
+++ b/tests/integration/test_rag_openai.py
@@ -42,6 +42,34 @@ logger = logging.getLogger(__name__)
 # Default path to the Nextcloud User Manual PDF
 DEFAULT_MANUAL_PATH = "Nextcloud Manual.pdf"
 
+
+async def llm_judge(
+    provider: "OpenAIProvider",
+    ground_truth: str,
+    system_output: str,
+) -> bool:
+    """Use LLM to judge if system output aligns with ground truth.
+
+    Args:
+        provider: OpenAI provider with generation capability
+        ground_truth: The expected/reference answer
+        system_output: The system's actual output to evaluate
+
+    Returns:
+        True if output aligns with ground truth, False otherwise
+    """
+    prompt = f"""GROUND TRUTH: {ground_truth}
+
+SYSTEM OUTPUT: {system_output}
+
+Does the system output contain the key facts from the ground truth?
+
+Answer: TRUE or FALSE"""
+
+    response = await provider.generate(prompt, max_tokens=10)
+    return "TRUE" in response.upper()
+
+
 # Skip all tests if OpenAI API key not configured
 pytestmark = [
     pytest.mark.integration,
@@ -218,7 +246,7 @@ async def test_openai_embeddings_work(openai_provider: OpenAIProvider):
 
 
 async def test_semantic_search_retrieval(
-    nc_mcp_client, ground_truth_qa, indexed_manual_pdf
+    nc_mcp_client, ground_truth_qa, indexed_manual_pdf, openai_generation_provider
 ):
     """Test that semantic search retrieves relevant documents from the manual.
 
@@ -228,7 +256,6 @@ async def test_semantic_search_retrieval(
     # Use first query from ground truth
     test_case = ground_truth_qa[0]  # 2FA question
     query = test_case["query"]
-    expected_topics = test_case["expected_topics"]
 
     # Perform semantic search via MCP tool
     result = await nc_mcp_client.call_tool(
@@ -248,16 +275,21 @@ async def test_semantic_search_retrieval(
     assert data["total_found"] > 0, f"No results for query: {query}"
     assert len(data["results"]) > 0
 
-    # Check that at least one result contains expected topic keywords
-    all_excerpts = " ".join([r["excerpt"].lower() for r in data["results"]])
-    topic_found = any(topic.lower() in all_excerpts for topic in expected_topics)
-    assert topic_found, (
-        f"Expected topics {expected_topics} not found in results for query: {query}"
+    # Use LLM judge to evaluate if excerpts are relevant to ground truth
+    all_excerpts = " ".join([r["excerpt"] for r in data["results"]])
+    is_relevant = await llm_judge(
+        openai_generation_provider,
+        test_case["ground_truth"],
+        all_excerpts,
     )
+    assert is_relevant, f"LLM judge: excerpts not relevant to query: {query}"
 
 
 async def test_semantic_search_answer_with_sampling(
-    nc_mcp_client_with_sampling, ground_truth_qa, indexed_manual_pdf
+    nc_mcp_client_with_sampling,
+    ground_truth_qa,
+    indexed_manual_pdf,
+    openai_generation_provider,
 ):
     """Test semantic search with MCP sampling for answer generation.
 
@@ -314,12 +346,13 @@ async def test_semantic_search_answer_with_sampling(
         assert data["generated_answer"] is not None
         assert len(data["generated_answer"]) > 50  # Non-trivial answer
 
-        # Check answer contains relevant content
-        answer_lower = data["generated_answer"].lower()
-        assert any(
-            keyword in answer_lower
-            for keyword in ["two-factor", "2fa", "authentication", "password"]
-        ), f"Answer doesn't seem relevant to query: {data['generated_answer'][:200]}"
+        # Use LLM judge to evaluate answer relevance
+        is_relevant = await llm_judge(
+            openai_generation_provider,
+            test_case["ground_truth"],
+            data["generated_answer"],
+        )
+        assert is_relevant, f"LLM judge: answer not relevant to query: {query}"
 
 
 @pytest.mark.parametrize(