fix: Set is_placeholder=False in processor to fix search filtering

The processor was not setting the is_placeholder field when writing real document chunks to Qdrant. As a result, the placeholder filter excluded every document (the missing field reads as None, and None != False), so searches returned 0 results. The processor now explicitly sets is_placeholder: False in the payload of each real indexed chunk, allowing search filters to correctly distinguish placeholders from real documents.
2025-11-20 17:14:13 +01:00
parent 25ef33de7f
commit 5a251a99e6
3 changed files with 52 additions and 11 deletions
@@ -37,19 +37,18 @@ class HealthCheckFilter(logging.Filter):
        """
        # Check if the log message contains health check endpoints
        message = record.getMessage()
-        return (
-            not any(
-                endpoint in message
-                for endpoint in [
-                    "/health/live",
-                    "/health/ready",
-                    "/metrics",
-                    "/app/vector-sync/status",
-                ]
-            )
-            or "OpenCV not installed. Disabling OCR" in message
+        health_check = any(
+            endpoint in message
+            for endpoint in [
+                "/health/live",
+                "/health/ready",
+                "/metrics",
+                "/app/vector-sync/status",
+            ]
        )

+        return not health_check
+

 class TraceContextFormatter(JsonFormatter):
    """
@@ -389,6 +389,7 @@ async def _index_document(
                    "user_id": doc_task.user_id,
                    "doc_id": doc_task.doc_id,
                    "doc_type": doc_task.doc_type,
+                    "is_placeholder": False,  # Real indexed document (not placeholder)
                    "title": title,
                    "excerpt": chunk.text[:200],
                    "indexed_at": indexed_at,
@@ -0,0 +1,41 @@
+import logging
+import pathlib
+
+import anyio
+import pymupdf
+import pymupdf.layout
+
+from nextcloud_mcp_server.client import NextcloudClient
+
+pymupdf.layout.activate()
+import pymupdf4llm  # noqa: E402
+
+client = NextcloudClient.from_env()
+logger = logging.getLogger(__name__)
+
+TMP_DIR = pathlib.Path("/tmp/tmp-images")
+TMP_DIR.mkdir(exist_ok=True, parents=True)
+
+
+async def print_markdown(filename):
+    content, _ = await client.webdav.read_file(filename)
+    doc = pymupdf.open("pdf", content)
+    md_text = pymupdf4llm.to_markdown(doc, write_images=True, image_path=str(TMP_DIR))
+    print(md_text)
+
+
+async def run1():
+    response = await client.webdav.find_by_type("application/pdf")
+    # print(response)
+    for file in response:
+        await print_markdown(file["path"])
+
+
+async def run():
+    tags = await client.tags.get_all_tags()
+    print(tags)
+
+
+if __name__ == "__main__":
+    logging.basicConfig(level="INFO")
+    anyio.run(run)