fix: Set is_placeholder=False in processor to fix search filtering

The processor was not setting the is_placeholder field when writing real document chunks to Qdrant. As a result, the placeholder filter excluded every document (a missing field reads as None, and None != False), so searches returned 0 results. The processor now explicitly sets is_placeholder: False in the payload when writing real indexed chunks, allowing search filters to correctly distinguish placeholders from real documents.
2025-11-20 17:14:13 +01:00
parent 25ef33de7f
commit 5a251a99e6
3 changed files with 52 additions and 11 deletions
@@ -37,19 +37,18 @@ class HealthCheckFilter(logging.Filter):
        """
        # Check if the log message contains health check endpoints
        message = record.getMessage()
-        return (
+        health_check = any(
-            not any(
+            endpoint in message
-                endpoint in message
+            for endpoint in [
-                for endpoint in [
+                "/health/live",
-                    "/health/live",
+                "/health/ready",
-                    "/health/ready",
+                "/metrics",
-                    "/metrics",
+                "/app/vector-sync/status",
-                    "/app/vector-sync/status",
+            ]
                ]
            )
            or "OpenCV not installed. Disabling OCR" in message
        )
        return not health_check
 class TraceContextFormatter(JsonFormatter):
    """
@@ -389,6 +389,7 @@ async def _index_document(
                    "user_id": doc_task.user_id,
                    "doc_id": doc_task.doc_id,
                    "doc_type": doc_task.doc_type,
                    "is_placeholder": False,  # Real indexed document (not placeholder)
                    "title": title,
                    "excerpt": chunk.text[:200],
                    "indexed_at": indexed_at,
@@ -0,0 +1,41 @@
 import logging
 import pathlib
 import anyio
 import pymupdf
 import pymupdf.layout
 from nextcloud_mcp_server.client import NextcloudClient
 # NOTE(review): activate() runs before pymupdf4llm is imported, and the E402
 # suppression on the import suggests this ordering is deliberate — confirm
 # against the PyMuPDF layout documentation before reordering.
 pymupdf.layout.activate()
 import pymupdf4llm  # noqa: E402
 # Shared client used by every coroutine below; constructed at import time
 # (presumably configured from environment variables — verify in from_env).
 client = NextcloudClient.from_env()
 logger = logging.getLogger(__name__)
 # Directory handed to pymupdf4llm as image_path; created at import time if
 # it does not already exist.
 TMP_DIR = pathlib.Path("/tmp/tmp-images")
 TMP_DIR.mkdir(exist_ok=True, parents=True)
 async def print_markdown(filename):
    """Read a PDF through the WebDAV client and print it as Markdown.

    Args:
        filename: Path handed to ``client.webdav.read_file``.

    The second element of the ``read_file`` result is ignored. Conversion
    is done by ``pymupdf4llm.to_markdown`` with ``write_images=True`` and
    ``image_path`` set to ``TMP_DIR``.
    """
    content, _ = await client.webdav.read_file(filename)
    # Open the in-memory bytes as a PDF and make sure the document is closed
    # again once conversion finishes, even if to_markdown raises.
    with pymupdf.open("pdf", content) as doc:
        md_text = pymupdf4llm.to_markdown(
            doc, write_images=True, image_path=str(TMP_DIR)
        )
    print(md_text)
 async def run1():
    """Print the Markdown rendering of every PDF the client reports.

    Queries ``client.webdav.find_by_type`` for ``application/pdf`` entries
    and passes each entry's ``"path"`` to ``print_markdown``, one at a time.
    """
    pdf_entries = await client.webdav.find_by_type("application/pdf")
    for entry in pdf_entries:
        pdf_path = entry["path"]
        await print_markdown(pdf_path)
 async def run():
    """Fetch all tags via ``client.tags.get_all_tags`` and print them."""
    print(await client.tags.get_all_tags())
 # Script entry point: configure INFO-level logging, then hand the ``run``
 # coroutine function to anyio to execute.
 if __name__ == "__main__":
    logging.basicConfig(level="INFO")
    anyio.run(run)