fix: Set is_placeholder=False in processor to fix search filtering

The processor was not setting the is_placeholder field when writing real
document chunks to Qdrant. This caused the placeholder filter to exclude
all documents (since None != False), resulting in 0 search results.

The processor now explicitly sets is_placeholder: False in the payload when
writing real indexed chunks, allowing search filters to correctly distinguish
between placeholders and real documents.
This commit is contained in:
Chris Coutinho
2025-11-20 17:14:13 +01:00
parent 25ef33de7f
commit 5a251a99e6
3 changed files with 52 additions and 11 deletions
@@ -37,19 +37,18 @@ class HealthCheckFilter(logging.Filter):
""" """
# Check if the log message contains health check endpoints # Check if the log message contains health check endpoints
message = record.getMessage() message = record.getMessage()
return ( health_check = any(
not any( endpoint in message
endpoint in message for endpoint in [
for endpoint in [ "/health/live",
"/health/live", "/health/ready",
"/health/ready", "/metrics",
"/metrics", "/app/vector-sync/status",
"/app/vector-sync/status", ]
]
)
or "OpenCV not installed. Disabling OCR" in message
) )
return not health_check
class TraceContextFormatter(JsonFormatter): class TraceContextFormatter(JsonFormatter):
""" """
+1
View File
@@ -389,6 +389,7 @@ async def _index_document(
"user_id": doc_task.user_id, "user_id": doc_task.user_id,
"doc_id": doc_task.doc_id, "doc_id": doc_task.doc_id,
"doc_type": doc_task.doc_type, "doc_type": doc_task.doc_type,
"is_placeholder": False, # Real indexed document (not placeholder)
"title": title, "title": title,
"excerpt": chunk.text[:200], "excerpt": chunk.text[:200],
"indexed_at": indexed_at, "indexed_at": indexed_at,
+41
View File
@@ -0,0 +1,41 @@
import logging
import pathlib
import anyio
import pymupdf
import pymupdf.layout
from nextcloud_mcp_server.client import NextcloudClient
# Enable PyMuPDF's layout module. NOTE(review): presumably this has to run
# before pymupdf4llm is imported, which is why that import sits below the
# call and carries a noqa for E402 — confirm against the PyMuPDF docs.
pymupdf.layout.activate()
import pymupdf4llm  # noqa: E402
# Shared client used by every coroutine in this script.
# NOTE(review): from_env() presumably reads connection settings from
# environment variables — confirm.
client = NextcloudClient.from_env()
logger = logging.getLogger(__name__)
# Directory handed to pymupdf4llm as image_path; created at import time
# (including parents) if it does not exist yet.
TMP_DIR = pathlib.Path("/tmp/tmp-images")
TMP_DIR.mkdir(exist_ok=True, parents=True)
async def print_markdown(filename):
    """Fetch a PDF over WebDAV and print its Markdown rendering to stdout.

    Args:
        filename: Path of the PDF, passed unchanged to
            ``client.webdav.read_file``.

    The conversion is asked to write extracted images (``write_images=True``)
    with ``TMP_DIR`` as the image path.
    """
    # read_file returns a pair; only the first element (the content) is used.
    content, _ = await client.webdav.read_file(filename)
    # Open the document from the in-memory content. The context manager
    # closes it even if the conversion raises, so the document is not leaked.
    with pymupdf.open(stream=content, filetype="pdf") as doc:
        md_text = pymupdf4llm.to_markdown(
            doc, write_images=True, image_path=str(TMP_DIR)
        )
    print(md_text)
async def run1():
    """Print the Markdown rendering of every PDF the WebDAV search returns."""
    pdf_entries = await client.webdav.find_by_type("application/pdf")
    for entry in pdf_entries:
        # Each entry is indexed by "path" to obtain the file to convert.
        await print_markdown(entry["path"])
async def run():
    """Fetch all tags through the client and print them to stdout."""
    print(await client.tags.get_all_tags())
if __name__ == "__main__":
    # Script entry point: configure INFO-level logging first, then hand the
    # `run` coroutine function to anyio to execute.
    logging.basicConfig(level="INFO")
    anyio.run(run)