fix: Set is_placeholder=False in processor to fix search filtering

The processor was not setting the is_placeholder field when writing real
document chunks to Qdrant. As a result, the placeholder filter excluded
every document (since None != False), and searches returned 0 results.

The processor now explicitly sets is_placeholder: False in the payload when
writing real indexed chunks, allowing search filters to correctly distinguish
between placeholders and real documents.
This commit is contained in:
Chris Coutinho
2025-11-20 17:14:13 +01:00
parent 25ef33de7f
commit 5a251a99e6
3 changed files with 52 additions and 11 deletions
@@ -37,19 +37,18 @@ class HealthCheckFilter(logging.Filter):
"""
# Check if the log message contains health check endpoints
message = record.getMessage()
return (
not any(
endpoint in message
for endpoint in [
"/health/live",
"/health/ready",
"/metrics",
"/app/vector-sync/status",
]
)
or "OpenCV not installed. Disabling OCR" in message
health_check = any(
endpoint in message
for endpoint in [
"/health/live",
"/health/ready",
"/metrics",
"/app/vector-sync/status",
]
)
return not health_check
class TraceContextFormatter(JsonFormatter):
"""
+1
View File
@@ -389,6 +389,7 @@ async def _index_document(
"user_id": doc_task.user_id,
"doc_id": doc_task.doc_id,
"doc_type": doc_task.doc_type,
"is_placeholder": False, # Real indexed document (not placeholder)
"title": title,
"excerpt": chunk.text[:200],
"indexed_at": indexed_at,
+41
View File
@@ -0,0 +1,41 @@
import logging
import pathlib
import anyio
import pymupdf
import pymupdf.layout
from nextcloud_mcp_server.client import NextcloudClient
# NOTE(review): presumably this must run before pymupdf4llm is imported,
# which would explain the deliberately late import below — confirm.
pymupdf.layout.activate()
import pymupdf4llm  # noqa: E402
# Shared client created once at import time; presumably configured from
# environment variables (see NextcloudClient.from_env) — confirm.
client = NextcloudClient.from_env()
logger = logging.getLogger(__name__)
# Directory handed to pymupdf4llm as the destination for extracted images.
TMP_DIR = pathlib.Path("/tmp/tmp-images")
# Created at import time, including missing parents; no error if it exists.
TMP_DIR.mkdir(exist_ok=True, parents=True)
async def print_markdown(filename):
    """Read a PDF through the WebDAV client and print it as Markdown.

    Args:
        filename: Path of the file, passed unchanged to
            ``client.webdav.read_file``.

    The Markdown text is written to stdout. The conversion is called with
    ``write_images=True`` and ``image_path=str(TMP_DIR)``.
    """
    content, _ = await client.webdav.read_file(filename)
    # Use the document as a context manager so it is closed even when the
    # Markdown conversion raises; the original never closed it.
    with pymupdf.open("pdf", content) as doc:
        md_text = pymupdf4llm.to_markdown(
            doc, write_images=True, image_path=str(TMP_DIR)
        )
    print(md_text)
async def run1():
    """Print the Markdown conversion of every PDF the WebDAV client finds.

    Files are looked up by the ``application/pdf`` type and processed one
    after another, in the order the lookup returns them.
    """
    pdf_entries = await client.webdav.find_by_type("application/pdf")
    for entry in pdf_entries:
        pdf_path = entry["path"]
        await print_markdown(pdf_path)
async def run():
    """Fetch all tags through the client and print them to stdout."""
    print(await client.tags.get_all_tags())
# Script entry point: configure INFO-level logging, then run the async
# ``run`` coroutine to completion with anyio.
if __name__ == "__main__":
    logging.basicConfig(level="INFO")
    anyio.run(run)