fix: Set is_placeholder=False in processor to fix search filtering
The processor did not set the is_placeholder field when writing real document chunks to Qdrant. Because a missing field reads as None, and None != False, the placeholder filter excluded every document and searches returned 0 results. The processor now explicitly sets is_placeholder: False in the payload of each real indexed chunk, so search filters can correctly distinguish placeholders from real documents.
This commit is contained in:
@@ -37,19 +37,18 @@ class HealthCheckFilter(logging.Filter):
|
|||||||
"""
|
"""
|
||||||
# Check if the log message contains health check endpoints
|
# Check if the log message contains health check endpoints
|
||||||
message = record.getMessage()
|
message = record.getMessage()
|
||||||
return (
|
health_check = any(
|
||||||
not any(
|
endpoint in message
|
||||||
endpoint in message
|
for endpoint in [
|
||||||
for endpoint in [
|
"/health/live",
|
||||||
"/health/live",
|
"/health/ready",
|
||||||
"/health/ready",
|
"/metrics",
|
||||||
"/metrics",
|
"/app/vector-sync/status",
|
||||||
"/app/vector-sync/status",
|
]
|
||||||
]
|
|
||||||
)
|
|
||||||
or "OpenCV not installed. Disabling OCR" in message
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
return not health_check
|
||||||
|
|
||||||
|
|
||||||
class TraceContextFormatter(JsonFormatter):
|
class TraceContextFormatter(JsonFormatter):
|
||||||
"""
|
"""
|
||||||
|
|||||||
@@ -389,6 +389,7 @@ async def _index_document(
|
|||||||
"user_id": doc_task.user_id,
|
"user_id": doc_task.user_id,
|
||||||
"doc_id": doc_task.doc_id,
|
"doc_id": doc_task.doc_id,
|
||||||
"doc_type": doc_task.doc_type,
|
"doc_type": doc_task.doc_type,
|
||||||
|
"is_placeholder": False, # Real indexed document (not placeholder)
|
||||||
"title": title,
|
"title": title,
|
||||||
"excerpt": chunk.text[:200],
|
"excerpt": chunk.text[:200],
|
||||||
"indexed_at": indexed_at,
|
"indexed_at": indexed_at,
|
||||||
|
|||||||
@@ -0,0 +1,41 @@
|
|||||||
|
import logging
|
||||||
|
import pathlib
|
||||||
|
|
||||||
|
import anyio
|
||||||
|
import pymupdf
|
||||||
|
import pymupdf.layout
|
||||||
|
|
||||||
|
from nextcloud_mcp_server.client import NextcloudClient
|
||||||
|
|
||||||
|
pymupdf.layout.activate()
|
||||||
|
import pymupdf4llm # noqa: E402
|
||||||
|
|
||||||
|
# Shared Nextcloud client, built from environment configuration.
client = NextcloudClient.from_env()
logger = logging.getLogger(__name__)

# Directory handed to pymupdf4llm as the image output path; created at import
# time so it exists before any conversion runs.
TMP_DIR = pathlib.Path("/tmp/tmp-images")
TMP_DIR.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
|
|
||||||
|
async def print_markdown(filename):
    """Read a PDF through the WebDAV client and print it as Markdown.

    Args:
        filename: Path passed to ``client.webdav.read_file``.

    The conversion is run with ``write_images=True`` and ``TMP_DIR`` as the
    image output path.
    """
    content, _ = await client.webdav.read_file(filename)
    # NOTE(review): with an in-memory stream, the first positional argument
    # ("pdf") is presumably taken as the file-type hint -- confirm against
    # the PyMuPDF docs.
    # Open the document as a context manager so it is closed even when the
    # Markdown conversion raises; previously it was never closed.
    with pymupdf.open("pdf", content) as doc:
        md_text = pymupdf4llm.to_markdown(
            doc, write_images=True, image_path=str(TMP_DIR)
        )
    print(md_text)
|
||||||
|
|
||||||
|
|
||||||
|
async def run1():
    """Print the Markdown rendering of each file returned for the PDF MIME type."""
    pdf_entries = await client.webdav.find_by_type("application/pdf")
    for entry in pdf_entries:
        await print_markdown(entry["path"])
|
||||||
|
|
||||||
|
|
||||||
|
async def run():
    """Fetch all tags through the client and print them."""
    all_tags = await client.tags.get_all_tags()
    print(all_tags)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Script entry point: configure INFO-level logging, then run the async
    # ``run`` coroutine under anyio.
    logging.basicConfig(level="INFO")
    anyio.run(run)
|
||||||
Reference in New Issue
Block a user