From ec2c274cd93ca4a8184d502716ec92588d3eb390 Mon Sep 17 00:00:00 2001 From: Chris Coutinho Date: Thu, 20 Nov 2025 15:36:22 +0100 Subject: [PATCH] fix: Increase placeholder staleness threshold to 5x scan interval - Changed from 2x (120s) to 5x (300s) scan interval - Large PDFs take 3-4 minutes to process, need longer threshold - Prevents premature requeuing of in-flight documents --- .../observability/logging_config.py | 19 +++++++++++-------- nextcloud_mcp_server/vector/scanner.py | 10 ++++++---- 2 files changed, 17 insertions(+), 12 deletions(-) diff --git a/nextcloud_mcp_server/observability/logging_config.py b/nextcloud_mcp_server/observability/logging_config.py index 15e43fb..0af7dfa 100644 --- a/nextcloud_mcp_server/observability/logging_config.py +++ b/nextcloud_mcp_server/observability/logging_config.py @@ -37,14 +37,17 @@ class HealthCheckFilter(logging.Filter): """ # Check if the log message contains health check endpoints message = record.getMessage() - return not any( - endpoint in message - for endpoint in [ - "/health/live", - "/health/ready", - "/metrics", - "/app/vector-sync/status", - ] + return ( + not any( + endpoint in message + for endpoint in [ + "/health/live", + "/health/ready", + "/metrics", + "/app/vector-sync/status", + ] + ) + or "OpenCV not installed. Disabling OCR" in message ) diff --git a/nextcloud_mcp_server/vector/scanner.py b/nextcloud_mcp_server/vector/scanner.py index d0e7607..afc4e32 100644 --- a/nextcloud_mcp_server/vector/scanner.py +++ b/nextcloud_mcp_server/vector/scanner.py @@ -266,10 +266,11 @@ async def scan_user_documents( needs_indexing = True elif existing_metadata.get("is_placeholder", False): # Placeholder exists - check if it's stale (processing may have failed) - # Only requeue if placeholder is older than 2x scan interval + # Only requeue if placeholder is older than 5x scan interval + # (Large PDFs can take 3-4 minutes to process) queued_at = existing_metadata.get("queued_at", 0) placeholder_age = time.time() - queued_at - stale_threshold = get_settings().vector_sync_scan_interval * 2 + stale_threshold = get_settings().vector_sync_scan_interval * 5 if placeholder_age > stale_threshold: logger.debug( f"Found stale placeholder for note {doc_id} " @@ -460,10 +461,11 @@ async def scan_user_documents( needs_indexing = True elif existing_metadata.get("is_placeholder", False): # Placeholder exists - check if it's stale (processing may have failed) - # Only requeue if placeholder is older than 2x scan interval + # Only requeue if placeholder is older than 5x scan interval + # (Large PDFs can take 3-4 minutes to process) queued_at = existing_metadata.get("queued_at", 0) placeholder_age = time.time() - queued_at - stale_threshold = get_settings().vector_sync_scan_interval * 2 + stale_threshold = get_settings().vector_sync_scan_interval * 5 if placeholder_age > stale_threshold: logger.debug( f"Found stale placeholder for file {file_path} (ID: {file_id}) "