fix: Centralize PDF processing and generate separate images per chunk

Previously, pymupdf4llm.to_markdown() was called twice - once in PyMuPDFProcessor during indexing and again in PDFHighlighter during visualization. Different image path lengths caused different character offsets, leading to highlighted pages not matching their chunks. Also fixed issue where all chunks on the same page showed all highlights instead of just their own highlight. Now restores original page contents between chunks using xref stream caching. Changes: - Add PDFHighlighter class requiring pre-computed page_boundaries and full_text from document processor (no fallback extraction) - Pass pre-computed data from processor to highlighter - Extract page-relative portion of chunk text for cross-page chunks - Add bounding box highlighting using text anchor search - Run highlight generation in parallel with embedding/BM25 - Cache and restore page contents to isolate highlights per chunk Results: Highlighting success rate improved from 51% to 95% (121/128). 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-22 02:42:28 +01:00
parent a62a007c87
commit fffe483c02
8 changed files with 1211 additions and 113 deletions
@@ -256,115 +256,265 @@ async def _index_document(
    settings = get_settings()

    # Fetch document content
-    if doc_task.doc_type == "note":
-        document = await nc_client.notes.get_note(int(doc_task.doc_id))
-        content = f"{document['title']}\n\n{document['content']}"
-        title = document["title"]
-        etag = document.get("etag", "")
-        file_metadata = {}  # No file-specific metadata for notes
-        file_path = None  # Notes don't have file paths
-    elif doc_task.doc_type == "file":
-        # For files, doc_id is now the numeric file ID, file_path comes from DocumentTask
-        if not doc_task.file_path:
-            raise ValueError(
-                f"File path required for file indexing but not provided (file_id={doc_task.doc_id})"
-            )
-        file_path = doc_task.file_path
-
-        # Read file content via WebDAV
-        content_bytes, content_type = await nc_client.webdav.read_file(file_path)
-
-        # Use document processor registry to extract text
-        from nextcloud_mcp_server.document_processors import get_registry
-
-        registry = get_registry()
-
-        try:
-            result = await registry.process(
-                content=content_bytes,
-                content_type=content_type,
-                filename=file_path,
-            )
-            content = result.text
-            file_metadata = result.metadata
-            title = file_metadata.get("title") or file_path.split("/")[-1]
-            etag = ""  # WebDAV read_file doesn't return etag
-
-            # Diagnostic: Log page boundary information if available
-            if "page_boundaries" in file_metadata:
-                page_boundaries = file_metadata["page_boundaries"]
-                logger.info(
-                    f"Page boundaries for {file_path}: "
-                    f"{len(page_boundaries)} pages, text length: {len(content)}"
+    with trace_operation(
+        "vector_sync.fetch_content",
+        attributes={
+            "vector_sync.doc_type": doc_task.doc_type,
+            "vector_sync.doc_id": doc_task.doc_id,
+        },
+    ):
+        if doc_task.doc_type == "note":
+            document = await nc_client.notes.get_note(int(doc_task.doc_id))
+            content = f"{document['title']}\n\n{document['content']}"
+            title = document["title"]
+            etag = document.get("etag", "")
+            file_metadata = {}  # No file-specific metadata for notes
+            file_path = None  # Notes don't have file paths
+            content_bytes = None  # Notes don't have binary content
+            content_type = None
+        elif doc_task.doc_type == "file":
+            # For files, doc_id is now the numeric file ID, file_path comes from DocumentTask
+            if not doc_task.file_path:
+                raise ValueError(
+                    f"File path required for file indexing but not provided (file_id={doc_task.doc_id})"
                )
-                # Log first 3 page boundaries for debugging
-                for boundary in page_boundaries[:3]:
-                    logger.debug(
-                        f"  Page {boundary['page']}: "
-                        f"offsets [{boundary['start_offset']}:{boundary['end_offset']}]"
+            file_path = doc_task.file_path
+
+            # Read file content via WebDAV
+            content_bytes, content_type = await nc_client.webdav.read_file(file_path)
+        else:
+            raise ValueError(f"Unsupported doc_type: {doc_task.doc_type}")
+
+    # Process file content (text extraction)
+    if doc_task.doc_type == "file":
+        # Type narrowing: content_bytes and content_type are set for files
+        assert content_bytes is not None
+        assert content_type is not None
+        assert file_path is not None
+
+        with trace_operation(
+            "vector_sync.document_process",
+            attributes={
+                "vector_sync.content_type": content_type,
+                "vector_sync.file_size": len(content_bytes),
+            },
+        ):
+            # Use document processor registry to extract text
+            from nextcloud_mcp_server.document_processors import get_registry
+
+            registry = get_registry()
+
+            try:
+                result = await registry.process(
+                    content=content_bytes,
+                    content_type=content_type,
+                    filename=file_path,
+                )
+                content = result.text
+                file_metadata = result.metadata
+                title = file_metadata.get("title") or file_path.split("/")[-1]
+                etag = ""  # WebDAV read_file doesn't return etag
+
+                # Diagnostic: Log page boundary information if available
+                if "page_boundaries" in file_metadata:
+                    page_boundaries = file_metadata["page_boundaries"]
+                    logger.info(
+                        f"Page boundaries for {file_path}: "
+                        f"{len(page_boundaries)} pages, text length: {len(content)}"
                    )
-                # Verify last boundary matches text length
-                if page_boundaries:
-                    last_boundary = page_boundaries[-1]
-                    if last_boundary["end_offset"] != len(content):
-                        logger.warning(
-                            f"Text length mismatch: content={len(content)}, "
-                            f"last_boundary_end={last_boundary['end_offset']}"
+                    # Log first 3 page boundaries for debugging
+                    for boundary in page_boundaries[:3]:
+                        logger.debug(
+                            f"  Page {boundary['page']}: "
+                            f"offsets [{boundary['start_offset']}:{boundary['end_offset']}]"
                        )
-            else:
-                logger.debug(f"No page_boundaries in metadata for {file_path}")
-        except Exception as e:
-            logger.error(f"Failed to process file {file_path}: {e}")
-            raise
-    else:
-        raise ValueError(f"Unsupported doc_type: {doc_task.doc_type}")
+                    # Verify last boundary matches text length
+                    if page_boundaries:
+                        last_boundary = page_boundaries[-1]
+                        if last_boundary["end_offset"] != len(content):
+                            logger.warning(
+                                f"Text length mismatch: content={len(content)}, "
+                                f"last_boundary_end={last_boundary['end_offset']}"
+                            )
+                else:
+                    logger.debug(f"No page_boundaries in metadata for {file_path}")
+            except Exception as e:
+                logger.error(f"Failed to process file {file_path}: {e}")
+                raise

    # Tokenize and chunk (using configured chunk size and overlap)
-    chunker = DocumentChunker(
-        chunk_size=settings.document_chunk_size,
-        overlap=settings.document_chunk_overlap,
-    )
-    chunks = await chunker.chunk_text(content)
+    with trace_operation(
+        "vector_sync.chunk_text",
+        attributes={
+            "vector_sync.input_chars": len(content),
+            "vector_sync.chunk_size": settings.document_chunk_size,
+            "vector_sync.overlap": settings.document_chunk_overlap,
+        },
+    ):
+        chunker = DocumentChunker(
+            chunk_size=settings.document_chunk_size,
+            overlap=settings.document_chunk_overlap,
+        )
+        chunks = await chunker.chunk_text(content)

    # Assign page numbers to chunks if page boundaries are available (PDFs)
    if doc_task.doc_type == "file" and "page_boundaries" in file_metadata:
-        assign_page_numbers(chunks, file_metadata["page_boundaries"])
+        with trace_operation(
+            "vector_sync.assign_page_numbers",
+            attributes={
+                "vector_sync.chunk_count": len(chunks),
+                "vector_sync.page_count": len(file_metadata["page_boundaries"]),
+            },
+        ):
+            assign_page_numbers(chunks, file_metadata["page_boundaries"])

-        # Diagnostic: Verify page number assignment
-        assigned_count = sum(1 for c in chunks if c.page_number is not None)
-        logger.info(
-            f"Assigned page numbers to {assigned_count}/{len(chunks)} chunks "
-            f"for {file_path}"
-        )
-
-        # Log first 3 chunks to see their page assignments
-        for i, chunk in enumerate(chunks[:3]):
-            logger.debug(
-                f"  Chunk {i}: page={chunk.page_number}, "
-                f"offsets=[{chunk.start_offset}:{chunk.end_offset}]"
+            # Diagnostic: Verify page number assignment
+            assigned_count = sum(1 for c in chunks if c.page_number is not None)
+            logger.info(
+                f"Assigned page numbers to {assigned_count}/{len(chunks)} chunks "
+                f"for {file_path}"
            )

-        # Warning if NO page numbers were assigned
-        if assigned_count == 0:
-            logger.warning(
-                f"NO page numbers assigned! "
-                f"Text length: {len(content)}, "
-                f"Chunks: {len(chunks)}, "
-                f"Chunk offset range: [{chunks[0].start_offset}:{chunks[-1].end_offset}], "
-                f"Page boundaries: {len(file_metadata['page_boundaries'])} pages, "
-                f"First boundary: {file_metadata['page_boundaries'][0] if file_metadata['page_boundaries'] else 'None'}"
-            )
+            # Log first 3 chunks to see their page assignments
+            for i, chunk in enumerate(chunks[:3]):
+                logger.debug(
+                    f"  Chunk {i}: page={chunk.page_number}, "
+                    f"offsets=[{chunk.start_offset}:{chunk.end_offset}]"
+                )
+
+            # Warning if NO page numbers were assigned
+            if assigned_count == 0:
+                logger.warning(
+                    f"NO page numbers assigned! "
+                    f"Text length: {len(content)}, "
+                    f"Chunks: {len(chunks)}, "
+                    f"Chunk offset range: [{chunks[0].start_offset}:{chunks[-1].end_offset}], "
+                    f"Page boundaries: {len(file_metadata['page_boundaries'])} pages, "
+                    f"First boundary: {file_metadata['page_boundaries'][0] if file_metadata['page_boundaries'] else 'None'}"
+                )

    # Extract chunk texts for embedding
    chunk_texts = [chunk.text for chunk in chunks]

-    # Generate dense embeddings (I/O bound - external API call)
-    embedding_service = get_embedding_service()
-    dense_embeddings = await embedding_service.embed_batch(chunk_texts)
+    # Initialize results containers
+    dense_embeddings: list = []
+    sparse_embeddings: list = []
+    chunk_images: dict[int, dict] = {}

-    # Generate sparse embeddings (BM25 for keyword matching)
-    bm25_service = get_bm25_service()
-    sparse_embeddings = await bm25_service.encode_batch(chunk_texts)
+    # Determine if we need PDF highlighting
+    is_pdf = doc_task.doc_type == "file" and content_type == "application/pdf"
+
+    # Define async tasks for parallel execution
+    async def generate_dense_embeddings():
+        """Generate dense embeddings (I/O bound - external API call)."""
+        nonlocal dense_embeddings
+        with trace_operation(
+            "vector_sync.embed_dense",
+            attributes={
+                "vector_sync.chunk_count": len(chunk_texts),
+                "vector_sync.total_chars": sum(len(t) for t in chunk_texts),
+            },
+        ):
+            embedding_service = get_embedding_service()
+            dense_embeddings = await embedding_service.embed_batch(chunk_texts)
+
+    async def generate_sparse_embeddings():
+        """Generate sparse embeddings (BM25 for keyword matching)."""
+        nonlocal sparse_embeddings
+        with trace_operation(
+            "vector_sync.embed_sparse",
+            attributes={
+                "vector_sync.chunk_count": len(chunk_texts),
+            },
+        ):
+            bm25_service = get_bm25_service()
+            sparse_embeddings = await bm25_service.encode_batch(chunk_texts)
+
+    async def generate_highlights():
+        """Generate highlighted page images for PDF chunks (CPU-bound)."""
+        nonlocal chunk_images
+        if not is_pdf:
+            return
+
+        # Type narrowing: content_bytes is set for PDF files
+        assert content_bytes is not None
+
+        with trace_operation(
+            "vector_sync.generate_highlights",
+            attributes={
+                "vector_sync.chunk_count": len(chunks),
+                "vector_sync.pdf_size": len(content_bytes),
+            },
+        ):
+            import base64
+
+            from nextcloud_mcp_server.search.pdf_highlighter import PDFHighlighter
+
+            # Build chunk data for batch processing
+            # Format: (chunk_index, start_offset, end_offset, page_number, chunk_text)
+            chunk_data: list[tuple[int, int, int, int | None, str]] = [
+                (i, chunk.start_offset, chunk.end_offset, chunk.page_number, chunk.text)
+                for i, chunk in enumerate(chunks)
+                if chunk.page_number is not None
+            ]
+
+            # Get pre-computed page boundaries from document processor
+            page_boundaries = file_metadata.get("page_boundaries")
+            if not page_boundaries:
+                logger.warning("No page boundaries available, skipping highlighting")
+                return
+
+            logger.info(
+                f"Batch generating highlighted page images for {len(chunk_data)} PDF chunks"
+            )
+
+            # Run CPU-bound highlighting in thread pool
+            # Pass pre-computed page boundaries and full text to avoid re-processing the PDF
+            batch_results = await anyio.to_thread.run_sync(  # type: ignore[attr-defined]
+                lambda: PDFHighlighter.highlight_chunks_batch(
+                    pdf_bytes=content_bytes,
+                    chunks=chunk_data,
+                    page_boundaries=page_boundaries,
+                    full_text=content,
+                    color="yellow",
+                    zoom=2.0,
+                )
+            )
+
+            # Convert results to storage format
+            for chunk_index, (
+                png_bytes,
+                actual_page_num,
+                highlight_count,
+            ) in batch_results.items():
+                image_base64 = base64.b64encode(png_bytes).decode("utf-8")
+                chunk_images[chunk_index] = {
+                    "image": image_base64,
+                    "page": actual_page_num,
+                    "highlights": highlight_count,
+                    "size": len(png_bytes),
+                }
+
+            logger.info(
+                f"Generated {len(chunk_images)}/{len(chunks)} highlighted page images "
+                f"(avg {sum(img['size'] for img in chunk_images.values()) // max(len(chunk_images), 1):,} bytes)"
+            )
+
+    # Run all embedding/highlighting operations in parallel
+    # - Dense embeddings: I/O bound (API call)
+    # - Sparse embeddings: CPU bound (local BM25)
+    # - Highlighting: CPU bound (PyMuPDF rendering, runs in thread pool)
+    with trace_operation(
+        "vector_sync.parallel_processing",
+        attributes={
+            "vector_sync.is_pdf": is_pdf,
+            "vector_sync.chunk_count": len(chunks),
+        },
+    ):
+        async with anyio.create_task_group() as tg:
+            tg.start_soon(generate_dense_embeddings)
+            tg.start_soon(generate_sparse_embeddings)
+            tg.start_soon(generate_highlights)

    # Prepare Qdrant points
    indexed_at = int(time.time())
@@ -416,6 +566,16 @@ async def _index_document(
                        if doc_task.doc_type == "file"
                        else {}
                    ),
+                    # Highlighted page image (PDF only)
+                    **(
+                        {
+                            "highlighted_page_image": chunk_images[i]["image"],
+                            "highlighted_page_number": chunk_images[i]["page"],
+                            "highlight_count": chunk_images[i]["highlights"],
+                        }
+                        if i in chunk_images
+                        else {}
+                    ),
                },
            )
        )
@@ -434,12 +594,30 @@ async def _index_document(
            f"Failed to delete placeholder for {doc_task.doc_type}_{doc_task.doc_id}: {e}"
        )

-    # Upsert to Qdrant
-    await qdrant_client.upsert(
-        collection_name=settings.get_collection_name(),
-        points=points,
-        wait=True,
-    )
+    # Upsert to Qdrant in batches to avoid timeout with large payloads
+    # Each batch is limited to avoid WriteTimeout when sending large image payloads
+    BATCH_SIZE = 10  # ~2MB per batch with images
+    with trace_operation(
+        "vector_sync.qdrant_upsert",
+        attributes={
+            "vector_sync.point_count": len(points),
+            "vector_sync.collection": settings.get_collection_name(),
+            "vector_sync.images_count": len(chunk_images),
+            "vector_sync.batch_size": BATCH_SIZE,
+        },
+    ):
+        for batch_start in range(0, len(points), BATCH_SIZE):
+            batch_end = min(batch_start + BATCH_SIZE, len(points))
+            batch = points[batch_start:batch_end]
+            await qdrant_client.upsert(
+                collection_name=settings.get_collection_name(),
+                points=batch,
+                wait=True,
+            )
+            if batch_end < len(points):
+                logger.debug(
+                    f"Upserted batch {batch_start // BATCH_SIZE + 1}/{(len(points) + BATCH_SIZE - 1) // BATCH_SIZE}"
+                )

    logger.info(
        f"Indexed {doc_task.doc_type}_{doc_task.doc_id} for {doc_task.user_id} "