nextcloud-mcp-server/nextcloud_mcp_server/search/pdf_highlighter.py

"""PDF chunk highlighting utilities for vector visualization.

This module provides utilities to generate highlighted page images showing
matched chunks and their context from semantic search results.

The highlighting uses character offsets to precisely locate chunks within
PDF documents, ensuring accurate highlighting even when text formatting
varies between indexing and rendering.
"""

import logging
import re
from typing import Optional

import pymupdf
import pymupdf4llm

logger = logging.getLogger(__name__)


class PDFHighlighter:
    """Generate highlighted page images from PDF chunks."""

    # Named highlight colors as [R, G, B] component lists, each component in
    # the 0-1 range. The highlighting methods of this class look a color up by
    # name and fall back to "yellow" when the requested name is not a key here.
    COLORS = {
        "yellow": [1, 1, 0],
        "red": [1, 0, 0],
        "green": [0, 1, 0],
        "blue": [0, 0, 1],
        "orange": [1, 0.5, 0],
        "pink": [1, 0, 1],
        "gray": [0.7, 0.7, 0.7],
        "light_blue": [0.7, 0.9, 1.0],
        "light_green": [0.7, 1.0, 0.7],
    }

    @staticmethod
    def strip_markdown(text: str) -> str:
        """Remove markdown formatting to improve search accuracy.

        Args:
            text: Text with potential markdown formatting

        Returns:
            Plain text with markdown removed
        """
        # Remove bold/italic markers
        text = re.sub(r"\*\*(.+?)\*\*", r"\1", text)
        text = re.sub(r"\*(.+?)\*", r"\1", text)
        text = re.sub(r"__(.+?)__", r"\1", text)
        text = re.sub(r"_(.+?)_", r"\1", text)

        # Remove headers
        text = re.sub(r"^#+\s+", "", text, flags=re.MULTILINE)

        # Remove inline code
        text = re.sub(r"`(.+?)`", r"\1", text)

        return text.strip()

    @staticmethod
    def extract_pdf_text_with_boundaries(
        pdf_doc: pymupdf.Document,
    ) -> tuple[str, list[dict]]:
        """Extract full document text with page boundary tracking.

        Uses pymupdf4llm.to_markdown() for consistency with indexing.

        IMPORTANT: Must use write_images=True to match PyMuPDFProcessor behavior!
        Even though we don't need the images, we need the image references in the
        markdown text to maintain consistent character offsets with indexing.

        Args:
            pdf_doc: Open PyMuPDF document

        Returns:
            Tuple of (full_text, page_boundaries) where page_boundaries is a list of:
            {"page": 1, "start_offset": 0, "end_offset": 1234}
        """
        import tempfile
        from pathlib import Path

        page_boundaries = []
        text_parts = []
        current_offset = 0

        # Use temp directory for image output (images are discarded after extraction)
        temp_dir = Path(tempfile.mkdtemp(prefix="pdf_highlight_"))

        for page_idx in range(pdf_doc.page_count):
            page_md = pymupdf4llm.to_markdown(
                pdf_doc,
                pages=[page_idx],
                write_images=True,  # Must match indexing! Otherwise offsets misalign
                image_path=temp_dir,
                page_chunks=False,
            )

            page_boundaries.append(
                {
                    "page": page_idx + 1,  # 1-indexed
                    "start_offset": current_offset,
                    "end_offset": current_offset + len(page_md),
                }
            )

            text_parts.append(page_md)
            current_offset += len(page_md)

        full_text = "".join(text_parts)

        # Clean up temp directory and extracted images
        import shutil

        try:
            shutil.rmtree(temp_dir)
        except Exception as e:
            logger.warning(f"Failed to clean up temp directory {temp_dir}: {e}")

        return full_text, page_boundaries

    @staticmethod
    def find_chunk_page(
        chunk_start_offset: int,
        chunk_end_offset: int,
        page_boundaries: list[dict],
    ) -> Optional[dict]:
        """Find which page contains the most of a given chunk.

        Args:
            chunk_start_offset: Chunk start position in full document
            chunk_end_offset: Chunk end position in full document
            page_boundaries: Page boundary list from extract_pdf_text_with_boundaries()

        Returns:
            Dict with keys: page_num, overlap_chars, page_relative_start, page_relative_end
            or None if chunk not found on any page
        """
        chunk_pages = []

        for boundary in page_boundaries:
            page_start = boundary["start_offset"]
            page_end = boundary["end_offset"]

            # Check if chunk overlaps with this page
            if chunk_start_offset < page_end and chunk_end_offset > page_start:
                overlap_start = max(chunk_start_offset, page_start)
                overlap_end = min(chunk_end_offset, page_end)
                overlap_chars = overlap_end - overlap_start

                chunk_pages.append(
                    {
                        "page_num": boundary["page"],
                        "overlap_chars": overlap_chars,
                        "page_relative_start": overlap_start - page_start,
                        "page_relative_end": overlap_end - page_start,
                    }
                )

        if not chunk_pages:
            return None

        # Return page with maximum overlap
        return max(chunk_pages, key=lambda p: p["overlap_chars"])

    @staticmethod
    def highlight_chunk_by_word_positions(
        page: pymupdf.Page,
        chunk_text: str,
        color: str = "yellow",
        search_region: tuple[float, float, float, float] | None = None,
    ) -> int:
        """Highlight chunk using word-position matching.

        This method matches words from the chunk to their positions on the PDF page,
        avoiding text search mismatches between markdown-formatted text and raw PDF text.

        Args:
            page: PyMuPDF page object
            chunk_text: Text to highlight (may contain markdown)
            color: Color name from COLORS dict
            search_region: Optional (x0, y0, x1, y1) bounding box to constrain search.
                          If provided, only words within this region are considered.

        Returns:
            Number of highlight rectangles added
        """
        # Tokenize chunk into words (alphanumeric only, lowercase)
        chunk_words = re.findall(
            r"\w+", PDFHighlighter.strip_markdown(chunk_text).lower()
        )

        if not chunk_words:
            logger.warning("No words found in chunk text")
            return 0

        # Get all words from page with positions
        # Format: (x0, y0, x1, y1, "word", block_no, line_no, word_no)
        try:
            page_words = page.get_text("words")
        except Exception as e:
            logger.error(f"Failed to extract words from page: {e}")
            return 0

        if not page_words:
            logger.warning("No words found on page")
            return 0

        # Filter words by search region if provided
        if search_region:
            rx0, ry0, rx1, ry1 = search_region
            # Allow some tolerance (10 points) for words near region boundary
            tolerance = 10
            page_words = [
                w
                for w in page_words
                if (
                    w[0] >= rx0 - tolerance
                    and w[2] <= rx1 + tolerance
                    and w[1] >= ry0 - tolerance
                    and w[3] <= ry1 + tolerance
                )
            ]
            logger.debug(
                f"Filtered to {len(page_words)} words in region "
                f"({rx0:.0f}, {ry0:.0f}, {rx1:.0f}, {ry1:.0f})"
            )

        if not page_words:
            logger.warning("No words found in search region")
            return 0

        # Find matching word sequence - use FIRST match, not longest
        # This ensures we highlight the actual chunk location, not similar text elsewhere
        matches = []

        # Build a simple word-to-positions index for the first few chunk words
        # to find candidate starting positions
        first_chunk_word = chunk_words[0] if chunk_words else ""
        candidate_starts = []

        for i, pw in enumerate(page_words):
            page_word = pw[4].lower()
            # Check if this could be the start of the chunk
            if (
                first_chunk_word == page_word
                or first_chunk_word in page_word
                or page_word in first_chunk_word
            ):
                candidate_starts.append(i)

        # Try each candidate start position and take the FIRST good match
        for start_pos in candidate_starts:
            current_matches = []
            chunk_idx = 0
            skip_count = 0
            max_skips = 3  # Allow some formatting differences

            for page_idx in range(start_pos, len(page_words)):
                if chunk_idx >= len(chunk_words):
                    break

                page_word = page_words[page_idx][4].lower()
                chunk_word = chunk_words[chunk_idx]

                # Check for match (allow partial matches for flexibility)
                if (
                    chunk_word == page_word
                    or chunk_word in page_word
                    or page_word in chunk_word
                ):
                    current_matches.append(page_words[page_idx])
                    chunk_idx += 1
                    skip_count = 0
                elif skip_count < max_skips:
                    # Allow skipping some words (formatting, punctuation)
                    skip_count += 1
                    continue
                else:
                    break

            # Accept if we matched at least 50% of chunk words
            if len(current_matches) >= len(chunk_words) * 0.5:
                matches = current_matches
                logger.debug(
                    f"Found match at position {start_pos}: "
                    f"{len(matches)}/{len(chunk_words)} words"
                )
                break  # Take FIRST match, not best/longest

        if not matches:
            logger.debug(f"No word matches found (chunk has {len(chunk_words)} words)")
            return 0

        logger.debug(
            f"Matched {len(matches)} words out of {len(chunk_words)} chunk words"
        )

        # Build rectangles from matched words
        rects = [pymupdf.Rect(w[0], w[1], w[2], w[3]) for w in matches]

        # Check if matches are contiguous (not scattered across the page)
        # Scattered matches indicate false positives from common words
        if len(rects) > 1:
            # Sort by vertical position then horizontal
            sorted_matches = sorted(matches, key=lambda w: (round(w[1]), w[0]))

            # Check for large vertical gaps (more than ~2 lines apart)
            # A typical line height is 12-20 points
            max_line_gap = 50  # Points - allows for ~2-3 lines gap
            prev_y = sorted_matches[0][1]
            large_gaps = 0

            for match in sorted_matches[1:]:
                y_gap = match[1] - prev_y
                if y_gap > max_line_gap:
                    large_gaps += 1
                prev_y = match[1]

            # If matches are scattered (many large gaps), reject this match
            # A chunk should be mostly contiguous text
            if large_gaps > len(matches) * 0.3:  # More than 30% have gaps
                logger.debug(
                    f"Rejecting scattered matches: {large_gaps} large gaps "
                    f"out of {len(matches)} matches"
                )
                return 0

        # Merge adjacent rectangles on the same line for cleaner highlighting
        merged_rects = []
        sorted_rects = sorted(rects, key=lambda r: (round(r.y0), r.x0))

        current_rect = None
        for rect in sorted_rects:
            if current_rect is None:
                current_rect = rect
            elif abs(rect.y0 - current_rect.y0) < 5:  # Same line (within 5 points)
                current_rect = current_rect | rect  # Union
            else:
                merged_rects.append(current_rect)
                current_rect = rect

        if current_rect:
            merged_rects.append(current_rect)

        # Add highlights
        rgb = PDFHighlighter.COLORS.get(color, PDFHighlighter.COLORS["yellow"])
        for rect in merged_rects:
            highlight = page.add_highlight_annot(rect)
            highlight.set_colors({"stroke": rgb})
            highlight.set_info(
                content="Chunk from semantic search",
                title="PDF Highlighter (word-position)",
            )
            highlight.update()

        return len(merged_rects)

    @staticmethod
    def find_unique_phrase(
        text: str, min_len: int = 30, max_len: int = 80
    ) -> str | None:
        """Find a relatively unique phrase from text for location search.

        Looks for phrases that are likely to be unique on the page:
        - Prefers phrases with numbers or special terms
        - Avoids very common words

        Args:
            text: Source text to extract phrase from
            min_len: Minimum phrase length
            max_len: Maximum phrase length

        Returns:
            A phrase likely to be unique, or None if not found
        """
        clean_text = PDFHighlighter.strip_markdown(text).strip()
        if not clean_text:
            return None

        # Try first sentence (often unique due to context)
        sentences = re.split(r"[.!?]\s+", clean_text)
        for sentence in sentences:
            sentence = sentence.strip()
            if min_len <= len(sentence) <= max_len:
                return sentence
            elif len(sentence) > max_len:
                return sentence[:max_len]

        # Fallback: first N chars
        if len(clean_text) >= min_len:
            return clean_text[:max_len]

        return clean_text if clean_text else None

    @staticmethod
    def highlight_chunk_on_page(
        page: pymupdf.Page,
        chunk_text: str,
        color: str = "yellow",
        page_relative_start: int | None = None,
        page_relative_end: int | None = None,
        page_text_length: int | None = None,
    ) -> int:
        """Add bounding box highlight to a PDF page for the given chunk text.

        Uses text search to find the chunk's location on the page, then draws
        a bounding box around that region. Falls back to character offset estimation
        if text search fails.

        Args:
            page: PyMuPDF page object
            chunk_text: Text to highlight (may contain markdown)
            color: Color name from COLORS dict
            page_relative_start: Character offset where chunk starts on page (optional)
            page_relative_end: Character offset where chunk ends on page (optional)
            page_text_length: Total character length of page text (optional)

        Returns:
            Number of highlights added (1 for bounding box, 0 if failed)
        """
        page_rect = page.rect
        rgb = PDFHighlighter.COLORS.get(color, PDFHighlighter.COLORS["yellow"])

        # Strip markdown for searching
        search_text = PDFHighlighter.strip_markdown(chunk_text)

        # Try to find chunk location using text search
        # Search for progressively shorter phrases until we find a match
        anchor_rect = None
        search_phrases = []

        # Build search phrases from chunk text
        sentences = re.split(r"[.!?]\s+", search_text)
        for sentence in sentences[:3]:  # Try first 3 sentences
            sentence = sentence.strip()
            if len(sentence) >= 20:
                search_phrases.append(sentence[:80])
                if len(sentence) >= 40:
                    search_phrases.append(sentence[:40])

        # Also try first N characters
        if len(search_text) >= 30:
            search_phrases.append(search_text[:60])
            search_phrases.append(search_text[:30])

        for phrase in search_phrases:
            if not phrase:
                continue
            rects = page.search_for(phrase.strip())
            if rects:
                anchor_rect = rects[0]  # Use first match
                logger.debug(f"Found chunk anchor using phrase: '{phrase[:30]}...'")
                break

        if not anchor_rect:
            page_num = page.number + 1 if page.number is not None else "unknown"
            logger.warning(f"Could not find chunk text on page {page_num}")
            return 0

        # Calculate chunk height based on character count
        # Estimate ~15 chars per line, ~12pt line height
        chunk_chars = len(search_text)
        estimated_lines = max(1, chunk_chars / 60)  # ~60 chars per line typical
        estimated_height = estimated_lines * 14  # ~14pt per line

        # Build bounding box starting from anchor
        chunk_rect = pymupdf.Rect(
            page_rect.x0 + 30,  # Left margin
            anchor_rect.y0 - 5,  # Start slightly above anchor
            page_rect.x1 - 30,  # Right margin
            min(
                anchor_rect.y0 + estimated_height + 10, page_rect.y1 - 30
            ),  # Estimated bottom
        )

        # Draw a visible rectangle around the chunk region
        shape = page.new_shape()
        shape.draw_rect(chunk_rect)
        shape.finish(
            color=rgb,  # Border color
            fill=None,  # No fill (transparent)
            width=2.5,  # Border width
            dashes="[4 2]",  # Dashed line
        )
        shape.commit()

        # Add semi-transparent fill for visibility
        fill_shape = page.new_shape()
        fill_shape.draw_rect(chunk_rect)
        fill_shape.finish(
            color=None,  # No border
            fill=[1, 1, 0.7],  # Light yellow fill
            fill_opacity=0.15,  # Very transparent
        )
        fill_shape.commit()

        logger.debug(
            f"Added bounding box at y={chunk_rect.y0:.0f}-{chunk_rect.y1:.0f} "
            f"(estimated {estimated_lines:.1f} lines)"
        )

        return 1

    @staticmethod
    def highlight_chunk(
        pdf_bytes: bytes,
        chunk_start_offset: int,
        chunk_end_offset: int,
        stored_page_number: Optional[int] = None,
        color: str = "yellow",
        zoom: float = 2.0,
    ) -> Optional[tuple[bytes, int, int]]:
        """Generate PNG image of PDF page with highlighted chunk.

        This is the main entry point for highlighting. It:
        1. Extracts document text with page boundaries
        2. Finds which page contains the chunk
        3. Extracts chunk text using character offsets
        4. Highlights the chunk on the page
        5. Renders page to PNG

        The PDF is written to a temporary directory for processing. The
        document handle is closed and the directory removed on every exit
        path, including failures. All exceptions are caught, logged and
        reported as a None result.

        Args:
            pdf_bytes: PDF file bytes
            chunk_start_offset: Chunk start position (document-level)
            chunk_end_offset: Chunk end position (document-level)
            stored_page_number: Page number from metadata (optional; only used
                to log a note when it differs from the computed page)
            color: Highlight color name
            zoom: Rendering zoom factor (2.0 = 144 DPI)

        Returns:
            Tuple of (png_bytes, page_number, highlight_count) or None if failed
        """
        import shutil
        import tempfile
        from pathlib import Path

        temp_dir = None
        doc = None
        try:
            # Write PDF to temp file with consistent name "pdf.pdf"
            # This ensures image references match indexing (e.g., pdf-0001.png)
            # Different temp filenames would cause different markdown text lengths!
            temp_dir = Path(tempfile.mkdtemp(prefix="pdf_highlight_"))
            temp_pdf_path = temp_dir / "pdf.pdf"
            temp_pdf_path.write_bytes(pdf_bytes)

            # Open PDF from temp file
            doc = pymupdf.open(temp_pdf_path)

            # Extract text with page boundaries
            full_text, page_boundaries = (
                PDFHighlighter.extract_pdf_text_with_boundaries(doc)
            )

            # Find which page contains the chunk
            chunk_page_info = PDFHighlighter.find_chunk_page(
                chunk_start_offset, chunk_end_offset, page_boundaries
            )

            if not chunk_page_info:
                logger.error("Chunk not found on any page")
                return None

            page_num = chunk_page_info["page_num"]

            # Log if page differs from stored metadata
            if stored_page_number and stored_page_number != page_num:
                logger.info(
                    f"Chunk primarily on page {page_num}, metadata says {stored_page_number}"
                )

            # Extract page text
            page_boundary = page_boundaries[page_num - 1]
            page_start = page_boundary["start_offset"]
            page_end = page_boundary["end_offset"]
            page_text = full_text[page_start:page_end]

            # Extract chunk text using page-relative offsets
            page_relative_start = chunk_page_info["page_relative_start"]
            page_relative_end = chunk_page_info["page_relative_end"]
            chunk_text = page_text[page_relative_start:page_relative_end]

            # Page text length is passed through to the page-level highlighter
            page_text_length = page_end - page_start

            logger.debug(
                f"Extracted {len(chunk_text)} chars on page {page_num} "
                f"(offsets {page_relative_start}-{page_relative_end} of {page_text_length})"
            )

            # Get page and add highlights
            page = doc[page_num - 1]
            highlight_count = PDFHighlighter.highlight_chunk_on_page(
                page,
                chunk_text,
                color,
                page_relative_start=page_relative_start,
                page_relative_end=page_relative_end,
                page_text_length=page_text_length,
            )

            if highlight_count == 0:
                logger.warning("No highlights added")
                return None

            # Render page to PNG
            mat = pymupdf.Matrix(zoom, zoom)
            pix = page.get_pixmap(matrix=mat, alpha=False)
            png_bytes = pix.tobytes("png")

            logger.info(
                f"Generated {len(png_bytes):,} byte image with {highlight_count} highlights"
            )

            return (png_bytes, page_num, highlight_count)

        except Exception as e:
            logger.error(f"Error highlighting chunk: {e}", exc_info=True)
            return None

        finally:
            # Close the document first: an open handle on the temp file can
            # prevent the directory from being removed on some platforms.
            if doc is not None:
                try:
                    doc.close()
                except Exception as e:
                    logger.warning(f"Failed to close PDF document: {e}")

            # Clean up temp directory and PDF file
            if temp_dir is not None and temp_dir.exists():
                try:
                    shutil.rmtree(temp_dir)
                except Exception as e:
                    logger.warning(
                        f"Failed to delete temp directory {temp_dir}: {e}"
                    )

    @staticmethod
    def highlight_chunks_batch(
        pdf_bytes: bytes,
        chunks: list[tuple[int, int, int, int | None, str]],
        page_boundaries: list[dict],
        full_text: str,
        color: str = "yellow",
        zoom: float = 2.0,
    ) -> dict[int, tuple[bytes, int, int]]:
        """Generate highlighted images for multiple chunks.

        Opens PDF once for rendering, uses pre-computed page boundaries from the
        document processor. This ensures consistent character offsets between
        chunking and highlighting.

        Args:
            pdf_bytes: PDF file bytes
            chunks: List of (chunk_index, start_offset, end_offset, stored_page_number, chunk_text)
                    The chunk_index is used as the key in the returned dict.
                    chunk_text is the actual text content of the chunk.
            page_boundaries: Pre-computed page boundaries from document processor.
                            Each entry: {"page": 1, "start_offset": 0, "end_offset": 1234}
            full_text: Full document text for extracting page-relative portions.
            color: Highlight color name
            zoom: Rendering zoom factor (2.0 = 144 DPI)

        Returns:
            Dict mapping chunk_index to (png_bytes, page_number, highlight_count)
            Chunks that fail to highlight are omitted from the result.
        """
        import shutil
        import tempfile
        from collections import defaultdict
        from pathlib import Path

        results: dict[int, tuple[bytes, int, int]] = {}

        if not chunks:
            return results

        temp_pdf_path = None
        try:
            # Write PDF to temp file
            temp_dir = Path(tempfile.mkdtemp(prefix="pdf_highlight_batch_"))
            temp_pdf_path = temp_dir / "pdf.pdf"
            temp_pdf_path.write_bytes(pdf_bytes)

            # Open PDF once (only for rendering, not text extraction)
            doc = pymupdf.open(temp_pdf_path)

            logger.debug(
                f"Batch highlighting: {len(chunks)} chunks, "
                f"{len(page_boundaries)} pages"
            )

            # Group chunks by their target page for efficient rendering
            # We'll render each page only once with all its highlights
            chunks_by_page: dict[int, list[tuple[int, dict, str]]] = defaultdict(list)

            for chunk_tuple in chunks:
                # Unpack chunk tuple - chunk_text is now passed directly
                chunk_index, start_offset, end_offset, stored_page_num, chunk_text = (
                    chunk_tuple
                )

                # Find which page contains this chunk
                chunk_page_info = PDFHighlighter.find_chunk_page(
                    start_offset, end_offset, page_boundaries
                )

                if not chunk_page_info:
                    logger.warning(f"Chunk {chunk_index}: not found on any page")
                    continue

                page_num = chunk_page_info["page_num"]

                # Log if page differs from stored metadata
                if stored_page_num and stored_page_num != page_num:
                    logger.debug(
                        f"Chunk {chunk_index}: found on page {page_num}, "
                        f"metadata says {stored_page_num}"
                    )

                # Extract page-relative portion of chunk text
                # This is critical for cross-page chunks where the start
                # of the chunk might be on a different page
                page_boundary = page_boundaries[page_num - 1]
                page_start = page_boundary["start_offset"]
                page_end = page_boundary["end_offset"]
                page_text_length = page_end - page_start

                # Calculate what portion of the chunk appears on this page
                chunk_start_on_page = max(start_offset, page_start)
                chunk_end_on_page = min(end_offset, page_end)

                # Extract just the text that appears on this page
                page_relative_text = full_text[chunk_start_on_page:chunk_end_on_page]

                chunks_by_page[page_num].append(
                    (chunk_index, chunk_page_info, page_relative_text, page_text_length)
                )

            logger.debug(
                f"Chunks distributed across {len(chunks_by_page)} unique pages"
            )

            # Process each chunk, rendering with only its own highlights
            # Store original page contents to restore between chunks
            page_contents_cache: dict[int, list[bytes]] = {}

            for page_num, page_chunks in chunks_by_page.items():
                page = doc[page_num - 1]

                # Cache original page contents (before any highlights added)
                # xref is the PDF object reference for each content stream
                if page_num not in page_contents_cache:
                    page_contents_cache[page_num] = []
                    xrefs = page.get_contents()
                    for xref in xrefs:
                        page_contents_cache[page_num].append(doc.xref_stream(xref))

                for (
                    chunk_index,
                    chunk_page_info,
                    chunk_text,
                    page_text_length,
                ) in page_chunks:
                    try:
                        # Restore original page contents to remove previous highlights
                        # Highlights are drawn shapes, not annotations, so we must
                        # restore the content stream to clear them
                        xrefs = page.get_contents()
                        for i, xref in enumerate(xrefs):
                            if i < len(page_contents_cache[page_num]):
                                doc.update_stream(
                                    xref, page_contents_cache[page_num][i]
                                )

                        # Add highlights for this chunk with region constraint
                        page_relative_start = chunk_page_info["page_relative_start"]
                        page_relative_end = chunk_page_info["page_relative_end"]
                        highlight_count = PDFHighlighter.highlight_chunk_on_page(
                            page,
                            chunk_text,
                            color,
                            page_relative_start=page_relative_start,
                            page_relative_end=page_relative_end,
                            page_text_length=page_text_length,
                        )

                        if highlight_count == 0:
                            logger.warning(f"Chunk {chunk_index}: no highlights added")
                            continue

                        # Render page to PNG
                        mat = pymupdf.Matrix(zoom, zoom)
                        pix = page.get_pixmap(matrix=mat, alpha=False)
                        png_bytes = pix.tobytes("png")

                        results[chunk_index] = (png_bytes, page_num, highlight_count)

                        logger.debug(
                            f"Chunk {chunk_index}: {len(png_bytes):,} bytes, "
                            f"page {page_num}, {highlight_count} highlights"
                        )

                    except Exception as e:
                        logger.error(f"Chunk {chunk_index}: error - {e}")
                        continue

            doc.close()

            logger.info(
                f"Batch highlighted {len(results)}/{len(chunks)} chunks successfully"
            )

            return results

        except Exception as e:
            logger.error(f"Error in batch highlighting: {e}", exc_info=True)
            return results

        finally:
            # Clean up temp directory
            if temp_pdf_path and temp_pdf_path.parent.exists():
                try:
                    shutil.rmtree(temp_pdf_path.parent)
                except Exception as e:
                    logger.warning(f"Failed to clean up temp dir: {e}")