fffe483c02
Previously, pymupdf4llm.to_markdown() was called twice - once in PyMuPDFProcessor during indexing and again in PDFHighlighter during visualization. Different image path lengths caused different character offsets, leading to highlighted pages not matching their chunks. Also fixed issue where all chunks on the same page showed all highlights instead of just their own highlight. Now restores original page contents between chunks using xref stream caching. Changes: - Add PDFHighlighter class requiring pre-computed page_boundaries and full_text from document processor (no fallback extraction) - Pass pre-computed data from processor to highlighter - Extract page-relative portion of chunk text for cross-page chunks - Add bounding box highlighting using text anchor search - Run highlight generation in parallel with embedding/BM25 - Cache and restore page contents to isolate highlights per chunk Results: Highlighting success rate improved from 51% to 95% (121/128). 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
286 lines
9.3 KiB
Python
286 lines
9.3 KiB
Python
"""Document processor using PyMuPDF (fitz) library."""
|
|
|
|
import logging
|
|
import pathlib
|
|
import tempfile
|
|
from collections.abc import Awaitable, Callable
|
|
from typing import Any, Optional
|
|
|
|
import pymupdf
|
|
import pymupdf.layout
|
|
|
|
from .base import DocumentProcessor, ProcessingResult, ProcessorError
|
|
|
|
# Activate layout analysis for better text extraction
|
|
pymupdf.layout.activate()
|
|
import pymupdf4llm # noqa
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class PyMuPDFProcessor(DocumentProcessor):
|
|
"""Document processor using PyMuPDF library for PDF processing.
|
|
|
|
PyMuPDF (fitz) is a fast, local PDF processing library that extracts text,
|
|
metadata, and images without requiring external API calls.
|
|
|
|
Features:
|
|
- Fast text extraction with layout preservation
|
|
- PDF metadata extraction (title, author, creation date, page count)
|
|
- Image extraction for future multimodal support
|
|
- Page number tracking for precise citations
|
|
"""
|
|
|
|
SUPPORTED_TYPES = {
|
|
"application/pdf",
|
|
}
|
|
|
|
def __init__(
|
|
self,
|
|
extract_images: bool = True,
|
|
image_dir: Optional[str | pathlib.Path] = None,
|
|
):
|
|
"""Initialize PyMuPDF processor.
|
|
|
|
Args:
|
|
extract_images: Whether to extract embedded images from PDFs
|
|
image_dir: Directory to store extracted images (defaults to temp directory)
|
|
"""
|
|
self.extract_images = extract_images
|
|
|
|
if image_dir is None:
|
|
self.image_dir = pathlib.Path(tempfile.gettempdir()) / "pdf-images"
|
|
else:
|
|
self.image_dir = pathlib.Path(image_dir)
|
|
|
|
# Create image directory if it doesn't exist
|
|
if self.extract_images:
|
|
self.image_dir.mkdir(exist_ok=True, parents=True)
|
|
logger.info(
|
|
f"Initialized PyMuPDFProcessor with image extraction to {self.image_dir}"
|
|
)
|
|
else:
|
|
logger.info("Initialized PyMuPDFProcessor without image extraction")
|
|
|
|
@property
|
|
def name(self) -> str:
|
|
return "pymupdf"
|
|
|
|
@property
|
|
def supported_mime_types(self) -> set[str]:
|
|
return self.SUPPORTED_TYPES
|
|
|
|
async def process(
|
|
self,
|
|
content: bytes,
|
|
content_type: str,
|
|
filename: Optional[str] = None,
|
|
options: Optional[dict[str, Any]] = None,
|
|
progress_callback: Optional[
|
|
Callable[[float, Optional[float], Optional[str]], Awaitable[None]]
|
|
] = None,
|
|
) -> ProcessingResult:
|
|
"""Process a PDF document and extract text, metadata, and images.
|
|
|
|
Args:
|
|
content: PDF document bytes
|
|
content_type: MIME type (should be application/pdf)
|
|
filename: Optional filename for better error messages
|
|
options: Processing options (currently unused)
|
|
progress_callback: Optional callback for progress updates
|
|
|
|
Returns:
|
|
ProcessingResult with extracted text and metadata
|
|
|
|
Raises:
|
|
ProcessorError: If PDF processing fails
|
|
"""
|
|
import anyio
|
|
|
|
try:
|
|
if progress_callback:
|
|
await progress_callback(0, 100, "Processing PDF in background thread")
|
|
|
|
# Run CPU-bound PDF processing in thread pool to avoid blocking event loop
|
|
result = await anyio.to_thread.run_sync( # type: ignore[attr-defined]
|
|
self._process_sync,
|
|
content,
|
|
filename,
|
|
)
|
|
|
|
if progress_callback:
|
|
await progress_callback(100, 100, "Processing complete")
|
|
|
|
return result
|
|
|
|
except Exception as e:
|
|
error_msg = f"Failed to process PDF {filename or '<bytes>'}: {e}"
|
|
logger.error(error_msg, exc_info=True)
|
|
raise ProcessorError(error_msg) from e
|
|
|
|
def _process_sync(
|
|
self,
|
|
content: bytes,
|
|
filename: Optional[str] = None,
|
|
) -> ProcessingResult:
|
|
"""Synchronous PDF processing (runs in thread pool).
|
|
|
|
Args:
|
|
content: PDF document bytes
|
|
filename: Optional filename for better error messages
|
|
|
|
Returns:
|
|
ProcessingResult with extracted text and metadata
|
|
|
|
Raises:
|
|
Exception: If PDF processing fails
|
|
"""
|
|
# Open PDF from bytes
|
|
doc = pymupdf.open("pdf", content)
|
|
|
|
# Extract metadata from PDF
|
|
metadata = self._extract_metadata(doc, filename)
|
|
|
|
# Add file size to metadata
|
|
metadata["file_size"] = len(content)
|
|
|
|
# Extract text page-by-page to preserve page boundaries
|
|
# pymupdf.layout.activate() causes page_chunks=True to return a string,
|
|
# so we manually extract text per page instead.
|
|
page_boundaries = []
|
|
current_offset = 0
|
|
full_text_parts = []
|
|
image_paths = []
|
|
|
|
for page_num in range(doc.page_count):
|
|
if self.extract_images:
|
|
# Generate unique directory for this PDF's images
|
|
pdf_id = filename.replace("/", "_") if filename else "unknown"
|
|
pdf_image_dir = self.image_dir / pdf_id
|
|
pdf_image_dir.mkdir(exist_ok=True, parents=True)
|
|
|
|
# Extract page as markdown with images
|
|
page_md = pymupdf4llm.to_markdown(
|
|
doc,
|
|
pages=[page_num], # Extract single page
|
|
write_images=True,
|
|
image_path=pdf_image_dir,
|
|
page_chunks=False, # Single page, no chunking needed
|
|
)
|
|
|
|
# Collect image paths
|
|
if pdf_image_dir.exists():
|
|
page_images = [str(p) for p in pdf_image_dir.glob("*")]
|
|
image_paths.extend(page_images)
|
|
else:
|
|
# Extract page as markdown without images
|
|
page_md = pymupdf4llm.to_markdown(
|
|
doc,
|
|
pages=[page_num], # Extract single page
|
|
write_images=False,
|
|
page_chunks=False, # Single page, no chunking needed
|
|
)
|
|
|
|
# Store page text
|
|
full_text_parts.append(page_md)
|
|
|
|
# Store boundary info: {page (1-indexed), start, end}
|
|
page_boundaries.append(
|
|
{
|
|
"page": page_num + 1, # Convert to 1-indexed
|
|
"start_offset": current_offset,
|
|
"end_offset": current_offset + len(page_md),
|
|
}
|
|
)
|
|
|
|
current_offset += len(page_md)
|
|
|
|
# Join all page texts
|
|
md_text = "".join(full_text_parts)
|
|
|
|
# Store image metadata
|
|
metadata["has_images"] = len(image_paths) > 0
|
|
if image_paths:
|
|
metadata["image_count"] = len(image_paths)
|
|
metadata["image_paths"] = image_paths
|
|
|
|
# Add page boundaries to metadata for chunker to use
|
|
metadata["page_boundaries"] = page_boundaries
|
|
|
|
# Close the document
|
|
doc.close()
|
|
|
|
logger.info(
|
|
f"Successfully processed PDF {filename or '<bytes>'}: "
|
|
f"{metadata['page_count']} pages, {len(md_text)} chars, "
|
|
f"{metadata.get('image_count', 0)} images"
|
|
)
|
|
|
|
return ProcessingResult(
|
|
text=md_text,
|
|
metadata=metadata,
|
|
processor=self.name,
|
|
success=True,
|
|
)
|
|
|
|
def _extract_metadata(
|
|
self, doc: pymupdf.Document, filename: Optional[str]
|
|
) -> dict[str, Any]:
|
|
"""Extract metadata from PDF document.
|
|
|
|
Args:
|
|
doc: Opened PyMuPDF document
|
|
filename: Optional filename
|
|
|
|
Returns:
|
|
Dictionary with PDF metadata
|
|
"""
|
|
metadata: dict[str, Any] = {}
|
|
|
|
# Basic document info
|
|
metadata["page_count"] = doc.page_count
|
|
metadata["format"] = "PDF 1." + str(
|
|
doc.pdf_version() if hasattr(doc, "pdf_version") else "?" # type: ignore[call-non-callable]
|
|
)
|
|
|
|
if filename:
|
|
metadata["filename"] = filename
|
|
|
|
# Extract PDF metadata dictionary
|
|
pdf_metadata = doc.metadata
|
|
if pdf_metadata:
|
|
# Standard PDF metadata fields
|
|
if pdf_metadata.get("title"):
|
|
metadata["title"] = pdf_metadata["title"]
|
|
if pdf_metadata.get("author"):
|
|
metadata["author"] = pdf_metadata["author"]
|
|
if pdf_metadata.get("subject"):
|
|
metadata["subject"] = pdf_metadata["subject"]
|
|
if pdf_metadata.get("keywords"):
|
|
metadata["keywords"] = pdf_metadata["keywords"]
|
|
if pdf_metadata.get("creator"):
|
|
metadata["creator"] = pdf_metadata["creator"]
|
|
if pdf_metadata.get("producer"):
|
|
metadata["producer"] = pdf_metadata["producer"]
|
|
if pdf_metadata.get("creationDate"):
|
|
metadata["creation_date"] = pdf_metadata["creationDate"]
|
|
if pdf_metadata.get("modDate"):
|
|
metadata["modification_date"] = pdf_metadata["modDate"]
|
|
|
|
return metadata
|
|
|
|
async def health_check(self) -> bool:
|
|
"""Check if PyMuPDF is available and working.
|
|
|
|
Returns:
|
|
True if processor is ready to use
|
|
"""
|
|
try:
|
|
# Try to create a simple PDF in memory
|
|
test_doc = pymupdf.open()
|
|
test_doc.close()
|
|
return True
|
|
except Exception as e:
|
|
logger.error(f"PyMuPDF health check failed: {e}")
|
|
return False
|