perf: Optimize PDF processing with parallel extraction and single-render highlights

Phase 1 - PDF Highlighting Optimization:
- Render each page ONCE instead of once per chunk (N chunks = 1 render, not N)
- Use PIL to draw bounding boxes on copied base images (fast) instead of
  re-rendering page via pymupdf (slow)
- Add _find_chunk_bbox() to extract bbox without modifying page

Phase 2 - Parallel Page Extraction:
- Use anyio task group with run_sync() for parallel page extraction
- Each page extracted in separate thread via anyio.to_thread.run_sync()
- Event loop stays responsive during extraction
- Remove obsolete _process_sync() method

Expected improvement: 30-50% reduction in total PDF processing time.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
Chris Coutinho
2025-11-22 03:11:56 +01:00
parent fffe483c02
commit 31fade9730
2 changed files with 212 additions and 132 deletions
@@ -99,129 +99,132 @@ class PyMuPDFProcessor(DocumentProcessor):
try:
if progress_callback:
await progress_callback(0, 100, "Processing PDF in background thread")
await progress_callback(0, 100, "Opening PDF document")
# Run CPU-bound PDF processing in thread pool to avoid blocking event loop
result = await anyio.to_thread.run_sync( # type: ignore[attr-defined]
self._process_sync,
content,
filename,
# Open document and extract metadata in thread
doc = await anyio.to_thread.run_sync( # type: ignore[attr-defined]
lambda: pymupdf.open("pdf", content)
)
metadata = self._extract_metadata(doc, filename)
metadata["file_size"] = len(content)
page_count = doc.page_count
if progress_callback:
await progress_callback(10, 100, f"Extracting {page_count} pages")
# Prepare image directory if needed
pdf_image_dir = None
if self.extract_images:
pdf_id = filename.replace("/", "_") if filename else "unknown"
pdf_image_dir = self.image_dir / pdf_id
pdf_image_dir.mkdir(exist_ok=True, parents=True)
# OPTIMIZATION: Extract pages in parallel using anyio task group
page_texts = await self._extract_pages_parallel(
doc, page_count, pdf_image_dir
)
if progress_callback:
await progress_callback(90, 100, "Building result")
# Calculate page boundaries (sequential, fast)
page_boundaries = []
current_offset = 0
for page_num, page_md in enumerate(page_texts):
page_boundaries.append(
{
"page": page_num + 1,
"start_offset": current_offset,
"end_offset": current_offset + len(page_md),
}
)
current_offset += len(page_md)
# Collect image paths
image_paths = []
if pdf_image_dir and pdf_image_dir.exists():
image_paths = [str(p) for p in pdf_image_dir.glob("*")]
# Build final text and metadata
md_text = "".join(page_texts)
metadata["has_images"] = len(image_paths) > 0
if image_paths:
metadata["image_count"] = len(image_paths)
metadata["image_paths"] = image_paths
metadata["page_boundaries"] = page_boundaries
# Close document
doc.close()
if progress_callback:
await progress_callback(100, 100, "Processing complete")
return result
logger.info(
f"Successfully processed PDF {filename or '<bytes>'}: "
f"{metadata['page_count']} pages, {len(md_text)} chars, "
f"{metadata.get('image_count', 0)} images"
)
return ProcessingResult(
text=md_text,
metadata=metadata,
processor=self.name,
success=True,
)
except Exception as e:
error_msg = f"Failed to process PDF {filename or '<bytes>'}: {e}"
logger.error(error_msg, exc_info=True)
raise ProcessorError(error_msg) from e
def _process_sync(
async def _extract_pages_parallel(
self,
content: bytes,
filename: Optional[str] = None,
) -> ProcessingResult:
"""Synchronous PDF processing (runs in thread pool).
doc: pymupdf.Document,
page_count: int,
pdf_image_dir: pathlib.Path | None,
) -> list[str]:
"""Extract text from all pages in parallel using anyio.
Args:
content: PDF document bytes
filename: Optional filename for better error messages
doc: Opened PyMuPDF document
page_count: Number of pages to extract
pdf_image_dir: Directory for extracted images (or None)
Returns:
ProcessingResult with extracted text and metadata
Raises:
Exception: If PDF processing fails
List of page texts in order
"""
# Open PDF from bytes
doc = pymupdf.open("pdf", content)
import anyio
# Extract metadata from PDF
metadata = self._extract_metadata(doc, filename)
results: list[str | None] = [None] * page_count
# Add file size to metadata
metadata["file_size"] = len(content)
async def extract_one(page_num: int) -> None:
"""Extract single page in thread pool."""
# Extract text page-by-page to preserve page boundaries
# pymupdf.layout.activate() causes page_chunks=True to return a string,
# so we manually extract text per page instead.
page_boundaries = []
current_offset = 0
full_text_parts = []
image_paths = []
for page_num in range(doc.page_count):
if self.extract_images:
# Generate unique directory for this PDF's images
pdf_id = filename.replace("/", "_") if filename else "unknown"
pdf_image_dir = self.image_dir / pdf_id
pdf_image_dir.mkdir(exist_ok=True, parents=True)
# Extract page as markdown with images
page_md = pymupdf4llm.to_markdown(
def do_extract() -> str:
return pymupdf4llm.to_markdown(
doc,
pages=[page_num], # Extract single page
write_images=True,
image_path=pdf_image_dir,
page_chunks=False, # Single page, no chunking needed
pages=[page_num],
write_images=self.extract_images,
image_path=pdf_image_dir if self.extract_images else None,
page_chunks=False,
)
# Collect image paths
if pdf_image_dir.exists():
page_images = [str(p) for p in pdf_image_dir.glob("*")]
image_paths.extend(page_images)
else:
# Extract page as markdown without images
page_md = pymupdf4llm.to_markdown(
doc,
pages=[page_num], # Extract single page
write_images=False,
page_chunks=False, # Single page, no chunking needed
)
results[page_num] = await anyio.to_thread.run_sync(do_extract) # type: ignore[attr-defined]
# Store page text
full_text_parts.append(page_md)
# Run all page extractions in parallel
async with anyio.create_task_group() as tg:
for page_num in range(page_count):
tg.start_soon(extract_one, page_num)
# Store boundary info: {page (1-indexed), start, end}
page_boundaries.append(
{
"page": page_num + 1, # Convert to 1-indexed
"start_offset": current_offset,
"end_offset": current_offset + len(page_md),
}
)
# Verify all pages extracted
final_results: list[str] = []
for i, text in enumerate(results):
if text is None:
raise ProcessorError(f"Page {i} extraction failed")
final_results.append(text)
current_offset += len(page_md)
# Join all page texts
md_text = "".join(full_text_parts)
# Store image metadata
metadata["has_images"] = len(image_paths) > 0
if image_paths:
metadata["image_count"] = len(image_paths)
metadata["image_paths"] = image_paths
# Add page boundaries to metadata for chunker to use
metadata["page_boundaries"] = page_boundaries
# Close the document
doc.close()
logger.info(
f"Successfully processed PDF {filename or '<bytes>'}: "
f"{metadata['page_count']} pages, {len(md_text)} chars, "
f"{metadata.get('image_count', 0)} images"
)
return ProcessingResult(
text=md_text,
metadata=metadata,
processor=self.name,
success=True,
)
return final_results
def _extract_metadata(
self, doc: pymupdf.Document, filename: Optional[str]
+113 -36
View File
@@ -393,6 +393,65 @@ class PDFHighlighter:
return clean_text if clean_text else None
@staticmethod
def _find_chunk_bbox(
page: pymupdf.Page,
chunk_text: str,
page_relative_start: int,
page_relative_end: int,
page_text_length: int,
) -> tuple[float, float, float, float] | None:
"""Find bounding box for a chunk without modifying the page.
Returns (x0, y0, x1, y1) in page coordinates, or None if not found.
"""
page_rect = page.rect
# Strip markdown for searching
search_text = PDFHighlighter.strip_markdown(chunk_text)
# Try to find chunk location using text search
anchor_rect = None
search_phrases = []
# Build search phrases from chunk text
sentences = re.split(r"[.!?]\s+", search_text)
for sentence in sentences[:3]:
sentence = sentence.strip()
if len(sentence) >= 20:
search_phrases.append(sentence[:80])
if len(sentence) >= 40:
search_phrases.append(sentence[:40])
# Also try first N characters
if len(search_text) >= 30:
search_phrases.append(search_text[:60])
search_phrases.append(search_text[:30])
for phrase in search_phrases:
if not phrase:
continue
rects = page.search_for(phrase.strip())
if rects:
anchor_rect = rects[0]
break
if not anchor_rect:
return None
# Calculate chunk height based on character count
chunk_chars = len(search_text)
estimated_lines = max(1, chunk_chars / 60)
estimated_height = estimated_lines * 14
# Build bounding box
return (
page_rect.x0 + 30, # Left margin
anchor_rect.y0 - 5, # Start slightly above anchor
page_rect.x1 - 30, # Right margin
min(anchor_rect.y0 + estimated_height + 10, page_rect.y1 - 30),
)
@staticmethod
def highlight_chunk_on_page(
page: pymupdf.Page,
@@ -739,20 +798,32 @@ class PDFHighlighter:
f"Chunks distributed across {len(chunks_by_page)} unique pages"
)
# Process each chunk, rendering with only its own highlights
# Store original page contents to restore between chunks
page_contents_cache: dict[int, list[bytes]] = {}
# OPTIMIZATION: Render each page ONCE, then draw highlights using PIL
# This avoids expensive page.get_pixmap() calls per chunk
from io import BytesIO
from PIL import Image, ImageDraw
# PIL color for bounding box (RGB tuple)
rgb = PDFHighlighter.COLORS.get(color, PDFHighlighter.COLORS["yellow"])
pil_color = tuple(int(c * 255) for c in rgb)
fill_color = (255, 255, 178, 38) # Light yellow with alpha
for page_num, page_chunks in chunks_by_page.items():
page = doc[page_num - 1]
# Cache original page contents (before any highlights added)
# xref is the PDF object reference for each content stream
if page_num not in page_contents_cache:
page_contents_cache[page_num] = []
xrefs = page.get_contents()
for xref in xrefs:
page_contents_cache[page_num].append(doc.xref_stream(xref))
# Render page ONCE to get base image (most expensive operation)
mat = pymupdf.Matrix(zoom, zoom)
base_pix = page.get_pixmap(matrix=mat, alpha=False)
base_png = base_pix.tobytes("png")
# Convert to PIL Image for fast highlight drawing
base_image = Image.open(BytesIO(base_png)).convert("RGBA")
page_rect = page.rect
logger.debug(
f"Page {page_num}: rendered once, processing {len(page_chunks)} chunks"
)
for (
chunk_index,
@@ -761,42 +832,48 @@ class PDFHighlighter:
page_text_length,
) in page_chunks:
try:
# Restore original page contents to remove previous highlights
# Highlights are drawn shapes, not annotations, so we must
# restore the content stream to clear them
xrefs = page.get_contents()
for i, xref in enumerate(xrefs):
if i < len(page_contents_cache[page_num]):
doc.update_stream(
xref, page_contents_cache[page_num][i]
)
# Add highlights for this chunk with region constraint
page_relative_start = chunk_page_info["page_relative_start"]
page_relative_end = chunk_page_info["page_relative_end"]
highlight_count = PDFHighlighter.highlight_chunk_on_page(
# Find chunk bounding box using text search
bbox = PDFHighlighter._find_chunk_bbox(
page,
chunk_text,
color,
page_relative_start=page_relative_start,
page_relative_end=page_relative_end,
page_text_length=page_text_length,
chunk_page_info["page_relative_start"],
chunk_page_info["page_relative_end"],
page_text_length,
)
if highlight_count == 0:
logger.warning(f"Chunk {chunk_index}: no highlights added")
if bbox is None:
logger.warning(f"Chunk {chunk_index}: could not find bbox")
continue
# Render page to PNG
mat = pymupdf.Matrix(zoom, zoom)
pix = page.get_pixmap(matrix=mat, alpha=False)
png_bytes = pix.tobytes("png")
# Copy base image and draw highlight using PIL (fast!)
chunk_image = base_image.copy()
draw = ImageDraw.Draw(chunk_image, "RGBA")
results[chunk_index] = (png_bytes, page_num, highlight_count)
# Scale bbox coordinates to pixmap coordinates
scale_x = base_pix.width / page_rect.width
scale_y = base_pix.height / page_rect.height
pil_bbox = (
int(bbox[0] * scale_x),
int(bbox[1] * scale_y),
int(bbox[2] * scale_x),
int(bbox[3] * scale_y),
)
# Draw semi-transparent fill
draw.rectangle(pil_bbox, fill=fill_color)
# Draw dashed border (PIL doesn't support dashes, use solid)
draw.rectangle(pil_bbox, outline=pil_color, width=3)
# Convert back to PNG bytes
output = BytesIO()
chunk_image.convert("RGB").save(output, format="PNG")
png_bytes = output.getvalue()
results[chunk_index] = (png_bytes, page_num, 1)
logger.debug(
f"Chunk {chunk_index}: {len(png_bytes):,} bytes, "
f"page {page_num}, {highlight_count} highlights"
f"page {page_num}, bbox {pil_bbox}"
)
except Exception as e: