Files
nextcloud-mcp-server/nextcloud_mcp_server/document_processors/pymupdf.py
T
Chris Coutinho b8010270c1 fix: Add async/await, PDF metadata, and type safety fixes
This commit addresses multiple issues with async operations, PDF metadata
extraction, and type safety in document processing and search.

## Async/Await Fixes
- processor.py:259 - Added await for chunker.chunk_text(content)
- processor.py:270 - Added await for bm25_service.encode_batch(chunk_texts)
- tests/unit/test_document_chunker.py - Converted all 12 test methods to async

## PDF Metadata Enhancement
- pymupdf.py:143 - Added file_size metadata extraction
- pymupdf.py:145-206 - Refactored to extract text page-by-page
  - Manually loop through pages instead of using page_chunks=True
  - Generate page_boundaries metadata for precise page tracking
  - Works around pymupdf.layout.activate() breaking page_chunks=True
- processor.py:32-66 - Added assign_page_numbers() helper function
  - Assigns page numbers to chunks based on overlap with page boundaries
  - Handles chunks spanning multiple pages
- processor.py:298-300 - Call assign_page_numbers() for PDF files

## Type Safety Fixes
- bm25_hybrid.py:184 - Removed int() conversion of doc_id
- semantic.py:131 - Removed int() conversion of doc_id
- viz_routes.py:275 - Removed int() conversion of doc_id
- Added comments documenting that doc_id can be int (notes) or str (file paths)

## Testing
- All 18 tests passing (12 unit + 6 integration)
- No type errors in modified files
- Container logs show successful processing
- Vector viz searches working correctly

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-20 02:37:07 +01:00

286 lines
9.3 KiB
Python

"""Document processor using PyMuPDF (fitz) library."""
import logging
import pathlib
import tempfile
from collections.abc import Awaitable, Callable
from typing import Any, Optional
import pymupdf
import pymupdf.layout
import pymupdf4llm
from .base import DocumentProcessor, ProcessingResult, ProcessorError
# Activate layout analysis for better text extraction
pymupdf.layout.activate()
logger = logging.getLogger(__name__)
class PyMuPDFProcessor(DocumentProcessor):
"""Document processor using PyMuPDF library for PDF processing.
PyMuPDF (fitz) is a fast, local PDF processing library that extracts text,
metadata, and images without requiring external API calls.
Features:
- Fast text extraction with layout preservation
- PDF metadata extraction (title, author, creation date, page count)
- Image extraction for future multimodal support
- Page number tracking for precise citations
"""
SUPPORTED_TYPES = {
"application/pdf",
}
def __init__(
self,
extract_images: bool = True,
image_dir: Optional[str | pathlib.Path] = None,
):
"""Initialize PyMuPDF processor.
Args:
extract_images: Whether to extract embedded images from PDFs
image_dir: Directory to store extracted images (defaults to temp directory)
"""
self.extract_images = extract_images
if image_dir is None:
self.image_dir = pathlib.Path(tempfile.gettempdir()) / "pdf-images"
else:
self.image_dir = pathlib.Path(image_dir)
# Create image directory if it doesn't exist
if self.extract_images:
self.image_dir.mkdir(exist_ok=True, parents=True)
logger.info(
f"Initialized PyMuPDFProcessor with image extraction to {self.image_dir}"
)
else:
logger.info("Initialized PyMuPDFProcessor without image extraction")
@property
def name(self) -> str:
return "pymupdf"
@property
def supported_mime_types(self) -> set[str]:
return self.SUPPORTED_TYPES
async def process(
self,
content: bytes,
content_type: str,
filename: Optional[str] = None,
options: Optional[dict[str, Any]] = None,
progress_callback: Optional[
Callable[[float, Optional[float], Optional[str]], Awaitable[None]]
] = None,
) -> ProcessingResult:
"""Process a PDF document and extract text, metadata, and images.
Args:
content: PDF document bytes
content_type: MIME type (should be application/pdf)
filename: Optional filename for better error messages
options: Processing options (currently unused)
progress_callback: Optional callback for progress updates
Returns:
ProcessingResult with extracted text and metadata
Raises:
ProcessorError: If PDF processing fails
"""
import anyio
try:
if progress_callback:
await progress_callback(0, 100, "Processing PDF in background thread")
# Run CPU-bound PDF processing in thread pool to avoid blocking event loop
result = await anyio.to_thread.run_sync(
self._process_sync,
content,
filename,
)
if progress_callback:
await progress_callback(100, 100, "Processing complete")
return result
except Exception as e:
error_msg = f"Failed to process PDF {filename or '<bytes>'}: {e}"
logger.error(error_msg, exc_info=True)
raise ProcessorError(error_msg) from e
def _process_sync(
self,
content: bytes,
filename: Optional[str] = None,
) -> ProcessingResult:
"""Synchronous PDF processing (runs in thread pool).
Args:
content: PDF document bytes
filename: Optional filename for better error messages
Returns:
ProcessingResult with extracted text and metadata
Raises:
Exception: If PDF processing fails
"""
# Open PDF from bytes
doc = pymupdf.open("pdf", content)
# Extract metadata from PDF
metadata = self._extract_metadata(doc, filename)
# Add file size to metadata
metadata["file_size"] = len(content)
# Extract text page-by-page to preserve page boundaries
# pymupdf.layout.activate() causes page_chunks=True to return a string,
# so we manually extract text per page instead.
page_boundaries = []
current_offset = 0
full_text_parts = []
image_paths = []
for page_num in range(doc.page_count):
if self.extract_images:
# Generate unique directory for this PDF's images
pdf_id = filename.replace("/", "_") if filename else "unknown"
pdf_image_dir = self.image_dir / pdf_id
pdf_image_dir.mkdir(exist_ok=True, parents=True)
# Extract page as markdown with images
page_md = pymupdf4llm.to_markdown(
doc,
pages=[page_num], # Extract single page
write_images=True,
image_path=pdf_image_dir,
page_chunks=False, # Single page, no chunking needed
)
# Collect image paths
if pdf_image_dir.exists():
page_images = [str(p) for p in pdf_image_dir.glob("*")]
image_paths.extend(page_images)
else:
# Extract page as markdown without images
page_md = pymupdf4llm.to_markdown(
doc,
pages=[page_num], # Extract single page
write_images=False,
page_chunks=False, # Single page, no chunking needed
)
# Store page text
full_text_parts.append(page_md)
# Store boundary info: {page (1-indexed), start, end}
page_boundaries.append(
{
"page": page_num + 1, # Convert to 1-indexed
"start_offset": current_offset,
"end_offset": current_offset + len(page_md),
}
)
current_offset += len(page_md)
# Join all page texts
md_text = "".join(full_text_parts)
# Store image metadata
metadata["has_images"] = len(image_paths) > 0
if image_paths:
metadata["image_count"] = len(image_paths)
metadata["image_paths"] = image_paths
# Add page boundaries to metadata for chunker to use
metadata["page_boundaries"] = page_boundaries
# Close the document
doc.close()
logger.info(
f"Successfully processed PDF {filename or '<bytes>'}: "
f"{metadata['page_count']} pages, {len(md_text)} chars, "
f"{metadata.get('image_count', 0)} images"
)
return ProcessingResult(
text=md_text,
metadata=metadata,
processor=self.name,
success=True,
)
def _extract_metadata(
self, doc: pymupdf.Document, filename: Optional[str]
) -> dict[str, Any]:
"""Extract metadata from PDF document.
Args:
doc: Opened PyMuPDF document
filename: Optional filename
Returns:
Dictionary with PDF metadata
"""
metadata: dict[str, Any] = {}
# Basic document info
metadata["page_count"] = doc.page_count
metadata["format"] = "PDF 1." + str(
doc.pdf_version() if hasattr(doc, "pdf_version") else "?"
)
if filename:
metadata["filename"] = filename
# Extract PDF metadata dictionary
pdf_metadata = doc.metadata
if pdf_metadata:
# Standard PDF metadata fields
if pdf_metadata.get("title"):
metadata["title"] = pdf_metadata["title"]
if pdf_metadata.get("author"):
metadata["author"] = pdf_metadata["author"]
if pdf_metadata.get("subject"):
metadata["subject"] = pdf_metadata["subject"]
if pdf_metadata.get("keywords"):
metadata["keywords"] = pdf_metadata["keywords"]
if pdf_metadata.get("creator"):
metadata["creator"] = pdf_metadata["creator"]
if pdf_metadata.get("producer"):
metadata["producer"] = pdf_metadata["producer"]
if pdf_metadata.get("creationDate"):
metadata["creation_date"] = pdf_metadata["creationDate"]
if pdf_metadata.get("modDate"):
metadata["modification_date"] = pdf_metadata["modDate"]
return metadata
async def health_check(self) -> bool:
"""Check if PyMuPDF is available and working.
Returns:
True if processor is ready to use
"""
try:
# Try to create a simple PDF in memory
test_doc = pymupdf.open()
test_doc.close()
return True
except Exception as e:
logger.error(f"PyMuPDF health check failed: {e}")
return False