fix: Remove pymupdf.layout.activate() to fix page_chunks behavior
pymupdf.layout.activate() causes pymupdf4llm.to_markdown() to ignore the page_chunks=True option, returning a single string instead of list[dict]. This broke per-page chunking needed for semantic search indexing. See: https://github.com/pymupdf/pymupdf4llm/issues/323 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -6,15 +6,15 @@ import tempfile
|
||||
from collections.abc import Awaitable, Callable
|
||||
from typing import Any, Optional
|
||||
|
||||
# NOTE: Do NOT call pymupdf.layout.activate() here!
|
||||
# It changes the behavior of pymupdf4llm.to_markdown() when page_chunks=True,
|
||||
# causing it to return a string instead of a list[dict].
|
||||
# See: https://github.com/pymupdf/pymupdf4llm/issues/323
|
||||
import pymupdf
|
||||
import pymupdf.layout
|
||||
import pymupdf4llm
|
||||
|
||||
from .base import DocumentProcessor, ProcessingResult, ProcessorError
|
||||
|
||||
# Activate layout analysis for better text extraction
|
||||
pymupdf.layout.activate()
|
||||
import pymupdf4llm # noqa
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user