fix: Remove pymupdf.layout.activate() to fix page_chunks behavior

pymupdf.layout.activate() causes pymupdf4llm.to_markdown() to ignore the
page_chunks=True option, returning a single string instead of list[dict].
This broke the per-page chunking needed for semantic search indexing.

See: https://github.com/pymupdf/pymupdf4llm/issues/323

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-22 16:58:35 +01:00
parent ba8a53803a
commit 8baa07db84
1 changed file with 5 additions and 5 deletions
@@ -6,15 +6,15 @@ import tempfile
 from collections.abc import Awaitable, Callable
 from typing import Any, Optional

+# NOTE: Do NOT call pymupdf.layout.activate() here!
+# It changes the behavior of pymupdf4llm.to_markdown() when page_chunks=True,
+# causing it to return a string instead of a list[dict].
+# See: https://github.com/pymupdf/pymupdf4llm/issues/323
 import pymupdf
-import pymupdf.layout
+import pymupdf4llm

 from .base import DocumentProcessor, ProcessingResult, ProcessorError

-# Activate layout analysis for better text extraction
-pymupdf.layout.activate()
-import pymupdf4llm  # noqa
-
 logger = logging.getLogger(__name__)