From 8baa07db84907b079931327f6262231f74aa0311 Mon Sep 17 00:00:00 2001
From: Chris Coutinho
Date: Sat, 22 Nov 2025 16:58:35 +0100
Subject: [PATCH] fix: Remove pymupdf.layout.activate() to fix page_chunks behavior
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

pymupdf.layout.activate() causes pymupdf4llm.to_markdown() to ignore the
page_chunks=True option, returning a single string instead of list[dict].
This broke per-page chunking needed for semantic search indexing.

See: https://github.com/pymupdf/pymupdf4llm/issues/323

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude
---
 nextcloud_mcp_server/document_processors/pymupdf.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/nextcloud_mcp_server/document_processors/pymupdf.py b/nextcloud_mcp_server/document_processors/pymupdf.py
index 2445a27..be2cb6a 100644
--- a/nextcloud_mcp_server/document_processors/pymupdf.py
+++ b/nextcloud_mcp_server/document_processors/pymupdf.py
@@ -6,15 +6,15 @@ import tempfile
 from collections.abc import Awaitable, Callable
 from typing import Any, Optional
 
+# NOTE: Do NOT call pymupdf.layout.activate() here!
+# It changes the behavior of pymupdf4llm.to_markdown() when page_chunks=True,
+# causing it to return a string instead of a list[dict].
+# See: https://github.com/pymupdf/pymupdf4llm/issues/323
 import pymupdf
-import pymupdf.layout
+import pymupdf4llm
 
 from .base import DocumentProcessor, ProcessingResult, ProcessorError
 
-# Activate layout analysis for better text extraction
-pymupdf.layout.activate()
-import pymupdf4llm  # noqa
-
 logger = logging.getLogger(__name__)
 
 