nextcloud-mcp-server/nextcloud_mcp_server/vector/document_chunker.py

"""Document chunking for large texts using LangChain text splitters."""

import logging
from dataclasses import dataclass

from langchain_text_splitters import MarkdownTextSplitter

logger = logging.getLogger(__name__)


@dataclass
class ChunkWithPosition:
    """A text chunk with its character position in the original document."""

    text: str
    start_offset: int  # Character position where chunk starts
    end_offset: int  # Character position where chunk ends (exclusive)


class DocumentChunker:
    """Chunk large documents for optimal embedding using LangChain text splitters.

    Uses MarkdownTextSplitter which is optimized for Markdown content like
    Nextcloud Notes. Respects markdown structure (headers, code blocks, lists)
    while maintaining semantic boundaries.
    """

    def __init__(self, chunk_size: int = 2048, overlap: int = 200):
        """
        Initialize document chunker.

        Args:
            chunk_size: Number of characters per chunk (default: 2048)
            overlap: Number of overlapping characters between chunks (default: 200)
        """
        self.chunk_size = chunk_size
        self.overlap = overlap

        # Initialize LangChain MarkdownTextSplitter
        # Optimized for Markdown content with special handling for:
        # - Headers (# ## ###)
        # - Code blocks (``` ```)
        # - Lists (- * 1.)
        # - Horizontal rules (---)
        # - Paragraphs and sentences
        # This preserves both markdown structure and semantic boundaries
        self.splitter = MarkdownTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=overlap,
            add_start_index=True,  # Enable position tracking
            strip_whitespace=True,
        )

    def chunk_text(self, content: str) -> list[ChunkWithPosition]:
        """
        Split text into overlapping chunks with position tracking.

        Uses LangChain's MarkdownTextSplitter to create chunks that respect
        both markdown structure and semantic boundaries. Optimized for Nextcloud
        Notes content with special handling for headers, code blocks, lists, etc.
        Preserves character positions for each chunk to enable precise document
        retrieval.

        Args:
            content: Markdown text content to chunk

        Returns:
            List of chunks with their character positions in the original content
        """
        # Handle empty content - return single empty chunk for backward compatibility
        if not content:
            return [ChunkWithPosition(text="", start_offset=0, end_offset=0)]

        # Use LangChain to create documents with position tracking
        docs = self.splitter.create_documents([content])

        # Convert LangChain Documents to ChunkWithPosition objects
        chunks = [
            ChunkWithPosition(
                text=doc.page_content,
                start_offset=doc.metadata.get("start_index", 0),
                end_offset=doc.metadata.get("start_index", 0) + len(doc.page_content),
            )
            for doc in docs
        ]

        logger.debug(
            f"Chunked document into {len(chunks)} chunks "
            f"(chunk_size={self.chunk_size}, overlap={self.overlap})"
        )
        return chunks