feat: add chunk position tracking to vector indexing and search

Track character offsets (start_offset, end_offset) for each chunk in the vector database metadata, enabling precise chunk highlighting in the visualization pane.

Changes:
- processor.py: Store chunk_start_offset and chunk_end_offset in Qdrant metadata
- processor.py: Add metadata_version=2 to indicate position tracking support
- search/semantic.py: Return chunk positions from search results
- server/semantic.py: Expose chunk positions in API responses (SemanticSearchResult)

Enables the viz pane to:
1. Display the exact matched chunk with surrounding context
2. Highlight the precise portion of text that matched the query
3. Build user trust by showing what the RAG system actually retrieved

Position tracking uses the ChunkWithPosition dataclass from document_chunker.py, which provides character-accurate offsets in the original document.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-17 06:47:58 +01:00
parent c3282534eb
commit 3aa7128f45
3 changed files with 29 additions and 10 deletions
@@ -233,13 +233,16 @@ async def _index_document(
    )
    chunks = chunker.chunk_text(content)

+    # Extract chunk texts for embedding
+    chunk_texts = [chunk.text for chunk in chunks]
+
    # Generate dense embeddings (I/O bound - external API call)
    embedding_service = get_embedding_service()
-    dense_embeddings = await embedding_service.embed_batch(chunks)
+    dense_embeddings = await embedding_service.embed_batch(chunk_texts)

    # Generate sparse embeddings (BM25 for keyword matching)
    bm25_service = get_bm25_service()
-    sparse_embeddings = bm25_service.encode_batch(chunks)
+    sparse_embeddings = bm25_service.encode_batch(chunk_texts)

    # Prepare Qdrant points
    indexed_at = int(time.time())
@@ -265,12 +268,15 @@ async def _index_document(
                    "doc_id": doc_task.doc_id,
                    "doc_type": doc_task.doc_type,
                    "title": title,
-                    "excerpt": chunk[:200],
+                    "excerpt": chunk.text[:200],
                    "indexed_at": indexed_at,
                    "modified_at": doc_task.modified_at,
                    "etag": etag,
                    "chunk_index": i,
                    "total_chunks": len(chunks),
+                    "chunk_start_offset": chunk.start_offset,
+                    "chunk_end_offset": chunk.end_offset,
+                    "metadata_version": 2,  # v2 includes position metadata
                },
            )
        )