feat(search): add file_path metadata and chunk offsets to search results

Changes:
- Add file_path to metadata in semantic and BM25 hybrid search algorithms
  for PDF viewer integration (search/semantic.py:161-163, search/bm25_hybrid.py:230-232)
- Include chunk_start_offset, chunk_end_offset, page_number, and page_count
  in search results for rich chunk display (api/management.py:981-1004)
- Add point_id field to SearchResult for batch retrieval (models/semantic.py)
- Fix type narrowing for chunk context API parameters (api/management.py:1102-1111)
- Fix None-safety in doc_types discovery (search/algorithms.py:114)

This enables the Astroglobe UI to display PDF pages at the correct
location for matched chunks.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
Chris Coutinho
2025-12-15 21:31:10 +01:00
parent a026f2eddb
commit 85db90a2df
5 changed files with 504 additions and 35 deletions
+3 -1
View File
@@ -111,7 +111,7 @@ async def get_indexed_doc_types(user_id: str) -> set[str]:
doc_types: set[str] = {
str(point.payload.get("doc_type"))
for point in scroll_results
if point.payload.get("doc_type")
if point.payload and point.payload.get("doc_type")
}
logger.debug(f"Found indexed document types for user {user_id}: {doc_types}")
@@ -138,6 +138,7 @@ class SearchResult:
chunk_start_offset: Character position where chunk starts (None if not available)
chunk_end_offset: Character position where chunk ends (None if not available)
page_number: Page number for PDF documents (None for other doc types)
page_count: Total number of pages in PDF document (None for other doc types)
chunk_index: Zero-based index of this chunk in the document
total_chunks: Total number of chunks in the document
point_id: Qdrant point ID for batch vector retrieval (None if not from Qdrant)
@@ -152,6 +153,7 @@ class SearchResult:
chunk_start_offset: int | None = None
chunk_end_offset: int | None = None
page_number: int | None = None
page_count: int | None = None
chunk_index: int = 0
total_chunks: int = 1
point_id: str | None = None