feat(news): add Nextcloud News app integration

Add full integration for the Nextcloud News (RSS/Atom reader) app:

- Add NewsClient with complete CRUD operations for folders, feeds, and items
- Add 8 read-only MCP tools for listing/getting folders, feeds, items
- Add Pydantic models for News entities with camelCase alias support
- Add vector sync support for starred + unread items
- Add HTML to Markdown converter using markdownify for better embeddings
- Add Docker post-install hook to enable News app
- Add 25 unit tests for NewsClient API methods

Vector sync indexes starred and unread items, providing a balanced approach that captures important (starred) and current (unread) content without indexing the entire article history.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-29 14:30:23 +01:00
parent 1b1667bc2b
commit a33f6a2f15
15 changed files with 2055 additions and 7 deletions
@@ -0,0 +1,49 @@
+"""HTML to Markdown conversion utilities for vector sync."""
+
+import logging
+
+from markdownify import markdownify as md
+
+logger = logging.getLogger(__name__)
+
+
+def html_to_markdown(html_content: str | None) -> str:
+    """Convert HTML content to Markdown, preserving semantic structure.
+
+    This function converts HTML (typically from RSS/Atom feed items) to Markdown
+    for better text embedding. Markdown preserves:
+    - Heading hierarchy (important for document structure)
+    - Lists (bullet and numbered)
+    - Links (as [text](url))
+    - Bold/italic emphasis
+    - Paragraphs and line breaks
+
+    Args:
+        html_content: HTML string to convert (may be None or empty)
+
+    Returns:
+        Markdown string, or empty string if input is None/empty
+
+    Example:
+        >>> html_to_markdown("<h1>Title</h1><p>Content with <b>bold</b>.</p>")
+        '# Title\\n\\nContent with **bold**.\\n\\n'
+    """
+    if not html_content:
+        return ""
+
+    try:
+        markdown = md(
+            html_content,
+            heading_style="ATX",  # Use # style headings
+            strip=["script", "style", "iframe", "noscript"],  # Remove unsafe elements
+            bullets="-",  # Use - for unordered lists
+            code_language="",  # Don't add language hints to code blocks
+        )
+        return markdown.strip()
+    except Exception as e:
+        logger.warning(f"Failed to convert HTML to Markdown: {e}")
+        # Fallback: strip all HTML tags as a last resort
+        import re
+
+        text = re.sub(r"<[^>]+>", " ", html_content)
+        return " ".join(text.split())  # Normalize whitespace
@@ -272,6 +272,45 @@ async def _index_document(
            file_path = None  # Notes don't have file paths
            content_bytes = None  # Notes don't have binary content
            content_type = None
+        elif doc_task.doc_type == "news_item":
+            from nextcloud_mcp_server.vector.html_processor import html_to_markdown
+
+            item = await nc_client.news.get_item(int(doc_task.doc_id))
+            # Convert HTML body to Markdown for better embedding
+            body_markdown = html_to_markdown(item.get("body", ""))
+            # Build content: title + URL + body
+            item_title = item.get("title", "")
+            item_url = item.get("url", "")
+            feed_title = item.get("feedTitle", "")
+
+            # Structure content for embedding
+            content_parts = [item_title]
+            if feed_title:
+                content_parts.append(f"Source: {feed_title}")
+            if item_url:
+                content_parts.append(f"URL: {item_url}")
+            content_parts.append("")  # Blank line
+            content_parts.append(body_markdown)
+            content = "\n".join(content_parts)
+
+            title = item_title
+            etag = item.get("guidHash", "")
+            # Store news-specific metadata for later use in payload
+            file_metadata = {
+                "feed_id": item.get("feedId"),
+                "feed_title": feed_title,
+                "author": item.get("author"),
+                "pub_date": item.get("pubDate"),
+                "starred": item.get("starred", False),
+                "unread": item.get("unread", True),
+                "url": item_url,
+                "guid_hash": item.get("guidHash"),
+                "enclosure_link": item.get("enclosureLink"),
+                "enclosure_mime": item.get("enclosureMime"),
+            }
+            file_path = None
+            content_bytes = None
+            content_type = None
        elif doc_task.doc_type == "file":
            # For files, doc_id is now the numeric file ID, file_path comes from DocumentTask
            if not doc_task.file_path:
@@ -358,15 +397,16 @@ async def _index_document(
        chunks = await chunker.chunk_text(content)

    # Assign page numbers to chunks if page boundaries are available (PDFs)
-    if doc_task.doc_type == "file" and "page_boundaries" in file_metadata:
+    page_boundaries = file_metadata.get("page_boundaries")
+    if doc_task.doc_type == "file" and page_boundaries is not None:
        with trace_operation(
            "vector_sync.assign_page_numbers",
            attributes={
                "vector_sync.chunk_count": len(chunks),
-                "vector_sync.page_count": len(file_metadata["page_boundaries"]),
+                "vector_sync.page_count": len(page_boundaries),
            },
        ):
-            assign_page_numbers(chunks, file_metadata["page_boundaries"])
+            assign_page_numbers(chunks, page_boundaries)

            # Diagnostic: Verify page number assignment
            assigned_count = sum(1 for c in chunks if c.page_number is not None)
@@ -389,8 +429,8 @@ async def _index_document(
                    f"Text length: {len(content)}, "
                    f"Chunks: {len(chunks)}, "
                    f"Chunk offset range: [{chunks[0].start_offset}:{chunks[-1].end_offset}], "
-                    f"Page boundaries: {len(file_metadata['page_boundaries'])} pages, "
-                    f"First boundary: {file_metadata['page_boundaries'][0] if file_metadata['page_boundaries'] else 'None'}"
+                    f"Page boundaries: {len(page_boundaries)} pages, "
+                    f"First boundary: {page_boundaries[0] if page_boundaries else 'None'}"
                )

    # Extract chunk texts for embedding
@@ -566,6 +606,23 @@ async def _index_document(
                        if doc_task.doc_type == "file"
                        else {}
                    ),
+                    # News item-specific metadata
+                    **(
+                        {
+                            "feed_id": file_metadata.get("feed_id"),
+                            "feed_title": file_metadata.get("feed_title"),
+                            "author": file_metadata.get("author"),
+                            "pub_date": file_metadata.get("pub_date"),
+                            "starred": file_metadata.get("starred"),
+                            "unread": file_metadata.get("unread"),
+                            "url": file_metadata.get("url"),
+                            "guid_hash": file_metadata.get("guid_hash"),
+                            "enclosure_link": file_metadata.get("enclosure_link"),
+                            "enclosure_mime": file_metadata.get("enclosure_mime"),
+                        }
+                        if doc_task.doc_type == "news_item"
+                        else {}
+                    ),
                    # Highlighted page image (PDF only)
                    **(
                        {
@@ -544,9 +544,217 @@ async def scan_user_documents(

        queued += file_queued

+        # Scan News items (starred + unread)
+        news_queued = 0
+        try:
+            news_queued = await scan_news_items(
+                user_id=user_id,
+                send_stream=send_stream,
+                nc_client=nc_client,
+                initial_sync=initial_sync,
+                scan_id=scan_id,
+            )
+            queued += news_queued
+        except Exception as e:
+            logger.warning(f"Failed to scan news items for {user_id}: {e}")
+
        if queued > 0:
            logger.info(
-                f"Sent {queued} documents ({file_queued} files) for incremental sync: {user_id}"
+                f"Sent {queued} documents ({file_queued} files, {news_queued} news items) for incremental sync: {user_id}"
            )
        else:
            logger.debug(f"No changes detected for {user_id}")
+
+
+async def scan_news_items(
+    user_id: str,
+    send_stream: MemoryObjectSendStream[DocumentTask],
+    nc_client: NextcloudClient,
+    initial_sync: bool,
+    scan_id: int,
+) -> int:
+    """
+    Scan user's News items (starred + unread) and queue changed items.
+
+    Indexes starred and unread items for semantic search. This provides
+    a balanced approach - important items (starred) and current items
+    (unread) are searchable, while avoiding indexing the entire history.
+
+    Args:
+        user_id: User to scan
+        send_stream: Stream to send changed documents to processors
+        nc_client: Authenticated Nextcloud client
+        initial_sync: If True, send all documents (first-time sync)
+        scan_id: Scan identifier for logging
+
+    Returns:
+        Number of items queued for processing
+    """
+    from nextcloud_mcp_server.client.news import NewsItemType
+
+    settings = get_settings()
+    queued = 0
+
+    # Get indexed news item IDs from Qdrant (for deletion tracking)
+    indexed_item_ids: set[str] = set()
+    if not initial_sync:
+        qdrant_client = await get_qdrant_client()
+        scroll_result = await qdrant_client.scroll(
+            collection_name=settings.get_collection_name(),
+            scroll_filter=Filter(
+                must=[
+                    FieldCondition(key="user_id", match=MatchValue(value=user_id)),
+                    FieldCondition(key="doc_type", match=MatchValue(value="news_item")),
+                ]
+            ),
+            with_payload=["doc_id"],
+            with_vectors=False,
+            limit=10000,
+        )
+        indexed_item_ids = {point.payload["doc_id"] for point in scroll_result[0]}
+        logger.debug(f"Found {len(indexed_item_ids)} indexed news items in Qdrant")
+
+    # Fetch starred items (type=STARRED)
+    starred_items = await nc_client.news.get_items(
+        batch_size=-1,  # Get all
+        type_=NewsItemType.STARRED,
+        get_read=True,  # Include read starred items
+    )
+    logger.debug(f"[SCAN-{scan_id}] Found {len(starred_items)} starred news items")
+
+    # Fetch unread items (type=ALL, get_read=False)
+    unread_items = await nc_client.news.get_items(
+        batch_size=-1,
+        type_=NewsItemType.ALL,
+        get_read=False,  # Only unread
+    )
+    logger.debug(f"[SCAN-{scan_id}] Found {len(unread_items)} unread news items")
+
+    # Combine and deduplicate (an item can be both starred and unread)
+    items_by_id: dict[int, dict] = {}
+    for item in starred_items:
+        items_by_id[item["id"]] = item
+    for item in unread_items:
+        items_by_id[item["id"]] = item
+
+    item_count = len(items_by_id)
+    nextcloud_item_ids: set[str] = set()
+
+    for item_id, item in items_by_id.items():
+        doc_id = str(item_id)
+        nextcloud_item_ids.add(doc_id)
+
+        # Use lastModified timestamp (microseconds in News API)
+        modified_at = item.get("lastModified", 0)
+        # Convert to seconds if needed (News API uses microseconds)
+        if modified_at > 10000000000:  # > year 2286 in seconds
+            modified_at = modified_at // 1000000
+
+        if initial_sync:
+            # Send everything on first sync - write placeholder first
+            await write_placeholder_point(
+                doc_id=doc_id,
+                doc_type="news_item",
+                user_id=user_id,
+                modified_at=modified_at,
+            )
+            await send_stream.send(
+                DocumentTask(
+                    user_id=user_id,
+                    doc_id=doc_id,
+                    doc_type="news_item",
+                    operation="index",
+                    modified_at=modified_at,
+                )
+            )
+            queued += 1
+        else:
+            # Incremental sync: check if item exists and compare modified_at
+            doc_key = (user_id, doc_id)
+            if doc_key in _potentially_deleted:
+                logger.debug(
+                    f"News item {doc_id} reappeared, removing from deletion grace period"
+                )
+                del _potentially_deleted[doc_key]
+
+            # Query Qdrant for existing entry
+            existing_metadata = await query_document_metadata(
+                doc_id=doc_id, doc_type="news_item", user_id=user_id
+            )
+
+            needs_indexing = False
+            if existing_metadata is None:
+                needs_indexing = True
+            elif existing_metadata.get("modified_at", 0) < modified_at:
+                needs_indexing = True
+            elif existing_metadata.get("is_placeholder", False):
+                queued_at = existing_metadata.get("queued_at", 0)
+                placeholder_age = time.time() - queued_at
+                stale_threshold = settings.vector_sync_scan_interval * 5
+                if placeholder_age > stale_threshold:
+                    logger.debug(
+                        f"Found stale placeholder for news item {doc_id} "
+                        f"(age={placeholder_age:.1f}s), requeuing"
+                    )
+                    needs_indexing = True
+
+            if needs_indexing:
+                await write_placeholder_point(
+                    doc_id=doc_id,
+                    doc_type="news_item",
+                    user_id=user_id,
+                    modified_at=modified_at,
+                )
+                await send_stream.send(
+                    DocumentTask(
+                        user_id=user_id,
+                        doc_id=doc_id,
+                        doc_type="news_item",
+                        operation="index",
+                        modified_at=modified_at,
+                    )
+                )
+                queued += 1
+
+    logger.info(
+        f"[SCAN-{scan_id}] Found {item_count} news items (starred+unread) for {user_id}"
+    )
+    record_vector_sync_scan(item_count)
+
+    # Check for deleted items (not initial sync)
+    # Items become "deleted" when they are no longer starred AND become read
+    if not initial_sync:
+        grace_period = settings.vector_sync_scan_interval * 1.5
+        current_time = time.time()
+
+        for doc_id in indexed_item_ids:
+            if doc_id not in nextcloud_item_ids:
+                doc_key = (user_id, doc_id)
+
+                if doc_key in _potentially_deleted:
+                    first_missing_time = _potentially_deleted[doc_key]
+                    time_missing = current_time - first_missing_time
+
+                    if time_missing >= grace_period:
+                        logger.info(
+                            f"News item {doc_id} missing for {time_missing:.1f}s "
+                            f"(>{grace_period:.1f}s grace period), sending deletion"
+                        )
+                        await send_stream.send(
+                            DocumentTask(
+                                user_id=user_id,
+                                doc_id=doc_id,
+                                doc_type="news_item",
+                                operation="delete",
+                                modified_at=0,
+                            )
+                        )
+                        queued += 1
+                        del _potentially_deleted[doc_key]
+                else:
+                    logger.debug(
+                        f"News item {doc_id} missing for first time, starting grace period"
+                    )
+                    _potentially_deleted[doc_key] = current_time
+
+    return queued