feat(news): add Nextcloud News app integration

Add full integration for the Nextcloud News (RSS/Atom reader) app: - Add NewsClient with complete CRUD operations for folders, feeds, and items - Add 8 read-only MCP tools for listing/getting folders, feeds, items - Add Pydantic models for News entities with camelCase alias support - Add vector sync support for starred + unread items - Add HTML to Markdown converter using markdownify for better embeddings - Add Docker post-install hook to enable News app - Add 25 unit tests for NewsClient API methods Vector sync indexes starred and unread items, providing a balanced approach that captures important (starred) and current (unread) content without indexing the entire article history. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-29 14:30:23 +01:00
parent 1b1667bc2b
commit a33f6a2f15
15 changed files with 2055 additions and 7 deletions
@@ -272,6 +272,45 @@ async def _index_document(
            file_path = None  # Notes don't have file paths
            content_bytes = None  # Notes don't have binary content
            content_type = None
+        elif doc_task.doc_type == "news_item":
+            from nextcloud_mcp_server.vector.html_processor import html_to_markdown
+
+            item = await nc_client.news.get_item(int(doc_task.doc_id))
+            # Convert HTML body to Markdown for better embedding
+            body_markdown = html_to_markdown(item.get("body", ""))
+            # Build content: title + URL + body
+            item_title = item.get("title", "")
+            item_url = item.get("url", "")
+            feed_title = item.get("feedTitle", "")
+
+            # Structure content for embedding
+            content_parts = [item_title]
+            if feed_title:
+                content_parts.append(f"Source: {feed_title}")
+            if item_url:
+                content_parts.append(f"URL: {item_url}")
+            content_parts.append("")  # Blank line
+            content_parts.append(body_markdown)
+            content = "\n".join(content_parts)
+
+            title = item_title
+            etag = item.get("guidHash", "")
+            # Store news-specific metadata for later use in payload
+            file_metadata = {
+                "feed_id": item.get("feedId"),
+                "feed_title": feed_title,
+                "author": item.get("author"),
+                "pub_date": item.get("pubDate"),
+                "starred": item.get("starred", False),
+                "unread": item.get("unread", True),
+                "url": item_url,
+                "guid_hash": item.get("guidHash"),
+                "enclosure_link": item.get("enclosureLink"),
+                "enclosure_mime": item.get("enclosureMime"),
+            }
+            file_path = None
+            content_bytes = None
+            content_type = None
        elif doc_task.doc_type == "file":
            # For files, doc_id is now the numeric file ID, file_path comes from DocumentTask
            if not doc_task.file_path:
@@ -358,15 +397,16 @@ async def _index_document(
        chunks = await chunker.chunk_text(content)

    # Assign page numbers to chunks if page boundaries are available (PDFs)
-    if doc_task.doc_type == "file" and "page_boundaries" in file_metadata:
+    page_boundaries = file_metadata.get("page_boundaries")
+    if doc_task.doc_type == "file" and page_boundaries is not None:
        with trace_operation(
            "vector_sync.assign_page_numbers",
            attributes={
                "vector_sync.chunk_count": len(chunks),
-                "vector_sync.page_count": len(file_metadata["page_boundaries"]),
+                "vector_sync.page_count": len(page_boundaries),
            },
        ):
-            assign_page_numbers(chunks, file_metadata["page_boundaries"])
+            assign_page_numbers(chunks, page_boundaries)

            # Diagnostic: Verify page number assignment
            assigned_count = sum(1 for c in chunks if c.page_number is not None)
@@ -389,8 +429,8 @@ async def _index_document(
                    f"Text length: {len(content)}, "
                    f"Chunks: {len(chunks)}, "
                    f"Chunk offset range: [{chunks[0].start_offset}:{chunks[-1].end_offset}], "
-                    f"Page boundaries: {len(file_metadata['page_boundaries'])} pages, "
-                    f"First boundary: {file_metadata['page_boundaries'][0] if file_metadata['page_boundaries'] else 'None'}"
+                    f"Page boundaries: {len(page_boundaries)} pages, "
+                    f"First boundary: {page_boundaries[0] if page_boundaries else 'None'}"
                )

    # Extract chunk texts for embedding
@@ -566,6 +606,23 @@ async def _index_document(
                        if doc_task.doc_type == "file"
                        else {}
                    ),
+                    # News item-specific metadata
+                    **(
+                        {
+                            "feed_id": file_metadata.get("feed_id"),
+                            "feed_title": file_metadata.get("feed_title"),
+                            "author": file_metadata.get("author"),
+                            "pub_date": file_metadata.get("pub_date"),
+                            "starred": file_metadata.get("starred"),
+                            "unread": file_metadata.get("unread"),
+                            "url": file_metadata.get("url"),
+                            "guid_hash": file_metadata.get("guid_hash"),
+                            "enclosure_link": file_metadata.get("enclosure_link"),
+                            "enclosure_mime": file_metadata.get("enclosure_mime"),
+                        }
+                        if doc_task.doc_type == "news_item"
+                        else {}
+                    ),
                    # Highlighted page image (PDF only)
                    **(
                        {