Added environment variables for unstructured to control OCR quality and OCR languages

This commit is contained in:
yuisheaven
2025-10-04 05:21:02 +02:00
parent df5f85e0c6
commit c9a687171a
5 changed files with 96 additions and 6 deletions
@@ -6,7 +6,11 @@ from typing import Optional, Tuple
import httpx
from nextcloud_mcp_server.config import get_unstructured_api_url
from nextcloud_mcp_server.config import (
get_unstructured_api_url,
get_unstructured_languages,
get_unstructured_strategy,
)
logger = logging.getLogger(__name__)
@@ -43,7 +47,7 @@ class UnstructuredClient:
content: bytes,
filename: str,
content_type: Optional[str] = None,
strategy: str = "auto",
strategy: Optional[str] = None,
languages: Optional[list[str]] = None,
extract_image_block_types: Optional[list[str]] = None,
) -> Tuple[str, dict]:
@@ -53,8 +57,10 @@ class UnstructuredClient:
content: The document content as bytes
filename: The filename (used for format detection)
content_type: Optional MIME type
strategy: Parsing strategy - "auto", "fast", or "hi_res" (OCR-based)
languages: List of language codes for OCR (e.g., ["eng", "deu"])
strategy: Parsing strategy - "auto", "fast", or "hi_res" (OCR-based).
If None, uses the value from UNSTRUCTURED_STRATEGY env var.
languages: List of language codes for OCR (e.g., ["eng", "deu"]).
If None, uses the value from UNSTRUCTURED_LANGUAGES env var.
extract_image_block_types: Types of elements to extract from images
Returns:
@@ -66,8 +72,12 @@ class UnstructuredClient:
httpx.HTTPError: If the API request fails
Exception: If parsing fails
"""
# Use environment configuration as defaults
if strategy is None:
strategy = get_unstructured_strategy()
if languages is None:
languages = ["eng"] # Default to English
languages = get_unstructured_languages()
# Prepare the multipart form data
files = {