Add environment variables for Unstructured to control the OCR strategy (quality) and OCR languages

This commit is contained in:
yuisheaven
2025-10-04 05:21:02 +02:00
parent df5f85e0c6
commit c9a687171a
5 changed files with 96 additions and 6 deletions
+2
View File
@@ -65,6 +65,8 @@ services:
- NEXTCLOUD_PASSWORD=admin
- ENABLE_UNSTRUCTURED_PARSING=true
- UNSTRUCTURED_API_URL=http://unstructured:8000
- UNSTRUCTURED_STRATEGY=hi_res
- UNSTRUCTURED_LANGUAGES=deu,eng
#volumes:
#- ./nextcloud_mcp_server:/app/nextcloud_mcp_server:ro
+17
View File
@@ -8,3 +8,20 @@ ENABLE_UNSTRUCTURED_PARSING=true
# Unstructured API endpoint (default for docker-compose setup)
UNSTRUCTURED_API_URL=http://unstructured:8000
# Parsing strategy for the Unstructured service
# Valid values: auto, fast, hi_res
# - auto: Automatically choose the best strategy based on document type (used when this variable is unset)
# - fast: Fast parsing without OCR - best for simple text documents
# - hi_res: High-resolution parsing with OCR - best for scanned documents, images, and complex layouts (set below; also the fallback when an invalid value is given)
UNSTRUCTURED_STRATEGY=hi_res
# Languages for OCR and document parsing (comma-separated ISO 639-3 language codes)
# Default: eng,deu (English and German)
# Common language codes:
# eng = English deu = German fra = French
# spa = Spanish ita = Italian por = Portuguese
# rus = Russian ara = Arabic zho = Chinese
# jpn = Japanese kor = Korean
# Example for English, German, and French: UNSTRUCTURED_LANGUAGES=eng,deu,fra
UNSTRUCTURED_LANGUAGES=eng,deu
@@ -6,7 +6,11 @@ from typing import Optional, Tuple
import httpx
from nextcloud_mcp_server.config import get_unstructured_api_url
from nextcloud_mcp_server.config import (
get_unstructured_api_url,
get_unstructured_languages,
get_unstructured_strategy,
)
logger = logging.getLogger(__name__)
@@ -43,7 +47,7 @@ class UnstructuredClient:
content: bytes,
filename: str,
content_type: Optional[str] = None,
strategy: str = "auto",
strategy: Optional[str] = None,
languages: Optional[list[str]] = None,
extract_image_block_types: Optional[list[str]] = None,
) -> Tuple[str, dict]:
@@ -53,8 +57,10 @@ class UnstructuredClient:
content: The document content as bytes
filename: The filename (used for format detection)
content_type: Optional MIME type
strategy: Parsing strategy - "auto", "fast", or "hi_res" (OCR-based)
languages: List of language codes for OCR (e.g., ["eng", "deu"])
strategy: Parsing strategy - "auto", "fast", or "hi_res" (OCR-based).
If None, uses the value from UNSTRUCTURED_STRATEGY env var.
languages: List of language codes for OCR (e.g., ["eng", "deu"]).
If None, uses the value from UNSTRUCTURED_LANGUAGES env var.
extract_image_block_types: Types of elements to extract from images
Returns:
@@ -66,8 +72,12 @@ class UnstructuredClient:
httpx.HTTPError: If the API request fails
Exception: If parsing fails
"""
# Use environment configuration as defaults
if strategy is None:
strategy = get_unstructured_strategy()
if languages is None:
languages = ["eng"] # Default to English
languages = get_unstructured_languages()
# Prepare the multipart form data
files = {
+60
View File
@@ -60,3 +60,63 @@ def is_unstructured_parsing_enabled() -> bool:
True if enabled, False otherwise.
"""
return os.getenv("ENABLE_UNSTRUCTURED_PARSING", "true").lower() == "true"
def get_unstructured_strategy() -> str:
    """Get the parsing strategy for the Unstructured API.

    The value is read from the ``UNSTRUCTURED_STRATEGY`` environment variable.
    Surrounding whitespace is ignored and the comparison is case-insensitive,
    so values such as ``" Fast "`` or ``"HI_RES"`` are accepted.

    Valid values are:
    - 'auto': Automatically choose the best strategy (used when the variable
      is not set)
    - 'fast': Fast parsing without OCR
    - 'hi_res': High-resolution parsing with OCR for better accuracy

    Returns:
        The parsing strategy to use. If the variable holds an unrecognized
        value, a warning is logged and 'hi_res' is returned.
    """
    valid_strategies = ["auto", "fast", "hi_res"]

    # Normalize before validating: stray whitespace (common in .env files and
    # quoted compose values) must not turn a valid strategy into an invalid one.
    strategy = os.getenv("UNSTRUCTURED_STRATEGY", "auto").strip().lower()

    if strategy not in valid_strategies:
        logging.warning(
            "Invalid UNSTRUCTURED_STRATEGY '%s'. Using 'hi_res'. "
            "Valid options: %s",
            strategy,
            ", ".join(valid_strategies),
        )
        return "hi_res"

    return strategy
def get_unstructured_languages() -> list[str]:
    """Return the OCR language codes configured for the Unstructured API.

    The codes are read from the ``UNSTRUCTURED_LANGUAGES`` environment
    variable as a comma-separated list (e.g. ``eng,deu,fra``). Whitespace
    around each entry is removed and empty entries are dropped. When the
    variable is unset, ``eng,deu`` (English and German) is used.

    Codes are expected to be ISO 639-3 style identifiers, for example:
    eng (English), deu (German), fra (French), spa (Spanish), ita (Italian),
    por (Portuguese), rus (Russian), ara (Arabic), zho (Chinese),
    jpn (Japanese), kor (Korean).

    Returns:
        The list of language codes, in the order given. If the variable
        contains no usable entry, a warning is logged and ``["eng", "deu"]``
        is returned.
    """
    raw_value = os.getenv("UNSTRUCTURED_LANGUAGES", "eng,deu")

    # Collect every non-blank, trimmed entry in its original order.
    codes = []
    for piece in raw_value.split(","):
        code = piece.strip()
        if code:
            codes.append(code)

    if codes:
        return codes

    # Nothing usable was configured (e.g. an empty value or only commas).
    logging.warning("No languages specified in UNSTRUCTURED_LANGUAGES. Using default: eng,deu")
    return ["eng", "deu"]
@@ -86,11 +86,12 @@ async def parse_document(
try:
from nextcloud_mcp_server.client.unstructured_client import UnstructuredClient
client = UnstructuredClient()
# The client will automatically use environment configuration
# (UNSTRUCTURED_STRATEGY and UNSTRUCTURED_LANGUAGES)
return await client.partition_document(
content=content,
filename=filename or f"document.{doc_type}",
content_type=content_type,
strategy="auto"
)
except Exception as e:
logger.error(f"Unstructured API parsing failed: {e}")