From 35f3824932833fe77ef3bce54b86803cda4838a6 Mon Sep 17 00:00:00 2001
From: Mazurek Michal <michal.mazurek@a1.at>
Date: Fri, 7 Feb 2025 13:44:47 +0100
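Subject: [PATCH] feat: Implement Document Intelligence as Content Extraction
 Engine

Add Azure AI Document Intelligence as a selectable content extraction
engine alongside the default loaders and Tika.

- New persistent settings DOCUMENT_INTELLIGENCE_ENDPOINT and
  DOCUMENT_INTELLIGENCE_KEY, seeded from environment variables of the
  same name and defaulting to an empty string.
- The RAG config API returns and accepts them under
  content_extraction.document_intelligence_config, and the admin
  Documents settings page gains inputs for both values.
- When the engine is "document_intelligence" and both settings are
  non-empty, pdf/xls/xlsx/docx/ppt/pptx files are loaded with langchain's
  AzureAIDocumentIntelligenceLoader; everything else falls through to the
  built-in loaders.
- Adds the azure-ai-documentintelligence==1.0.0 dependency.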
---
 backend/open_webui/config.py                  | 12 +++++++
 backend/open_webui/main.py                    |  4 +++
 backend/open_webui/retrieval/loaders/main.py  | 22 ++++++++++++
 backend/open_webui/routers/retrieval.py       | 27 +++++++++++++-
 backend/requirements.txt                      |  1 +
 pyproject.toml                                |  1 +
 src/lib/apis/retrieval/index.ts               |  6 ++++
 .../admin/Settings/Documents.svelte           | 36 ++++++++++++++++++-
 8 files changed, 107 insertions(+), 2 deletions(-)

diff --git a/backend/open_webui/config.py b/backend/open_webui/config.py
index bf6f1d025..e46a87cd5 100644
--- a/backend/open_webui/config.py
+++ b/backend/open_webui/config.py
@@ -1431,6 +1431,18 @@ TIKA_SERVER_URL = PersistentConfig(
     os.getenv("TIKA_SERVER_URL", "http://tika:9998"),  # Default for sidecar deployment
 )
 
+DOCUMENT_INTELLIGENCE_ENDPOINT = PersistentConfig(
+    "DOCUMENT_INTELLIGENCE_ENDPOINT",
+    "rag.document_intelligence_endpoint",
+    os.getenv("DOCUMENT_INTELLIGENCE_ENDPOINT", ""),  # "" = engine not configured
+)
+
+DOCUMENT_INTELLIGENCE_KEY = PersistentConfig(
+    "DOCUMENT_INTELLIGENCE_KEY",
+    "rag.document_intelligence_key",
+    os.getenv("DOCUMENT_INTELLIGENCE_KEY", ""),  # "" = engine not configured
+)
+
 RAG_TOP_K = PersistentConfig(
     "RAG_TOP_K", "rag.top_k", int(os.environ.get("RAG_TOP_K", "3"))
 )
diff --git a/backend/open_webui/main.py b/backend/open_webui/main.py
index 863f58dea..2f1b92b1d 100644
--- a/backend/open_webui/main.py
+++ b/backend/open_webui/main.py
@@ -154,6 +154,8 @@ from open_webui.config import (
     CHUNK_SIZE,
     CONTENT_EXTRACTION_ENGINE,
     TIKA_SERVER_URL,
+    DOCUMENT_INTELLIGENCE_ENDPOINT,
+    DOCUMENT_INTELLIGENCE_KEY,
     RAG_TOP_K,
     RAG_TEXT_SPLITTER,
     TIKTOKEN_ENCODING_NAME,
@@ -478,6 +480,8 @@ app.state.config.ENABLE_RAG_WEB_LOADER_SSL_VERIFICATION = (
 
 app.state.config.CONTENT_EXTRACTION_ENGINE = CONTENT_EXTRACTION_ENGINE
 app.state.config.TIKA_SERVER_URL = TIKA_SERVER_URL
+app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT = DOCUMENT_INTELLIGENCE_ENDPOINT
+app.state.config.DOCUMENT_INTELLIGENCE_KEY = DOCUMENT_INTELLIGENCE_KEY
 
 app.state.config.TEXT_SPLITTER = RAG_TEXT_SPLITTER
 app.state.config.TIKTOKEN_ENCODING_NAME = TIKTOKEN_ENCODING_NAME
diff --git a/backend/open_webui/retrieval/loaders/main.py b/backend/open_webui/retrieval/loaders/main.py
index a9372f65a..19d590f5c 100644
--- a/backend/open_webui/retrieval/loaders/main.py
+++ b/backend/open_webui/retrieval/loaders/main.py
@@ -4,6 +4,7 @@ import ftfy
 import sys
 
 from langchain_community.document_loaders import (
+    AzureAIDocumentIntelligenceLoader,
     BSHTMLLoader,
     CSVLoader,
     Docx2txtLoader,
@@ -147,6 +148,27 @@ class Loader:
                     file_path=file_path,
                     mime_type=file_content_type,
                 )
+        elif (
+            self.engine == "document_intelligence"
+            and self.kwargs.get("DOCUMENT_INTELLIGENCE_ENDPOINT")  # set and non-empty
+            and self.kwargs.get("DOCUMENT_INTELLIGENCE_KEY")  # set and non-empty
+            and (
+                file_ext in ["pdf", "xls", "xlsx", "docx", "ppt", "pptx"]
+                or file_content_type
+                in [
+                    "application/vnd.ms-excel",
+                    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+                    "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+                    "application/vnd.ms-powerpoint",
+                    "application/vnd.openxmlformats-officedocument.presentationml.presentation",
+                ]
+            )
+        ):
+            loader = AzureAIDocumentIntelligenceLoader(
+                file_path=file_path,
+                api_endpoint=self.kwargs.get("DOCUMENT_INTELLIGENCE_ENDPOINT"),
+                api_key=self.kwargs.get("DOCUMENT_INTELLIGENCE_KEY"),
+            )
         else:
             if file_ext == "pdf":
                 loader = PyPDFLoader(
diff --git a/backend/open_webui/routers/retrieval.py b/backend/open_webui/routers/retrieval.py
index 77f04a4be..4cfcd490d 100644
--- a/backend/open_webui/routers/retrieval.py
+++ b/backend/open_webui/routers/retrieval.py
@@ -352,6 +352,10 @@ async def get_rag_config(request: Request, user=Depends(get_admin_user)):
         "content_extraction": {
             "engine": request.app.state.config.CONTENT_EXTRACTION_ENGINE,
             "tika_server_url": request.app.state.config.TIKA_SERVER_URL,
+            "document_intelligence_config": {
+                "endpoint": request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT,
+                "key": request.app.state.config.DOCUMENT_INTELLIGENCE_KEY,
+            },
         },
         "chunk": {
             "text_splitter": request.app.state.config.TEXT_SPLITTER,
@@ -402,9 +406,15 @@ class FileConfig(BaseModel):
     max_count: Optional[int] = None
 
 
+class DocumentIntelligenceConfigForm(BaseModel):
+    endpoint: str  # stored as DOCUMENT_INTELLIGENCE_ENDPOINT by update_rag_config
+    key: str  # stored as DOCUMENT_INTELLIGENCE_KEY by update_rag_config
+
+
 class ContentExtractionConfig(BaseModel):
     engine: str = ""
     tika_server_url: Optional[str] = None
+    document_intelligence_config: Optional[DocumentIntelligenceConfigForm] = None
 
 
 class ChunkParamUpdateForm(BaseModel):
@@ -479,13 +489,22 @@ async def update_rag_config(
         request.app.state.config.FILE_MAX_COUNT = form_data.file.max_count
 
     if form_data.content_extraction is not None:
-        log.info(f"Updating text settings: {form_data.content_extraction}")
+        log.info(
+            f"Updating content extraction: {request.app.state.config.CONTENT_EXTRACTION_ENGINE} to {form_data.content_extraction.engine}"
+        )
         request.app.state.config.CONTENT_EXTRACTION_ENGINE = (
             form_data.content_extraction.engine
         )
         request.app.state.config.TIKA_SERVER_URL = (
             form_data.content_extraction.tika_server_url
         )
+        if form_data.content_extraction.document_intelligence_config is not None:
+            request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT = (
+                form_data.content_extraction.document_intelligence_config.endpoint
+            )
+            request.app.state.config.DOCUMENT_INTELLIGENCE_KEY = (
+                form_data.content_extraction.document_intelligence_config.key
+            )
 
     if form_data.chunk is not None:
         request.app.state.config.TEXT_SPLITTER = form_data.chunk.text_splitter
@@ -564,6 +583,10 @@ async def update_rag_config(
         "content_extraction": {
             "engine": request.app.state.config.CONTENT_EXTRACTION_ENGINE,
             "tika_server_url": request.app.state.config.TIKA_SERVER_URL,
+            "document_intelligence_config": {
+                "endpoint": request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT,
+                "key": request.app.state.config.DOCUMENT_INTELLIGENCE_KEY,
+            },
         },
         "chunk": {
             "text_splitter": request.app.state.config.TEXT_SPLITTER,
@@ -887,6 +910,8 @@ def process_file(
                     engine=request.app.state.config.CONTENT_EXTRACTION_ENGINE,
                     TIKA_SERVER_URL=request.app.state.config.TIKA_SERVER_URL,
                     PDF_EXTRACT_IMAGES=request.app.state.config.PDF_EXTRACT_IMAGES,
+                    DOCUMENT_INTELLIGENCE_ENDPOINT=request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT,
+                    DOCUMENT_INTELLIGENCE_KEY=request.app.state.config.DOCUMENT_INTELLIGENCE_KEY,
                 )
                 docs = loader.load(
                     file.filename, file.meta.get("content_type"), file_path
diff --git a/backend/requirements.txt b/backend/requirements.txt
index 14ad4b9cd..4a39e77b5 100644
--- a/backend/requirements.txt
+++ b/backend/requirements.txt
@@ -72,6 +72,7 @@ validators==0.34.0
 psutil
 sentencepiece
 soundfile==0.13.1
+azure-ai-documentintelligence==1.0.0
 
 opencv-python-headless==4.11.0.86
 rapidocr-onnxruntime==1.3.24
diff --git a/pyproject.toml b/pyproject.toml
index f121089e8..60d54afd6 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -77,6 +77,7 @@ dependencies = [
     "psutil",
     "sentencepiece",
     "soundfile==0.13.1",
+    "azure-ai-documentintelligence==1.0.0",
 
     "opencv-python-headless==4.11.0.86",
     "rapidocr-onnxruntime==1.3.24",
diff --git a/src/lib/apis/retrieval/index.ts b/src/lib/apis/retrieval/index.ts
index c35c37847..ed07ab5d0 100644
--- a/src/lib/apis/retrieval/index.ts
+++ b/src/lib/apis/retrieval/index.ts
@@ -32,9 +32,15 @@ type ChunkConfigForm = {
 	chunk_overlap: number;
 };
 
+type DocumentIntelligenceConfigForm = {
+	key: string; // sent as content_extraction.document_intelligence_config.key
+	endpoint: string; // sent as content_extraction.document_intelligence_config.endpoint
+};
+
 type ContentExtractConfigForm = {
 	engine: string;
 	tika_server_url: string | null;
+	document_intelligence_config: DocumentIntelligenceConfigForm | null;
 };
 
 type YoutubeConfigForm = {
diff --git a/src/lib/components/admin/Settings/Documents.svelte b/src/lib/components/admin/Settings/Documents.svelte
index d3b7cfa01..e624a51b3 100644
--- a/src/lib/components/admin/Settings/Documents.svelte
+++ b/src/lib/components/admin/Settings/Documents.svelte
@@ -50,6 +50,9 @@
 	let contentExtractionEngine = 'default';
 	let tikaServerUrl = '';
 	let showTikaServerUrl = false;
+	let documentIntelligenceEndpoint = '';
+	let documentIntelligenceKey = '';
+	let showDocumentIntelligenceConfig = false;
 
 	let textSplitter = '';
 	let chunkSize = 0;
@@ -175,6 +178,13 @@
 			toast.error($i18n.t('Tika Server URL required.'));
 			return;
 		}
+		if (
+			contentExtractionEngine === 'document_intelligence' &&
+			(documentIntelligenceEndpoint === '' || documentIntelligenceKey === '')
+		) {
+			toast.error($i18n.t('Document Intelligence endpoint and key required.'));
+			return;
+		}
 		const res = await updateRAGConfig(localStorage.token, {
 			pdf_extract_images: pdfExtractImages,
 			enable_google_drive_integration: enableGoogleDriveIntegration,
@@ -189,7 +199,11 @@
 			},
 			content_extraction: {
 				engine: contentExtractionEngine,
-				tika_server_url: tikaServerUrl
+				tika_server_url: tikaServerUrl,
+				document_intelligence_config: {
+					key: documentIntelligenceKey,
+					endpoint: documentIntelligenceEndpoint
+				}
 			}
 		});
 
@@ -245,6 +259,9 @@
 			contentExtractionEngine = res.content_extraction.engine;
 			tikaServerUrl = res.content_extraction.tika_server_url;
 			showTikaServerUrl = contentExtractionEngine === 'tika';
+			documentIntelligenceEndpoint = res.content_extraction.document_intelligence_config.endpoint;
+			documentIntelligenceKey = res.content_extraction.document_intelligence_config.key;
+			showDocumentIntelligenceConfig = contentExtractionEngine === 'document_intelligence';
 
 			fileMaxSize = res?.file.max_size ?? '';
 			fileMaxCount = res?.file.max_count ?? '';
@@ -568,10 +585,12 @@
 						bind:value={contentExtractionEngine}
 						on:change={(e) => {
 							showTikaServerUrl = e.target.value === 'tika';
+							showDocumentIntelligenceConfig = e.target.value === 'document_intelligence';
 						}}
 					>
 						<option value="">{$i18n.t('Default')} </option>
 						<option value="tika">{$i18n.t('Tika')}</option>
+						<option value="document_intelligence">{$i18n.t('Document Intelligence')}</option>
 					</select>
 				</div>
 			</div>
@@ -587,6 +606,21 @@
 					</div>
 				</div>
 			{/if}
+
+			{#if showDocumentIntelligenceConfig}
+				<div class="my-0.5 flex gap-2 pr-2">
+					<input
+						class="flex-1 w-full rounded-lg py-2 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-none"
+						placeholder={$i18n.t('Enter Document Intelligence Endpoint')}
+						bind:value={documentIntelligenceEndpoint}
+					/>
+
+					<SensitiveInput
+						placeholder={$i18n.t('Enter Document Intelligence Key')}
+						bind:value={documentIntelligenceKey}
+					/>
+				</div>
+			{/if}
 		</div>
 
 		<hr class=" dark:border-gray-850" />