From 35f3824932833fe77ef3bce54b86803cda4838a6 Mon Sep 17 00:00:00 2001 From: Mazurek Michal Date: Fri, 7 Feb 2025 13:44:47 +0100 Subject: [PATCH] feat: Implement Document Intelligence as Content Extraction Engine --- backend/open_webui/config.py | 12 +++++++ backend/open_webui/main.py | 4 +++ backend/open_webui/retrieval/loaders/main.py | 22 ++++++++++++ backend/open_webui/routers/retrieval.py | 27 +++++++++++++- backend/requirements.txt | 1 + pyproject.toml | 1 + src/lib/apis/retrieval/index.ts | 6 ++++ .../admin/Settings/Documents.svelte | 36 ++++++++++++++++++- 8 files changed, 107 insertions(+), 2 deletions(-) diff --git a/backend/open_webui/config.py b/backend/open_webui/config.py index bf6f1d025..e46a87cd5 100644 --- a/backend/open_webui/config.py +++ b/backend/open_webui/config.py @@ -1431,6 +1431,18 @@ TIKA_SERVER_URL = PersistentConfig( os.getenv("TIKA_SERVER_URL", "http://tika:9998"), # Default for sidecar deployment ) +DOCUMENT_INTELLIGENCE_ENDPOINT = PersistentConfig( + "DOCUMENT_INTELLIGENCE_ENDPOINT", + "rag.document_intelligence_endpoint", + os.getenv("DOCUMENT_INTELLIGENCE_ENDPOINT", ""), +) + +DOCUMENT_INTELLIGENCE_KEY = PersistentConfig( + "DOCUMENT_INTELLIGENCE_KEY", + "rag.document_intelligence_key", + os.getenv("DOCUMENT_INTELLIGENCE_KEY", ""), +) + RAG_TOP_K = PersistentConfig( "RAG_TOP_K", "rag.top_k", int(os.environ.get("RAG_TOP_K", "3")) ) diff --git a/backend/open_webui/main.py b/backend/open_webui/main.py index 863f58dea..2f1b92b1d 100644 --- a/backend/open_webui/main.py +++ b/backend/open_webui/main.py @@ -154,6 +154,8 @@ from open_webui.config import ( CHUNK_SIZE, CONTENT_EXTRACTION_ENGINE, TIKA_SERVER_URL, + DOCUMENT_INTELLIGENCE_ENDPOINT, + DOCUMENT_INTELLIGENCE_KEY, RAG_TOP_K, RAG_TEXT_SPLITTER, TIKTOKEN_ENCODING_NAME, @@ -478,6 +480,8 @@ app.state.config.ENABLE_RAG_WEB_LOADER_SSL_VERIFICATION = ( app.state.config.CONTENT_EXTRACTION_ENGINE = CONTENT_EXTRACTION_ENGINE app.state.config.TIKA_SERVER_URL = TIKA_SERVER_URL +app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT = DOCUMENT_INTELLIGENCE_ENDPOINT +app.state.config.DOCUMENT_INTELLIGENCE_KEY = DOCUMENT_INTELLIGENCE_KEY app.state.config.TEXT_SPLITTER = RAG_TEXT_SPLITTER app.state.config.TIKTOKEN_ENCODING_NAME = TIKTOKEN_ENCODING_NAME diff --git a/backend/open_webui/retrieval/loaders/main.py b/backend/open_webui/retrieval/loaders/main.py index a9372f65a..19d590f5c 100644 --- a/backend/open_webui/retrieval/loaders/main.py +++ b/backend/open_webui/retrieval/loaders/main.py @@ -4,6 +4,7 @@ import ftfy import sys from langchain_community.document_loaders import ( + AzureAIDocumentIntelligenceLoader, BSHTMLLoader, CSVLoader, Docx2txtLoader, @@ -147,6 +148,27 @@ class Loader: file_path=file_path, mime_type=file_content_type, ) + elif ( + self.engine == "document_intelligence" + and self.kwargs.get("DOCUMENT_INTELLIGENCE_ENDPOINT") != "" + and self.kwargs.get("DOCUMENT_INTELLIGENCE_KEY") != "" + and ( + file_ext in ["pdf", "xls", "xlsx", "docx", "ppt", "pptx"] + or file_content_type + in [ + "application/vnd.ms-excel", + "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "application/vnd.ms-powerpoint", + "application/vnd.openxmlformats-officedocument.presentationml.presentation", + ] + ) + ): + loader = AzureAIDocumentIntelligenceLoader( + file_path=file_path, + api_endpoint=self.kwargs.get("DOCUMENT_INTELLIGENCE_ENDPOINT"), + api_key=self.kwargs.get("DOCUMENT_INTELLIGENCE_KEY"), + ) else: if file_ext == "pdf": loader = PyPDFLoader( diff --git a/backend/open_webui/routers/retrieval.py b/backend/open_webui/routers/retrieval.py index 77f04a4be..4cfcd490d 100644 --- a/backend/open_webui/routers/retrieval.py +++ b/backend/open_webui/routers/retrieval.py @@ -352,6 +352,10 @@ async def get_rag_config(request: Request, user=Depends(get_admin_user)): "content_extraction": { "engine": request.app.state.config.CONTENT_EXTRACTION_ENGINE, "tika_server_url": request.app.state.config.TIKA_SERVER_URL, + "document_intelligence_config": { + "endpoint": request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT, + "key": request.app.state.config.DOCUMENT_INTELLIGENCE_KEY, + }, }, "chunk": { "text_splitter": request.app.state.config.TEXT_SPLITTER, @@ -402,9 +406,15 @@ class FileConfig(BaseModel): max_count: Optional[int] = None +class DocumentIntelligenceConfigForm(BaseModel): + endpoint: str + key: str + + class ContentExtractionConfig(BaseModel): engine: str = "" tika_server_url: Optional[str] = None + document_intelligence_config: Optional[DocumentIntelligenceConfigForm] = None class ChunkParamUpdateForm(BaseModel): @@ -479,13 +489,22 @@ async def update_rag_config( request.app.state.config.FILE_MAX_COUNT = form_data.file.max_count if form_data.content_extraction is not None: - log.info(f"Updating text settings: {form_data.content_extraction}") + log.info( + f"Updating content extraction: {request.app.state.config.CONTENT_EXTRACTION_ENGINE} to {form_data.content_extraction.engine}" + ) request.app.state.config.CONTENT_EXTRACTION_ENGINE = ( form_data.content_extraction.engine ) request.app.state.config.TIKA_SERVER_URL = ( form_data.content_extraction.tika_server_url ) + if form_data.content_extraction.document_intelligence_config is not None: + request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT = ( + form_data.content_extraction.document_intelligence_config.endpoint + ) + request.app.state.config.DOCUMENT_INTELLIGENCE_KEY = ( + form_data.content_extraction.document_intelligence_config.key + ) if form_data.chunk is not None: request.app.state.config.TEXT_SPLITTER = form_data.chunk.text_splitter @@ -564,6 +583,10 @@ async def update_rag_config( "content_extraction": { "engine": request.app.state.config.CONTENT_EXTRACTION_ENGINE, "tika_server_url": request.app.state.config.TIKA_SERVER_URL, + "document_intelligence_config": { + "endpoint": request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT, + "key": request.app.state.config.DOCUMENT_INTELLIGENCE_KEY, + }, }, "chunk": { "text_splitter": request.app.state.config.TEXT_SPLITTER, @@ -887,6 +910,8 @@ def process_file( engine=request.app.state.config.CONTENT_EXTRACTION_ENGINE, TIKA_SERVER_URL=request.app.state.config.TIKA_SERVER_URL, PDF_EXTRACT_IMAGES=request.app.state.config.PDF_EXTRACT_IMAGES, + DOCUMENT_INTELLIGENCE_ENDPOINT=request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT, + DOCUMENT_INTELLIGENCE_KEY=request.app.state.config.DOCUMENT_INTELLIGENCE_KEY, ) docs = loader.load( file.filename, file.meta.get("content_type"), file_path diff --git a/backend/requirements.txt b/backend/requirements.txt index 14ad4b9cd..4a39e77b5 100644 --- a/backend/requirements.txt +++ b/backend/requirements.txt @@ -72,6 +72,7 @@ validators==0.34.0 psutil sentencepiece soundfile==0.13.1 +azure-ai-documentintelligence==1.0.0 opencv-python-headless==4.11.0.86 rapidocr-onnxruntime==1.3.24 diff --git a/pyproject.toml b/pyproject.toml index f121089e8..60d54afd6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -77,6 +77,7 @@ dependencies = [ "psutil", "sentencepiece", "soundfile==0.13.1", + "azure-ai-documentintelligence==1.0.0", "opencv-python-headless==4.11.0.86", "rapidocr-onnxruntime==1.3.24", diff --git a/src/lib/apis/retrieval/index.ts b/src/lib/apis/retrieval/index.ts index c35c37847..ed07ab5d0 100644 --- a/src/lib/apis/retrieval/index.ts +++ b/src/lib/apis/retrieval/index.ts @@ -32,9 +32,15 @@ type ChunkConfigForm = { chunk_overlap: number; }; +type DocumentIntelligenceConfigForm = { + key: string; + endpoint: string; +}; + type ContentExtractConfigForm = { engine: string; tika_server_url: string | null; + document_intelligence_config: DocumentIntelligenceConfigForm | null; }; type YoutubeConfigForm = { diff --git a/src/lib/components/admin/Settings/Documents.svelte b/src/lib/components/admin/Settings/Documents.svelte index d3b7cfa01..e624a51b3 100644 --- a/src/lib/components/admin/Settings/Documents.svelte +++ b/src/lib/components/admin/Settings/Documents.svelte @@ -50,6 +50,9 @@ let contentExtractionEngine = 'default'; let tikaServerUrl = ''; let showTikaServerUrl = false; + let documentIntelligenceEndpoint = ''; + let documentIntelligenceKey = ''; + let showDocumentIntelligenceConfig = false; let textSplitter = ''; let chunkSize = 0; @@ -175,6 +178,13 @@ toast.error($i18n.t('Tika Server URL required.')); return; } + if ( + contentExtractionEngine === 'document_intelligence' && + (documentIntelligenceEndpoint === '' || documentIntelligenceKey === '') + ) { + toast.error($i18n.t('Document Intelligence endpoint and key required.')); + return; + } const res = await updateRAGConfig(localStorage.token, { pdf_extract_images: pdfExtractImages, enable_google_drive_integration: enableGoogleDriveIntegration, @@ -189,7 +199,11 @@ }, content_extraction: { engine: contentExtractionEngine, - tika_server_url: tikaServerUrl + tika_server_url: tikaServerUrl, + document_intelligence_config: { + key: documentIntelligenceKey, + endpoint: documentIntelligenceEndpoint + } } }); @@ -245,6 +259,9 @@ contentExtractionEngine = res.content_extraction.engine; tikaServerUrl = res.content_extraction.tika_server_url; showTikaServerUrl = contentExtractionEngine === 'tika'; + documentIntelligenceEndpoint = res.content_extraction.document_intelligence_config.endpoint; + documentIntelligenceKey = res.content_extraction.document_intelligence_config.key; + showDocumentIntelligenceConfig = contentExtractionEngine === 'document_intelligence'; fileMaxSize = res?.file.max_size ?? ''; fileMaxCount = res?.file.max_count ?? ''; @@ -568,10 +585,12 @@ bind:value={contentExtractionEngine} on:change={(e) => { showTikaServerUrl = e.target.value === 'tika'; + showDocumentIntelligenceConfig = e.target.value === 'document_intelligence'; }} > + @@ -587,6 +606,21 @@ {/if} + + {#if showDocumentIntelligenceConfig} +
+ + + +
+ {/if}