feat: Implement Document Intelligence as Content Extraction Engine

This commit is contained in:
Mazurek Michal 2025-02-07 13:44:47 +01:00
parent e9d6ada25c
commit 35f3824932
8 changed files with 107 additions and 2 deletions

View File

@ -1431,6 +1431,18 @@ TIKA_SERVER_URL = PersistentConfig(
os.getenv("TIKA_SERVER_URL", "http://tika:9998"), # Default for sidecar deployment
)
# Endpoint URL handed to the Azure AI Document Intelligence loader.
# Seeded from the DOCUMENT_INTELLIGENCE_ENDPOINT environment variable and
# falls back to an empty string when that variable is unset.
DOCUMENT_INTELLIGENCE_ENDPOINT = PersistentConfig(
    "DOCUMENT_INTELLIGENCE_ENDPOINT",
    "rag.document_intelligence_endpoint",
    os.environ.get("DOCUMENT_INTELLIGENCE_ENDPOINT", ""),
)
# API key handed to the Azure AI Document Intelligence loader.
# Seeded from the DOCUMENT_INTELLIGENCE_KEY environment variable and falls
# back to an empty string when that variable is unset.
DOCUMENT_INTELLIGENCE_KEY = PersistentConfig(
    "DOCUMENT_INTELLIGENCE_KEY",
    "rag.document_intelligence_key",
    os.environ.get("DOCUMENT_INTELLIGENCE_KEY", ""),
)
# The RAG_TOP_K environment variable is parsed as an integer at import time;
# the value is 3 when the variable is unset.
RAG_TOP_K = PersistentConfig(
    "RAG_TOP_K",
    "rag.top_k",
    int(os.environ.get("RAG_TOP_K", "3")),
)

View File

@ -154,6 +154,8 @@ from open_webui.config import (
CHUNK_SIZE,
CONTENT_EXTRACTION_ENGINE,
TIKA_SERVER_URL,
DOCUMENT_INTELLIGENCE_ENDPOINT,
DOCUMENT_INTELLIGENCE_KEY,
RAG_TOP_K,
RAG_TEXT_SPLITTER,
TIKTOKEN_ENCODING_NAME,
@ -478,6 +480,8 @@ app.state.config.ENABLE_RAG_WEB_LOADER_SSL_VERIFICATION = (
# Publish the content-extraction settings (engine selector, Tika server URL,
# Document Intelligence endpoint and key) and the text-splitter settings on
# the application config object so request handlers can read and update them
# through request.app.state.config.
app.state.config.CONTENT_EXTRACTION_ENGINE = CONTENT_EXTRACTION_ENGINE
app.state.config.TIKA_SERVER_URL = TIKA_SERVER_URL
app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT = DOCUMENT_INTELLIGENCE_ENDPOINT
app.state.config.DOCUMENT_INTELLIGENCE_KEY = DOCUMENT_INTELLIGENCE_KEY
app.state.config.TEXT_SPLITTER = RAG_TEXT_SPLITTER
app.state.config.TIKTOKEN_ENCODING_NAME = TIKTOKEN_ENCODING_NAME

View File

@ -4,6 +4,7 @@ import ftfy
import sys
from langchain_community.document_loaders import (
AzureAIDocumentIntelligenceLoader,
BSHTMLLoader,
CSVLoader,
Docx2txtLoader,
@ -147,6 +148,27 @@ class Loader:
file_path=file_path,
mime_type=file_content_type,
)
elif (
self.engine == "document_intelligence"
and self.kwargs.get("DOCUMENT_INTELLIGENCE_ENDPOINT") != ""
and self.kwargs.get("DOCUMENT_INTELLIGENCE_KEY") != ""
and (
file_ext in ["pdf", "xls", "xlsx", "docx", "ppt", "pptx"]
or file_content_type
in [
"application/vnd.ms-excel",
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"application/vnd.ms-powerpoint",
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
]
)
):
loader = AzureAIDocumentIntelligenceLoader(
file_path=file_path,
api_endpoint=self.kwargs.get("DOCUMENT_INTELLIGENCE_ENDPOINT"),
api_key=self.kwargs.get("DOCUMENT_INTELLIGENCE_KEY"),
)
else:
if file_ext == "pdf":
loader = PyPDFLoader(

View File

@ -352,6 +352,10 @@ async def get_rag_config(request: Request, user=Depends(get_admin_user)):
"content_extraction": {
"engine": request.app.state.config.CONTENT_EXTRACTION_ENGINE,
"tika_server_url": request.app.state.config.TIKA_SERVER_URL,
"document_intelligence_config": {
"endpoint": request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT,
"key": request.app.state.config.DOCUMENT_INTELLIGENCE_KEY,
},
},
"chunk": {
"text_splitter": request.app.state.config.TEXT_SPLITTER,
@ -402,9 +406,15 @@ class FileConfig(BaseModel):
max_count: Optional[int] = None
class DocumentIntelligenceConfigForm(BaseModel):
    # Document Intelligence credentials submitted with a RAG config update.
    # Both fields are required whenever this sub-form is present.
    # (Plain comments rather than a docstring, so the generated model schema
    # description is left unchanged.)

    # Written to config.DOCUMENT_INTELLIGENCE_ENDPOINT by update_rag_config.
    endpoint: str
    # Written to config.DOCUMENT_INTELLIGENCE_KEY by update_rag_config.
    key: str
class ContentExtractionConfig(BaseModel):
    # Content-extraction settings; presumably the type of the
    # `content_extraction` section of the RAG config update form (confirm
    # against the enclosing form model, which is not visible here).

    # Engine selector; the values handled in this change are "tika" and
    # "document_intelligence", with "" as the default.
    engine: str = ""
    tika_server_url: Optional[str] = None
    # When None, update_rag_config leaves the stored Document Intelligence
    # endpoint and key untouched.
    document_intelligence_config: Optional[DocumentIntelligenceConfigForm] = None
class ChunkParamUpdateForm(BaseModel):
@ -479,13 +489,22 @@ async def update_rag_config(
request.app.state.config.FILE_MAX_COUNT = form_data.file.max_count
if form_data.content_extraction is not None:
log.info(f"Updating text settings: {form_data.content_extraction}")
log.info(
f"Updating content extraction: {request.app.state.config.CONTENT_EXTRACTION_ENGINE} to {form_data.content_extraction.engine}"
)
request.app.state.config.CONTENT_EXTRACTION_ENGINE = (
form_data.content_extraction.engine
)
request.app.state.config.TIKA_SERVER_URL = (
form_data.content_extraction.tika_server_url
)
if form_data.content_extraction.document_intelligence_config is not None:
request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT = (
form_data.content_extraction.document_intelligence_config.endpoint
)
request.app.state.config.DOCUMENT_INTELLIGENCE_KEY = (
form_data.content_extraction.document_intelligence_config.key
)
if form_data.chunk is not None:
request.app.state.config.TEXT_SPLITTER = form_data.chunk.text_splitter
@ -564,6 +583,10 @@ async def update_rag_config(
"content_extraction": {
"engine": request.app.state.config.CONTENT_EXTRACTION_ENGINE,
"tika_server_url": request.app.state.config.TIKA_SERVER_URL,
"document_intelligence_config": {
"endpoint": request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT,
"key": request.app.state.config.DOCUMENT_INTELLIGENCE_KEY,
},
},
"chunk": {
"text_splitter": request.app.state.config.TEXT_SPLITTER,
@ -887,6 +910,8 @@ def process_file(
engine=request.app.state.config.CONTENT_EXTRACTION_ENGINE,
TIKA_SERVER_URL=request.app.state.config.TIKA_SERVER_URL,
PDF_EXTRACT_IMAGES=request.app.state.config.PDF_EXTRACT_IMAGES,
DOCUMENT_INTELLIGENCE_ENDPOINT=request.app.state.config.DOCUMENT_INTELLIGENCE_ENDPOINT,
DOCUMENT_INTELLIGENCE_KEY=request.app.state.config.DOCUMENT_INTELLIGENCE_KEY,
)
docs = loader.load(
file.filename, file.meta.get("content_type"), file_path

View File

@ -72,6 +72,7 @@ validators==0.34.0
psutil
sentencepiece
soundfile==0.13.1
azure-ai-documentintelligence==1.0.0
opencv-python-headless==4.11.0.86
rapidocr-onnxruntime==1.3.24

View File

@ -77,6 +77,7 @@ dependencies = [
"psutil",
"sentencepiece",
"soundfile==0.13.1",
"azure-ai-documentintelligence==1.0.0",
"opencv-python-headless==4.11.0.86",
"rapidocr-onnxruntime==1.3.24",

View File

@ -32,9 +32,15 @@ type ChunkConfigForm = {
chunk_overlap: number;
};
// Document Intelligence credentials sent inside the content-extraction
// section of the RAG config; mirrors the backend form with the same name.
type DocumentIntelligenceConfigForm = {
key: string;
endpoint: string;
};
// Content-extraction section of the RAG config payload.
// `engine` selects the extractor; the settings UI offers '', 'tika' and
// 'document_intelligence'. The two engine-specific members may be null.
type ContentExtractConfigForm = {
engine: string;
tika_server_url: string | null;
document_intelligence_config: DocumentIntelligenceConfigForm | null;
};
type YoutubeConfigForm = {

View File

@ -50,6 +50,9 @@
// Content-extraction settings edited on this page and submitted with the RAG config.
let contentExtractionEngine = 'default';
let tikaServerUrl = '';
// True while the selected engine is 'tika'.
let showTikaServerUrl = false;
// Document Intelligence credentials; the submit handler rejects the form when
// either is empty while the 'document_intelligence' engine is selected.
let documentIntelligenceEndpoint = '';
let documentIntelligenceKey = '';
// True while the selected engine is 'document_intelligence'; gates the
// endpoint/key inputs in the markup.
let showDocumentIntelligenceConfig = false;
let textSplitter = '';
let chunkSize = 0;
@ -175,6 +178,13 @@
toast.error($i18n.t('Tika Server URL required.'));
return;
}
if (
contentExtractionEngine === 'document_intelligence' &&
(documentIntelligenceEndpoint === '' || documentIntelligenceKey === '')
) {
toast.error($i18n.t('Document Intelligence endpoint and key required.'));
return;
}
const res = await updateRAGConfig(localStorage.token, {
pdf_extract_images: pdfExtractImages,
enable_google_drive_integration: enableGoogleDriveIntegration,
@ -189,7 +199,11 @@
},
content_extraction: {
engine: contentExtractionEngine,
tika_server_url: tikaServerUrl
tika_server_url: tikaServerUrl,
document_intelligence_config: {
key: documentIntelligenceKey,
endpoint: documentIntelligenceEndpoint
}
}
});
@ -245,6 +259,9 @@
contentExtractionEngine = res.content_extraction.engine;
tikaServerUrl = res.content_extraction.tika_server_url;
showTikaServerUrl = contentExtractionEngine === 'tika';
documentIntelligenceEndpoint = res.content_extraction.document_intelligence_config.endpoint;
documentIntelligenceKey = res.content_extraction.document_intelligence_config.key;
showDocumentIntelligenceConfig = contentExtractionEngine === 'document_intelligence';
fileMaxSize = res?.file.max_size ?? '';
fileMaxCount = res?.file.max_count ?? '';
@ -568,10 +585,12 @@
bind:value={contentExtractionEngine}
on:change={(e) => {
showTikaServerUrl = e.target.value === 'tika';
showDocumentIntelligenceConfig = e.target.value === 'document_intelligence';
}}
>
<option value="">{$i18n.t('Default')} </option>
<option value="tika">{$i18n.t('Tika')}</option>
<option value="document_intelligence">{$i18n.t('Document Intelligence')}</option>
</select>
</div>
</div>
@ -587,6 +606,21 @@
</div>
</div>
{/if}
{#if showDocumentIntelligenceConfig}
<!-- Document Intelligence credentials: plain text input for the endpoint, SensitiveInput for the key. Rendered only while that engine is selected. -->
<div class="my-0.5 flex gap-2 pr-2">
<input
class="flex-1 w-full rounded-lg py-2 px-4 text-sm bg-gray-50 dark:text-gray-300 dark:bg-gray-850 outline-none"
placeholder={$i18n.t('Enter Document Intelligence Endpoint')}
bind:value={documentIntelligenceEndpoint}
/>
<SensitiveInput
placeholder={$i18n.t('Enter Document Intelligence Key')}
bind:value={documentIntelligenceKey}
/>
</div>
{/if}
</div>
<hr class=" dark:border-gray-850" />