From e229d27734a4f478a593db97453809a4ef37fa98 Mon Sep 17 00:00:00 2001 From: Yuhong Sun Date: Mon, 30 Sep 2024 21:50:03 -0700 Subject: [PATCH] Unstructured UI (#2636) * checkpoint * k * k * need frontend * add api key check + ui component * add proper ports + icons + functions * k * k * k --------- Co-authored-by: pablodanswer --- backend/danswer/auth/users.py | 1 - backend/danswer/configs/app_configs.py | 1 + backend/danswer/configs/constants.py | 1 + backend/danswer/connectors/blob/connector.py | 2 +- .../connectors/confluence/connector.py | 6 +- .../danswer/connectors/dropbox/connector.py | 2 +- backend/danswer/connectors/file/connector.py | 5 +- .../connectors/google_drive/connector.py | 28 ++-- .../connectors/sharepoint/connector.py | 2 +- .../file_processing/extract_file_text.py | 32 ++-- .../danswer/file_processing/unstructured.py | 67 +++++++++ .../danswer/server/manage/search_settings.py | 28 +++- .../server/query_and_chat/chat_backend.py | 5 +- backend/requirements/default.txt | 12 +- .../confluence/test_confluence_basic.py | 12 +- .../document_processing/page.tsx | 138 ++++++++++++++++++ web/src/components/admin/ClientLayout.tsx | 11 +- web/src/components/icons/icons.tsx | 25 ++++ 18 files changed, 340 insertions(+), 38 deletions(-) create mode 100644 backend/danswer/file_processing/unstructured.py create mode 100644 web/src/app/admin/configuration/document_processing/page.tsx diff --git a/backend/danswer/auth/users.py b/backend/danswer/auth/users.py index ac02d1258..a583a9323 100644 --- a/backend/danswer/auth/users.py +++ b/backend/danswer/auth/users.py @@ -342,7 +342,6 @@ def get_database_strategy( strategy = DatabaseStrategy( access_token_db, lifetime_seconds=SESSION_EXPIRE_TIME_SECONDS # type: ignore ) - return strategy diff --git a/backend/danswer/configs/app_configs.py b/backend/danswer/configs/app_configs.py index c943bc7f6..24ea43717 100644 --- a/backend/danswer/configs/app_configs.py +++ b/backend/danswer/configs/app_configs.py @@ -338,6 +338,7 @@ INDEXING_TRACER_INTERVAL = int(os.environ.get("INDEXING_TRACER_INTERVAL", 0)) # exception without aborting the attempt. INDEXING_EXCEPTION_LIMIT = int(os.environ.get("INDEXING_EXCEPTION_LIMIT", 0)) + ##### # Miscellaneous ##### diff --git a/backend/danswer/configs/constants.py b/backend/danswer/configs/constants.py index 52314db92..a2b0f752f 100644 --- a/backend/danswer/configs/constants.py +++ b/backend/danswer/configs/constants.py @@ -48,6 +48,7 @@ UNNAMED_KEY_PLACEHOLDER = "Unnamed" # Key-Value store keys KV_REINDEX_KEY = "needs_reindexing" KV_SEARCH_SETTINGS = "search_settings" +KV_UNSTRUCTURED_API_KEY = "unstructured_api_key" KV_USER_STORE_KEY = "INVITED_USERS" KV_NO_AUTH_USER_PREFERENCES_KEY = "no_auth_user_preferences" KV_CRED_KEY = "credential_id_{}" diff --git a/backend/danswer/connectors/blob/connector.py b/backend/danswer/connectors/blob/connector.py index a664a3d76..1f030a756 100644 --- a/backend/danswer/connectors/blob/connector.py +++ b/backend/danswer/connectors/blob/connector.py @@ -194,8 +194,8 @@ class BlobStorageConnector(LoadConnector, PollConnector): try: text = extract_file_text( - name, BytesIO(downloaded_file), + file_name=name, break_on_unprocessable=False, ) batch.append( diff --git a/backend/danswer/connectors/confluence/connector.py b/backend/danswer/connectors/confluence/connector.py index d3caf66cc..f800aa495 100644 --- a/backend/danswer/connectors/confluence/connector.py +++ b/backend/danswer/connectors/confluence/connector.py @@ -519,7 +519,9 @@ class ConfluenceConnector(LoadConnector, PollConnector): return None extracted_text = extract_file_text( - attachment["title"], io.BytesIO(response.content), False + io.BytesIO(response.content), + file_name=attachment["title"], + break_on_unprocessable=False, ) if len(extracted_text) > CONFLUENCE_CONNECTOR_ATTACHMENT_CHAR_COUNT_THRESHOLD: logger.warning( @@ -625,7 +627,7 @@ class ConfluenceConnector(LoadConnector, PollConnector): ) unused_attachments.extend(unused_page_attachments) - page_text += attachment_text + page_text += "\n" + attachment_text if attachment_text else "" comments_text = self._fetch_comments(self.confluence_client, page_id) page_text += comments_text doc_metadata: dict[str, str | list[str]] = {"Wiki Space Name": self.space} diff --git a/backend/danswer/connectors/dropbox/connector.py b/backend/danswer/connectors/dropbox/connector.py index b36f0fbd1..7d2eb0166 100644 --- a/backend/danswer/connectors/dropbox/connector.py +++ b/backend/danswer/connectors/dropbox/connector.py @@ -97,8 +97,8 @@ class DropboxConnector(LoadConnector, PollConnector): link = self._get_shared_link(entry.path_display) try: text = extract_file_text( - entry.name, BytesIO(downloaded_file), + file_name=entry.name, break_on_unprocessable=False, ) batch.append( diff --git a/backend/danswer/connectors/file/connector.py b/backend/danswer/connectors/file/connector.py index 83d0af2c1..8ef98716c 100644 --- a/backend/danswer/connectors/file/connector.py +++ b/backend/danswer/connectors/file/connector.py @@ -74,13 +74,14 @@ def _process_file( ) # Using the PDF reader function directly to pass in password cleanly - elif extension == ".pdf": + elif extension == ".pdf" and pdf_pass is not None: file_content_raw, file_metadata = read_pdf_file(file=file, pdf_pass=pdf_pass) else: file_content_raw = extract_file_text( - file_name=file_name, file=file, + file_name=file_name, + break_on_unprocessable=True, ) all_metadata = {**metadata, **file_metadata} if metadata else file_metadata diff --git a/backend/danswer/connectors/google_drive/connector.py b/backend/danswer/connectors/google_drive/connector.py index bf267ab77..48b514e80 100644 --- a/backend/danswer/connectors/google_drive/connector.py +++ b/backend/danswer/connectors/google_drive/connector.py @@ -36,6 +36,8 @@ from danswer.connectors.models import Section from danswer.file_processing.extract_file_text import docx_to_text from danswer.file_processing.extract_file_text import pptx_to_text from danswer.file_processing.extract_file_text import read_pdf_file +from danswer.file_processing.unstructured import get_unstructured_api_key +from danswer.file_processing.unstructured import unstructured_to_text from danswer.utils.batching import batch_generator from danswer.utils.logger import setup_logger @@ -327,16 +329,24 @@ def extract_text(file: dict[str, str], service: discovery.Resource) -> str: GDriveMimeType.MARKDOWN.value, ]: return service.files().get_media(fileId=file["id"]).execute().decode("utf-8") - elif mime_type == GDriveMimeType.WORD_DOC.value: + if mime_type in [ + GDriveMimeType.WORD_DOC.value, + GDriveMimeType.POWERPOINT.value, + GDriveMimeType.PDF.value, + ]: response = service.files().get_media(fileId=file["id"]).execute() - return docx_to_text(file=io.BytesIO(response)) - elif mime_type == GDriveMimeType.PDF.value: - response = service.files().get_media(fileId=file["id"]).execute() - text, _ = read_pdf_file(file=io.BytesIO(response)) - return text - elif mime_type == GDriveMimeType.POWERPOINT.value: - response = service.files().get_media(fileId=file["id"]).execute() - return pptx_to_text(file=io.BytesIO(response)) + if get_unstructured_api_key(): + return unstructured_to_text( + file=io.BytesIO(response), file_name=file.get("name", file["id"]) + ) + + if mime_type == GDriveMimeType.WORD_DOC.value: + return docx_to_text(file=io.BytesIO(response)) + elif mime_type == GDriveMimeType.PDF.value: + text, _ = read_pdf_file(file=io.BytesIO(response)) + return text + elif mime_type == GDriveMimeType.POWERPOINT.value: + return pptx_to_text(file=io.BytesIO(response)) return UNSUPPORTED_FILE_TYPE_CONTENT diff --git a/backend/danswer/connectors/sharepoint/connector.py b/backend/danswer/connectors/sharepoint/connector.py index e74dcbf7e..a32b4ecb1 100644 --- a/backend/danswer/connectors/sharepoint/connector.py +++ b/backend/danswer/connectors/sharepoint/connector.py @@ -40,8 +40,8 @@ def _convert_driveitem_to_document( driveitem: DriveItem, ) -> Document: file_text = extract_file_text( - file_name=driveitem.name, file=io.BytesIO(driveitem.get_content().execute_query().value), + file_name=driveitem.name, break_on_unprocessable=False, ) diff --git a/backend/danswer/file_processing/extract_file_text.py b/backend/danswer/file_processing/extract_file_text.py index 36df08ac4..0f8c4e782 100644 --- a/backend/danswer/file_processing/extract_file_text.py +++ b/backend/danswer/file_processing/extract_file_text.py @@ -20,6 +20,8 @@ from pypdf.errors import PdfStreamError from danswer.configs.constants import DANSWER_METADATA_FILENAME from danswer.file_processing.html_utils import parse_html_page_basic +from danswer.file_processing.unstructured import get_unstructured_api_key +from danswer.file_processing.unstructured import unstructured_to_text from danswer.utils.logger import setup_logger logger = setup_logger() @@ -331,9 +333,10 @@ def file_io_to_text(file: IO[Any]) -> str: def extract_file_text( - file_name: str | None, file: IO[Any], + file_name: str, break_on_unprocessable: bool = True, + extension: str | None = None, ) -> str: extension_to_function: dict[str, Callable[[IO[Any]], str]] = { ".pdf": pdf_to_text, @@ -345,22 +348,29 @@ def extract_file_text( ".html": parse_html_page_basic, } - def _process_file() -> str: - if file_name: - extension = get_file_ext(file_name) - if check_file_ext_is_valid(extension): - return extension_to_function.get(extension, file_io_to_text)(file) + try: + if get_unstructured_api_key(): + return unstructured_to_text(file, file_name) - # Either the file somehow has no name or the extension is not one that we are familiar with + if file_name or extension: + if extension is not None: + final_extension = extension + elif file_name is not None: + final_extension = get_file_ext(file_name) + + if check_file_ext_is_valid(final_extension): + return extension_to_function.get(final_extension, file_io_to_text)(file) + + # Either the file somehow has no name or the extension is not one that we recognize if is_text_file(file): return file_io_to_text(file) raise ValueError("Unknown file extension and unknown text encoding") - try: - return _process_file() except Exception as e: if break_on_unprocessable: - raise RuntimeError(f"Failed to process file: {str(e)}") from e - logger.warning(f"Failed to process file: {str(e)}") + raise RuntimeError( + f"Failed to process file {file_name or 'Unknown'}: {str(e)}" + ) from e + logger.warning(f"Failed to process file {file_name or 'Unknown'}: {str(e)}") return "" diff --git a/backend/danswer/file_processing/unstructured.py b/backend/danswer/file_processing/unstructured.py new file mode 100644 index 000000000..c5a14d876 --- /dev/null +++ b/backend/danswer/file_processing/unstructured.py @@ -0,0 +1,67 @@ +from typing import Any +from typing import cast +from typing import IO + +from unstructured.staging.base import dict_to_elements +from unstructured_client import UnstructuredClient # type: ignore +from unstructured_client.models import operations # type: ignore +from unstructured_client.models import shared + +from danswer.configs.constants import KV_UNSTRUCTURED_API_KEY +from danswer.dynamic_configs.factory import get_dynamic_config_store +from danswer.dynamic_configs.interface import ConfigNotFoundError +from danswer.utils.logger import setup_logger + + +logger = setup_logger() + + +def get_unstructured_api_key() -> str | None: + kv_store = get_dynamic_config_store() + try: + return cast(str, kv_store.load(KV_UNSTRUCTURED_API_KEY)) + except ConfigNotFoundError: + return None + + +def update_unstructured_api_key(api_key: str) -> None: + kv_store = get_dynamic_config_store() + kv_store.store(KV_UNSTRUCTURED_API_KEY, api_key) + + +def delete_unstructured_api_key() -> None: + kv_store = get_dynamic_config_store() + kv_store.delete(KV_UNSTRUCTURED_API_KEY) + + +def _sdk_partition_request( + file: IO[Any], file_name: str, **kwargs: Any +) -> operations.PartitionRequest: + try: + request = operations.PartitionRequest( + partition_parameters=shared.PartitionParameters( + files=shared.Files(content=file.read(), file_name=file_name), + **kwargs, + ), + ) + return request + except Exception as e: + logger.error(f"Error creating partition request for file {file_name}: {str(e)}") + raise + + +def unstructured_to_text(file: IO[Any], file_name: str) -> str: + logger.debug(f"Starting to read file: {file_name}") + req = _sdk_partition_request(file, file_name, strategy="auto") + + unstructured_client = UnstructuredClient(api_key_auth=get_unstructured_api_key()) + + response = unstructured_client.general.partition(req) # type: ignore + elements = dict_to_elements(response.elements) + + if response.status_code != 200: + err = f"Received unexpected status code {response.status_code} from Unstructured API." + logger.error(err) + raise ValueError(err) + + return "\n\n".join(str(el) for el in elements) diff --git a/backend/danswer/server/manage/search_settings.py b/backend/danswer/server/manage/search_settings.py index c8433467f..6436a0bd8 100644 --- a/backend/danswer/server/manage/search_settings.py +++ b/backend/danswer/server/manage/search_settings.py @@ -21,6 +21,9 @@ from danswer.db.search_settings import get_secondary_search_settings from danswer.db.search_settings import update_current_search_settings from danswer.db.search_settings import update_search_settings_status from danswer.document_index.factory import get_default_document_index +from danswer.file_processing.unstructured import delete_unstructured_api_key +from danswer.file_processing.unstructured import get_unstructured_api_key +from danswer.file_processing.unstructured import update_unstructured_api_key from danswer.natural_language_processing.search_nlp_models import clean_model_name from danswer.search.models import SavedSearchSettings from danswer.search.models import SearchSettingsCreationRequest @@ -30,7 +33,6 @@ from danswer.server.models import IdReturn from danswer.utils.logger import setup_logger from shared_configs.configs import ALT_INDEX_SUFFIX - router = APIRouter(prefix="/search-settings") logger = setup_logger() @@ -196,3 +198,27 @@ def update_saved_search_settings( update_current_search_settings( search_settings=search_settings, db_session=db_session ) + + +@router.get("/unstructured-api-key-set") +def unstructured_api_key_set( + _: User | None = Depends(current_admin_user), +) -> bool: + api_key = get_unstructured_api_key() + print(api_key) + return api_key is not None + + +@router.put("/upsert-unstructured-api-key") +def upsert_unstructured_api_key( + unstructured_api_key: str, + _: User | None = Depends(current_admin_user), +) -> None: + update_unstructured_api_key(unstructured_api_key) + + +@router.delete("/delete-unstructured-api-key") +def delete_unstructured_api_key_endpoint( + _: User | None = Depends(current_admin_user), +) -> None: + delete_unstructured_api_key() diff --git a/backend/danswer/server/query_and_chat/chat_backend.py b/backend/danswer/server/query_and_chat/chat_backend.py index c7f598341..36a09afde 100644 --- a/backend/danswer/server/query_and_chat/chat_backend.py +++ b/backend/danswer/server/query_and_chat/chat_backend.py @@ -588,7 +588,10 @@ def upload_files_for_chat( # if the file is a doc, extract text and store that so we don't need # to re-extract it every time we send a message if file_type == ChatFileType.DOC: - extracted_text = extract_file_text(file_name=file.filename, file=file.file) + extracted_text = extract_file_text( + file=file.file, + file_name=file.filename or "", + ) text_file_id = str(uuid.uuid4()) file_store.save_file( file_name=text_file_id, diff --git a/backend/requirements/default.txt b/backend/requirements/default.txt index 1a4931008..9855b8662 100644 --- a/backend/requirements/default.txt +++ b/backend/requirements/default.txt @@ -2,7 +2,7 @@ aiohttp==3.10.2 alembic==1.10.4 asyncpg==0.27.0 atlassian-python-api==3.37.0 -beautifulsoup4==4.12.2 +beautifulsoup4==4.12.3 boto3==1.34.84 celery==5.3.4 chardet==5.2.0 @@ -19,9 +19,9 @@ google-auth-oauthlib==1.0.0 # GPT4All library has issues running on Macs and python:3.11.4-slim-bookworm # will reintroduce this when library version catches up # gpt4all==2.0.2 -httpcore==0.16.3 -httpx[http2]==0.23.3 -httpx-oauth==0.11.2 +httpcore==1.0.5 +httpx[http2]==0.27.0 +httpx-oauth==0.15.1 huggingface-hub==0.20.1 jira==3.5.1 jsonref==1.1.0 @@ -46,7 +46,7 @@ PyGithub==1.58.2 python-dateutil==2.8.2 python-gitlab==3.9.0 python-pptx==0.6.23 -pypdf==3.17.0 +pypdf==4.3.0 pytest-mock==3.12.0 pytest-playwright==0.3.2 python-docx==1.1.2 @@ -67,6 +67,8 @@ supervisor==4.2.5 tiktoken==0.7.0 timeago==1.0.16 transformers==4.39.2 +unstructured==0.15.1 +unstructured-client==0.25.4 uvicorn==0.21.1 zulip==0.8.2 hubspot-api-client==8.1.0 diff --git a/backend/tests/daily/connectors/confluence/test_confluence_basic.py b/backend/tests/daily/connectors/confluence/test_confluence_basic.py index 4eb252078..a791b1eab 100644 --- a/backend/tests/daily/connectors/confluence/test_confluence_basic.py +++ b/backend/tests/daily/connectors/confluence/test_confluence_basic.py @@ -1,5 +1,7 @@ import os import time +from unittest.mock import MagicMock +from unittest.mock import patch import pytest @@ -24,7 +26,13 @@ def confluence_connector() -> ConfluenceConnector: return connector -def test_confluence_connector_basic(confluence_connector: ConfluenceConnector) -> None: +@patch( + "danswer.file_processing.extract_file_text.get_unstructured_api_key", + return_value=None, +) +def test_confluence_connector_basic( + mock_get_api_key: MagicMock, confluence_connector: ConfluenceConnector +) -> None: doc_batch_generator = confluence_connector.poll_source(0, time.time()) doc_batch = next(doc_batch_generator) @@ -41,7 +49,7 @@ def test_confluence_connector_basic(confluence_connector: ConfluenceConnector) - assert len(doc.sections) == 1 section = doc.sections[0] - assert section.text == "test123small" + assert section.text == "test123\nsmall" assert ( section.link == "https://danswerai.atlassian.net/wiki/spaces/DailyConne/overview" diff --git a/web/src/app/admin/configuration/document_processing/page.tsx b/web/src/app/admin/configuration/document_processing/page.tsx new file mode 100644 index 000000000..9ccd72b73 --- /dev/null +++ b/web/src/app/admin/configuration/document_processing/page.tsx @@ -0,0 +1,138 @@ +"use client"; + +import { useState } from "react"; +import { Button, Card } from "@tremor/react"; +import { DocumentIcon2 } from "@/components/icons/icons"; +import useSWR from "swr"; +import { ThreeDotsLoader } from "@/components/Loading"; +import { AdminPageTitle } from "@/components/admin/Title"; +import { Lock } from "@phosphor-icons/react"; + +function Main() { + const { + data: isApiKeySet, + error, + mutate, + isLoading, + } = useSWR<{ + unstructured_api_key: string | null; + }>("/api/search-settings/unstructured-api-key-set", (url: string) => + fetch(url).then((res) => res.json()) + ); + + const [apiKey, setApiKey] = useState(""); + + const handleSave = async () => { + try { + await fetch( + `/api/search-settings/upsert-unstructured-api-key?unstructured_api_key=${apiKey}`, + { + method: "PUT", + } + ); + } catch (error) { + console.error("Failed to save API key:", error); + } + mutate(); + }; + + const handleDelete = async () => { + try { + await fetch("/api/search-settings/delete-unstructured-api-key", { + method: "DELETE", + }); + setApiKey(""); + } catch (error) { + console.error("Failed to delete API key:", error); + } + mutate(); + }; + + if (isLoading) { + return ; + } + return ( +
+ +

+ Unstructured API Integration +

+ +
+

+ Unstructured effortlessly extracts and transforms complex data from + difficult-to-use formats like HTML, PDF, CSV, PNG, PPTX, and more. + Enter an API key to enable this powerful document processing. If not + set, standard document processing will be used. +

+

+ Learn more about Unstructured{" "} + + here + + . +

+
+ {isApiKeySet ? ( +
+ •••••••••••••••• + +
+ ) : ( + setApiKey(e.target.value)} + className="w-full p-3 border rounded-md bg-background text-text focus:ring-2 focus:ring-blue-500 transition duration-200" + /> + )} +
+
+ {isApiKeySet ? ( + <> + +

+ Delete the current API key before updating. +

+ + ) : ( + + )} +
+
+
+
+ ); +} + +function Page() { + return ( +
+ } + /> +
+
+ ); +} + +export default Page; diff --git a/web/src/components/admin/ClientLayout.tsx b/web/src/components/admin/ClientLayout.tsx index 961f5a9c2..e0415f845 100644 --- a/web/src/components/admin/ClientLayout.tsx +++ b/web/src/components/admin/ClientLayout.tsx @@ -21,6 +21,7 @@ import { AssistantsIconSkeleton, ClosedBookIcon, SearchIcon, + DocumentIcon2, } from "@/components/icons/icons"; import { UserRole } from "@/lib/types"; import { FiActivity, FiBarChart2 } from "react-icons/fi"; @@ -29,7 +30,6 @@ import { User } from "@/lib/types"; import { usePathname } from "next/navigation"; import { SettingsContext } from "../settings/SettingsProvider"; import { useContext } from "react"; -import { CustomTooltip } from "../tooltip/CustomTooltip"; export function ClientLayout({ user, @@ -246,6 +246,15 @@ export function ClientLayout({ ), link: "/admin/configuration/search", }, + { + name: ( +
+ +
Document Processing
+
+ ), + link: "/admin/configuration/document-processing", + }, ], }, { diff --git a/web/src/components/icons/icons.tsx b/web/src/components/icons/icons.tsx index 1f6f1f2e8..186e4473c 100644 --- a/web/src/components/icons/icons.tsx +++ b/web/src/components/icons/icons.tsx @@ -2791,6 +2791,31 @@ export const MacIcon = ({ ); }; +export const DocumentIcon2 = ({ + size = 16, + className = defaultTailwindCSS, +}: IconProps) => { + return ( + + + + ); +}; + export const WindowsIcon = ({ size = 16, className = "my-auto flex flex-shrink-0 ",