Unstructured UI (#2636)

* checkpoint

* k

* k

* need frontend

* add api key check + ui component

* add proper ports + icons + functions

* k

* k

* k

---------

Co-authored-by: pablodanswer <pablo@danswer.ai>
This commit is contained in:
Yuhong Sun 2024-09-30 21:50:03 -07:00 committed by GitHub
parent 140c5b3957
commit e229d27734
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
18 changed files with 340 additions and 38 deletions

View File

@ -342,7 +342,6 @@ def get_database_strategy(
strategy = DatabaseStrategy(
access_token_db, lifetime_seconds=SESSION_EXPIRE_TIME_SECONDS # type: ignore
)
return strategy

View File

@ -338,6 +338,7 @@ INDEXING_TRACER_INTERVAL = int(os.environ.get("INDEXING_TRACER_INTERVAL", 0))
# exception without aborting the attempt.
INDEXING_EXCEPTION_LIMIT = int(os.environ.get("INDEXING_EXCEPTION_LIMIT", 0))
#####
# Miscellaneous
#####

View File

@ -48,6 +48,7 @@ UNNAMED_KEY_PLACEHOLDER = "Unnamed"
# Key-Value store keys
KV_REINDEX_KEY = "needs_reindexing"
KV_SEARCH_SETTINGS = "search_settings"
KV_UNSTRUCTURED_API_KEY = "unstructured_api_key"
KV_USER_STORE_KEY = "INVITED_USERS"
KV_NO_AUTH_USER_PREFERENCES_KEY = "no_auth_user_preferences"
KV_CRED_KEY = "credential_id_{}"

View File

@ -194,8 +194,8 @@ class BlobStorageConnector(LoadConnector, PollConnector):
try:
text = extract_file_text(
name,
BytesIO(downloaded_file),
file_name=name,
break_on_unprocessable=False,
)
batch.append(

View File

@ -519,7 +519,9 @@ class ConfluenceConnector(LoadConnector, PollConnector):
return None
extracted_text = extract_file_text(
attachment["title"], io.BytesIO(response.content), False
io.BytesIO(response.content),
file_name=attachment["title"],
break_on_unprocessable=False,
)
if len(extracted_text) > CONFLUENCE_CONNECTOR_ATTACHMENT_CHAR_COUNT_THRESHOLD:
logger.warning(
@ -625,7 +627,7 @@ class ConfluenceConnector(LoadConnector, PollConnector):
)
unused_attachments.extend(unused_page_attachments)
page_text += attachment_text
page_text += "\n" + attachment_text if attachment_text else ""
comments_text = self._fetch_comments(self.confluence_client, page_id)
page_text += comments_text
doc_metadata: dict[str, str | list[str]] = {"Wiki Space Name": self.space}

View File

@ -97,8 +97,8 @@ class DropboxConnector(LoadConnector, PollConnector):
link = self._get_shared_link(entry.path_display)
try:
text = extract_file_text(
entry.name,
BytesIO(downloaded_file),
file_name=entry.name,
break_on_unprocessable=False,
)
batch.append(

View File

@ -74,13 +74,14 @@ def _process_file(
)
# Using the PDF reader function directly to pass in password cleanly
elif extension == ".pdf":
elif extension == ".pdf" and pdf_pass is not None:
file_content_raw, file_metadata = read_pdf_file(file=file, pdf_pass=pdf_pass)
else:
file_content_raw = extract_file_text(
file_name=file_name,
file=file,
file_name=file_name,
break_on_unprocessable=True,
)
all_metadata = {**metadata, **file_metadata} if metadata else file_metadata

View File

@ -36,6 +36,8 @@ from danswer.connectors.models import Section
from danswer.file_processing.extract_file_text import docx_to_text
from danswer.file_processing.extract_file_text import pptx_to_text
from danswer.file_processing.extract_file_text import read_pdf_file
from danswer.file_processing.unstructured import get_unstructured_api_key
from danswer.file_processing.unstructured import unstructured_to_text
from danswer.utils.batching import batch_generator
from danswer.utils.logger import setup_logger
@ -327,16 +329,24 @@ def extract_text(file: dict[str, str], service: discovery.Resource) -> str:
GDriveMimeType.MARKDOWN.value,
]:
return service.files().get_media(fileId=file["id"]).execute().decode("utf-8")
elif mime_type == GDriveMimeType.WORD_DOC.value:
if mime_type in [
GDriveMimeType.WORD_DOC.value,
GDriveMimeType.POWERPOINT.value,
GDriveMimeType.PDF.value,
]:
response = service.files().get_media(fileId=file["id"]).execute()
return docx_to_text(file=io.BytesIO(response))
elif mime_type == GDriveMimeType.PDF.value:
response = service.files().get_media(fileId=file["id"]).execute()
text, _ = read_pdf_file(file=io.BytesIO(response))
return text
elif mime_type == GDriveMimeType.POWERPOINT.value:
response = service.files().get_media(fileId=file["id"]).execute()
return pptx_to_text(file=io.BytesIO(response))
if get_unstructured_api_key():
return unstructured_to_text(
file=io.BytesIO(response), file_name=file.get("name", file["id"])
)
if mime_type == GDriveMimeType.WORD_DOC.value:
return docx_to_text(file=io.BytesIO(response))
elif mime_type == GDriveMimeType.PDF.value:
text, _ = read_pdf_file(file=io.BytesIO(response))
return text
elif mime_type == GDriveMimeType.POWERPOINT.value:
return pptx_to_text(file=io.BytesIO(response))
return UNSUPPORTED_FILE_TYPE_CONTENT

View File

@ -40,8 +40,8 @@ def _convert_driveitem_to_document(
driveitem: DriveItem,
) -> Document:
file_text = extract_file_text(
file_name=driveitem.name,
file=io.BytesIO(driveitem.get_content().execute_query().value),
file_name=driveitem.name,
break_on_unprocessable=False,
)

View File

@ -20,6 +20,8 @@ from pypdf.errors import PdfStreamError
from danswer.configs.constants import DANSWER_METADATA_FILENAME
from danswer.file_processing.html_utils import parse_html_page_basic
from danswer.file_processing.unstructured import get_unstructured_api_key
from danswer.file_processing.unstructured import unstructured_to_text
from danswer.utils.logger import setup_logger
logger = setup_logger()
@ -331,9 +333,10 @@ def file_io_to_text(file: IO[Any]) -> str:
def extract_file_text(
file_name: str | None,
file: IO[Any],
file_name: str,
break_on_unprocessable: bool = True,
extension: str | None = None,
) -> str:
extension_to_function: dict[str, Callable[[IO[Any]], str]] = {
".pdf": pdf_to_text,
@ -345,22 +348,29 @@ def extract_file_text(
".html": parse_html_page_basic,
}
def _process_file() -> str:
if file_name:
extension = get_file_ext(file_name)
if check_file_ext_is_valid(extension):
return extension_to_function.get(extension, file_io_to_text)(file)
try:
if get_unstructured_api_key():
return unstructured_to_text(file, file_name)
# Either the file somehow has no name or the extension is not one that we are familiar with
if file_name or extension:
if extension is not None:
final_extension = extension
elif file_name is not None:
final_extension = get_file_ext(file_name)
if check_file_ext_is_valid(final_extension):
return extension_to_function.get(final_extension, file_io_to_text)(file)
# Either the file somehow has no name or the extension is not one that we recognize
if is_text_file(file):
return file_io_to_text(file)
raise ValueError("Unknown file extension and unknown text encoding")
try:
return _process_file()
except Exception as e:
if break_on_unprocessable:
raise RuntimeError(f"Failed to process file: {str(e)}") from e
logger.warning(f"Failed to process file: {str(e)}")
raise RuntimeError(
f"Failed to process file {file_name or 'Unknown'}: {str(e)}"
) from e
logger.warning(f"Failed to process file {file_name or 'Unknown'}: {str(e)}")
return ""

View File

@ -0,0 +1,67 @@
from typing import Any
from typing import cast
from typing import IO
from unstructured.staging.base import dict_to_elements
from unstructured_client import UnstructuredClient # type: ignore
from unstructured_client.models import operations # type: ignore
from unstructured_client.models import shared
from danswer.configs.constants import KV_UNSTRUCTURED_API_KEY
from danswer.dynamic_configs.factory import get_dynamic_config_store
from danswer.dynamic_configs.interface import ConfigNotFoundError
from danswer.utils.logger import setup_logger
logger = setup_logger()
def get_unstructured_api_key() -> str | None:
    """Look up the Unstructured API key in the dynamic config store.

    Returns:
        The value stored under ``KV_UNSTRUCTURED_API_KEY``, or ``None`` when
        the store raises ``ConfigNotFoundError`` for that key.
    """
    config_store = get_dynamic_config_store()
    try:
        stored_key = config_store.load(KV_UNSTRUCTURED_API_KEY)
    except ConfigNotFoundError:
        # Nothing is saved under this key.
        return None
    return cast(str, stored_key)
def update_unstructured_api_key(api_key: str) -> None:
    """Save ``api_key`` under ``KV_UNSTRUCTURED_API_KEY`` in the dynamic config store."""
    get_dynamic_config_store().store(KV_UNSTRUCTURED_API_KEY, api_key)
def delete_unstructured_api_key() -> None:
    """Delete the ``KV_UNSTRUCTURED_API_KEY`` entry from the dynamic config store."""
    get_dynamic_config_store().delete(KV_UNSTRUCTURED_API_KEY)
def _sdk_partition_request(
    file: IO[Any], file_name: str, **kwargs: Any
) -> operations.PartitionRequest:
    """Build an Unstructured SDK ``PartitionRequest`` for a single file.

    Args:
        file: Stream to upload; it is read in full via ``file.read()``.
        file_name: Name attached to the uploaded content.
        **kwargs: Forwarded unchanged to ``shared.PartitionParameters``.

    Returns:
        The constructed ``operations.PartitionRequest``.

    Raises:
        Exception: Any error raised while reading the file or building the
            request is logged and then re-raised unchanged.
    """
    try:
        uploaded_file = shared.Files(content=file.read(), file_name=file_name)
        parameters = shared.PartitionParameters(files=uploaded_file, **kwargs)
        return operations.PartitionRequest(partition_parameters=parameters)
    except Exception as e:
        logger.error(f"Error creating partition request for file {file_name}: {str(e)}")
        raise
def unstructured_to_text(file: IO[Any], file_name: str) -> str:
    """Extract text from a file by sending it to the Unstructured API.

    Args:
        file: Stream holding the file content; it is read in full when the
            partition request is built.
        file_name: Name sent along with the content.

    Returns:
        The string form of every returned element, joined by blank lines.

    Raises:
        ValueError: If the API response status code is not 200.
    """
    logger.debug(f"Starting to read file: {file_name}")
    req = _sdk_partition_request(file, file_name, strategy="auto")

    unstructured_client = UnstructuredClient(api_key_auth=get_unstructured_api_key())

    response = unstructured_client.general.partition(req)  # type: ignore

    # Check the status before touching the payload: a failed response may carry
    # no elements, and converting them first would raise an unrelated error that
    # hides the real cause.
    if response.status_code != 200:
        err = f"Received unexpected status code {response.status_code} from Unstructured API."
        logger.error(err)
        raise ValueError(err)

    elements = dict_to_elements(response.elements)
    return "\n\n".join(str(el) for el in elements)

View File

@ -21,6 +21,9 @@ from danswer.db.search_settings import get_secondary_search_settings
from danswer.db.search_settings import update_current_search_settings
from danswer.db.search_settings import update_search_settings_status
from danswer.document_index.factory import get_default_document_index
from danswer.file_processing.unstructured import delete_unstructured_api_key
from danswer.file_processing.unstructured import get_unstructured_api_key
from danswer.file_processing.unstructured import update_unstructured_api_key
from danswer.natural_language_processing.search_nlp_models import clean_model_name
from danswer.search.models import SavedSearchSettings
from danswer.search.models import SearchSettingsCreationRequest
@ -30,7 +33,6 @@ from danswer.server.models import IdReturn
from danswer.utils.logger import setup_logger
from shared_configs.configs import ALT_INDEX_SUFFIX
router = APIRouter(prefix="/search-settings")
logger = setup_logger()
@ -196,3 +198,27 @@ def update_saved_search_settings(
update_current_search_settings(
search_settings=search_settings, db_session=db_session
)
@router.get("/unstructured-api-key-set")
def unstructured_api_key_set(
    _: User | None = Depends(current_admin_user),
) -> bool:
    """Report whether an Unstructured API key is currently stored.

    The unused ``_`` parameter depends on ``current_admin_user``.

    Returns:
        ``True`` if ``get_unstructured_api_key()`` returns a value, ``False``
        if it returns ``None``. The key itself is never returned or logged.
    """
    # The key is a secret: do not print or log it here.
    api_key = get_unstructured_api_key()
    return api_key is not None
@router.put("/upsert-unstructured-api-key")
def upsert_unstructured_api_key(
    unstructured_api_key: str,
    _: User | None = Depends(current_admin_user),
) -> None:
    """Store the given Unstructured API key via ``update_unstructured_api_key``.

    The unused ``_`` parameter depends on ``current_admin_user``.

    NOTE(review): ``unstructured_api_key`` is a plain ``str`` parameter, so it
    presumably arrives in the URL query string and could show up in access
    logs; confirm, and consider accepting it in the request body instead.
    """
    update_unstructured_api_key(unstructured_api_key)
@router.delete("/delete-unstructured-api-key")
def delete_unstructured_api_key_endpoint(
    _: User | None = Depends(current_admin_user),
) -> None:
    """Remove the stored Unstructured API key via ``delete_unstructured_api_key``.

    The unused ``_`` parameter depends on ``current_admin_user``.
    """
    delete_unstructured_api_key()

View File

@ -588,7 +588,10 @@ def upload_files_for_chat(
# if the file is a doc, extract text and store that so we don't need
# to re-extract it every time we send a message
if file_type == ChatFileType.DOC:
extracted_text = extract_file_text(file_name=file.filename, file=file.file)
extracted_text = extract_file_text(
file=file.file,
file_name=file.filename or "",
)
text_file_id = str(uuid.uuid4())
file_store.save_file(
file_name=text_file_id,

View File

@ -2,7 +2,7 @@ aiohttp==3.10.2
alembic==1.10.4
asyncpg==0.27.0
atlassian-python-api==3.37.0
beautifulsoup4==4.12.2
beautifulsoup4==4.12.3
boto3==1.34.84
celery==5.3.4
chardet==5.2.0
@ -19,9 +19,9 @@ google-auth-oauthlib==1.0.0
# GPT4All library has issues running on Macs and python:3.11.4-slim-bookworm
# will reintroduce this when library version catches up
# gpt4all==2.0.2
httpcore==0.16.3
httpx[http2]==0.23.3
httpx-oauth==0.11.2
httpcore==1.0.5
httpx[http2]==0.27.0
httpx-oauth==0.15.1
huggingface-hub==0.20.1
jira==3.5.1
jsonref==1.1.0
@ -46,7 +46,7 @@ PyGithub==1.58.2
python-dateutil==2.8.2
python-gitlab==3.9.0
python-pptx==0.6.23
pypdf==3.17.0
pypdf==4.3.0
pytest-mock==3.12.0
pytest-playwright==0.3.2
python-docx==1.1.2
@ -67,6 +67,8 @@ supervisor==4.2.5
tiktoken==0.7.0
timeago==1.0.16
transformers==4.39.2
unstructured==0.15.1
unstructured-client==0.25.4
uvicorn==0.21.1
zulip==0.8.2
hubspot-api-client==8.1.0

View File

@ -1,5 +1,7 @@
import os
import time
from unittest.mock import MagicMock
from unittest.mock import patch
import pytest
@ -24,7 +26,13 @@ def confluence_connector() -> ConfluenceConnector:
return connector
def test_confluence_connector_basic(confluence_connector: ConfluenceConnector) -> None:
@patch(
"danswer.file_processing.extract_file_text.get_unstructured_api_key",
return_value=None,
)
def test_confluence_connector_basic(
mock_get_api_key: MagicMock, confluence_connector: ConfluenceConnector
) -> None:
doc_batch_generator = confluence_connector.poll_source(0, time.time())
doc_batch = next(doc_batch_generator)
@ -41,7 +49,7 @@ def test_confluence_connector_basic(confluence_connector: ConfluenceConnector) -
assert len(doc.sections) == 1
section = doc.sections[0]
assert section.text == "test123small"
assert section.text == "test123\nsmall"
assert (
section.link
== "https://danswerai.atlassian.net/wiki/spaces/DailyConne/overview"

View File

@ -0,0 +1,138 @@
"use client";
import { useState } from "react";
import { Button, Card } from "@tremor/react";
import { DocumentIcon2 } from "@/components/icons/icons";
import useSWR from "swr";
import { ThreeDotsLoader } from "@/components/Loading";
import { AdminPageTitle } from "@/components/admin/Title";
import { Lock } from "@phosphor-icons/react";
/**
 * Admin form for the Unstructured API key.
 *
 * Shows a locked placeholder plus a delete button when a key is stored,
 * otherwise a text input plus a save button. The key-set flag is re-fetched
 * after every save or delete.
 */
function Main() {
  // The status endpoint answers with a bare JSON boolean.
  const {
    data: isApiKeySet,
    mutate,
    isLoading,
  } = useSWR<boolean>(
    "/api/search-settings/unstructured-api-key-set",
    (url: string) => fetch(url).then((res) => res.json())
  );

  const [apiKey, setApiKey] = useState("");

  const handleSave = async () => {
    try {
      // Encode the key so characters such as "&", "+", "#" or "%" reach the
      // server unchanged instead of being parsed as query-string syntax.
      const response = await fetch(
        `/api/search-settings/upsert-unstructured-api-key?unstructured_api_key=${encodeURIComponent(
          apiKey
        )}`,
        {
          method: "PUT",
        }
      );
      // fetch only rejects on network failure; report HTTP errors as well.
      if (!response.ok) {
        console.error("Failed to save API key:", response.status);
      }
    } catch (error) {
      console.error("Failed to save API key:", error);
    }
    mutate();
  };

  const handleDelete = async () => {
    try {
      const response = await fetch(
        "/api/search-settings/delete-unstructured-api-key",
        {
          method: "DELETE",
        }
      );
      if (!response.ok) {
        console.error("Failed to delete API key:", response.status);
      }
      setApiKey("");
    } catch (error) {
      console.error("Failed to delete API key:", error);
    }
    mutate();
  };

  if (isLoading) {
    return <ThreeDotsLoader />;
  }

  return (
    <div className="container mx-auto p-4">
      <Card className="mb-8 max-w-2xl bg-white text-text shadow-lg rounded-lg">
        <h3 className="text-2xl text-text-800 font-bold mb-4 text-text border-b border-b-border pb-2">
          Unstructured API Integration
        </h3>

        <div className="space-y-4">
          <p className="text-text-600">
            Unstructured effortlessly extracts and transforms complex data from
            difficult-to-use formats like HTML, PDF, CSV, PNG, PPTX, and more.
            Enter an API key to enable this powerful document processing. If not
            set, standard document processing will be used.
          </p>
          <p className="text-text-600">
            Learn more about Unstructured{" "}
            <a
              href="https://unstructured.io/docs"
              target="_blank"
              rel="noopener noreferrer"
              className="text-blue-500 hover:underline font-medium"
            >
              here
            </a>
            .
          </p>
          <div className="mt-4">
            {isApiKeySet ? (
              <div className="w-full p-3 border rounded-md bg-background text-text flex items-center">
                <span className="flex-grow"></span>
                <Lock className="h-5 w-5 text-gray-400" />
              </div>
            ) : (
              <input
                type="text"
                placeholder="Enter API Key"
                value={apiKey}
                onChange={(e) => setApiKey(e.target.value)}
                className="w-full p-3 border rounded-md bg-background text-text focus:ring-2 focus:ring-blue-500 transition duration-200"
              />
            )}
          </div>
          <div className="flex space-x-4 mt-6">
            {isApiKeySet ? (
              <>
                <Button
                  color="red"
                  onClick={handleDelete}
                  variant="secondary"
                  className="bg-red-100 text-red-600 hover:bg-red-400 transition duration-200"
                >
                  Delete API Key
                </Button>
                <p className="text-text-600 my-auto">
                  Delete the current API key before updating.
                </p>
              </>
            ) : (
              <Button
                onClick={handleSave}
                className="bg-blue-500 text-white hover:bg-blue-600 transition duration-200"
              >
                Save API Key
              </Button>
            )}
          </div>
        </div>
      </Card>
    </div>
  );
}
/** Admin page wrapper: renders the "Document Processing" title above the Main form. */
function Page() {
  const titleIcon = <DocumentIcon2 size={32} className="my-auto" />;

  return (
    <div className="mx-auto container">
      <AdminPageTitle title="Document Processing" icon={titleIcon} />
      <Main />
    </div>
  );
}
export default Page;

View File

@ -21,6 +21,7 @@ import {
AssistantsIconSkeleton,
ClosedBookIcon,
SearchIcon,
DocumentIcon2,
} from "@/components/icons/icons";
import { UserRole } from "@/lib/types";
import { FiActivity, FiBarChart2 } from "react-icons/fi";
@ -29,7 +30,6 @@ import { User } from "@/lib/types";
import { usePathname } from "next/navigation";
import { SettingsContext } from "../settings/SettingsProvider";
import { useContext } from "react";
import { CustomTooltip } from "../tooltip/CustomTooltip";
export function ClientLayout({
user,
@ -246,6 +246,15 @@ export function ClientLayout({
),
link: "/admin/configuration/search",
},
{
name: (
<div className="flex">
<DocumentIcon2 className="text-icon-settings-sidebar" />
<div className="ml-1">Document Processing</div>
</div>
),
link: "/admin/configuration/document-processing",
},
],
},
{

View File

@ -2791,6 +2791,31 @@ export const MacIcon = ({
);
};
/**
 * Stroke-only inline SVG icon drawn with `currentColor`.
 *
 * `size` is the rendered width and height in pixels; `className` is appended
 * after the generated size classes.
 */
export const DocumentIcon2 = ({
  size = 16,
  className = defaultTailwindCSS,
}: IconProps) => {
  const dimension = `${size}px`;
  const svgClassName = `w-[${dimension}] h-[${dimension}] ${className}`;

  return (
    <svg
      style={{ width: dimension, height: dimension }}
      className={svgClassName}
      xmlns="http://www.w3.org/2000/svg"
      width="200"
      height="200"
      viewBox="0 0 24 24"
    >
      <path
        fill="none"
        stroke="currentColor"
        strokeLinecap="round"
        strokeLinejoin="round"
        strokeWidth="1.5"
        d="M19.5 14.25v-2.625a3.375 3.375 0 0 0-3.375-3.375h-1.5A1.125 1.125 0 0 1 13.5 7.125v-1.5a3.375 3.375 0 0 0-3.375-3.375H8.25m0 12.75h7.5m-7.5 3H12M10.5 2.25H5.625c-.621 0-1.125.504-1.125 1.125v17.25c0 .621.504 1.125 1.125 1.125h12.75c.621 0 1.125-.504 1.125-1.125V11.25a9 9 0 0 0-9-9Z"
      />
    </svg>
  );
};
export const WindowsIcon = ({
size = 16,
className = "my-auto flex flex-shrink-0 ",