* checkpoint

* k

* k

* k

* Fixed Slack API calls

* missed one

---------

Co-authored-by: hagen-danswer <hagen@danswer.ai>
Yuhong Sun 2024-10-24 16:45:48 -07:00 committed by GitHub
parent 9f50417109
commit b49a9ab171
24 changed files with 11372 additions and 50 deletions

View File

@ -70,3 +70,12 @@ class DocumentAccess(ExternalAccess):
user_groups=set(user_groups),
is_public=is_public,
)
default_public_access = DocumentAccess(
external_user_emails=set(),
external_user_group_ids=set(),
user_emails=set(),
user_groups=set(),
is_public=True,
)

View File

@ -70,6 +70,7 @@ KV_CUSTOMER_UUID_KEY = "customer_uuid"
KV_INSTANCE_DOMAIN_KEY = "instance_domain"
KV_ENTERPRISE_SETTINGS_KEY = "danswer_enterprise_settings"
KV_CUSTOM_ANALYTICS_SCRIPT_KEY = "__custom_analytics_script__"
KV_DOCUMENTS_SEEDED_KEY = "documents_seeded"
CELERY_VESPA_SYNC_BEAT_LOCK_TIMEOUT = 60
CELERY_PRIMARY_WORKER_LOCK_TIMEOUT = 120

View File

@ -15,7 +15,6 @@ from danswer.connectors.cross_connector_utils.miscellaneous_utils import time_st
from danswer.connectors.cross_connector_utils.rate_limit_wrapper import (
rate_limit_builder,
)
from danswer.connectors.cross_connector_utils.retry_wrapper import retry_builder
from danswer.connectors.interfaces import GenerateDocumentsOutput
from danswer.connectors.interfaces import PollConnector
from danswer.connectors.interfaces import SecondsSinceUnixEpoch
@ -24,6 +23,7 @@ from danswer.connectors.models import Document
from danswer.connectors.models import Section
from danswer.file_processing.html_utils import parse_html_page_basic
from danswer.utils.logger import setup_logger
from danswer.utils.retry_wrapper import retry_builder
logger = setup_logger()

View File

@ -10,7 +10,6 @@ from danswer.configs.constants import DocumentSource
from danswer.connectors.cross_connector_utils.rate_limit_wrapper import (
rate_limit_builder,
)
from danswer.connectors.cross_connector_utils.retry_wrapper import retry_builder
from danswer.connectors.interfaces import GenerateDocumentsOutput
from danswer.connectors.interfaces import LoadConnector
from danswer.connectors.interfaces import PollConnector
@ -19,6 +18,7 @@ from danswer.connectors.models import BasicExpertInfo
from danswer.connectors.models import ConnectorMissingCredentialError
from danswer.connectors.models import Document
from danswer.connectors.models import Section
from danswer.utils.retry_wrapper import retry_builder
CLICKUP_API_BASE_URL = "https://api.clickup.com/api/v2"

View File

@ -14,7 +14,6 @@ from danswer.connectors.cross_connector_utils.miscellaneous_utils import time_st
from danswer.connectors.cross_connector_utils.rate_limit_wrapper import (
rate_limit_builder,
)
from danswer.connectors.cross_connector_utils.retry_wrapper import retry_builder
from danswer.connectors.interfaces import GenerateDocumentsOutput
from danswer.connectors.interfaces import PollConnector
from danswer.connectors.interfaces import SecondsSinceUnixEpoch
@ -24,6 +23,7 @@ from danswer.connectors.models import Document
from danswer.connectors.models import Section
from danswer.file_processing.html_utils import parse_html_page_basic
from danswer.utils.logger import setup_logger
from danswer.utils.retry_wrapper import retry_builder
logger = setup_logger()

View File

@ -11,7 +11,6 @@ from danswer.configs.constants import DocumentSource
from danswer.connectors.cross_connector_utils.rate_limit_wrapper import (
rate_limit_builder,
)
from danswer.connectors.cross_connector_utils.retry_wrapper import retry_builder
from danswer.connectors.document360.utils import flatten_child_categories
from danswer.connectors.interfaces import GenerateDocumentsOutput
from danswer.connectors.interfaces import LoadConnector
@ -22,6 +21,7 @@ from danswer.connectors.models import ConnectorMissingCredentialError
from danswer.connectors.models import Document
from danswer.connectors.models import Section
from danswer.file_processing.html_utils import parse_html_page_basic
from danswer.utils.retry_wrapper import retry_builder
# Limitations and Potential Improvements
# 1. The "Categories themselves contain potentially relevant information" but they're not pulled in

View File

@ -19,7 +19,6 @@ from danswer.configs.app_configs import GOOGLE_DRIVE_ONLY_ORG_PUBLIC
from danswer.configs.app_configs import INDEX_BATCH_SIZE
from danswer.configs.constants import DocumentSource
from danswer.configs.constants import IGNORE_FOR_QA
from danswer.connectors.cross_connector_utils.retry_wrapper import retry_builder
from danswer.connectors.google_drive.connector_auth import get_google_drive_creds
from danswer.connectors.google_drive.constants import (
DB_CREDENTIALS_DICT_DELEGATED_USER_KEY,
@ -40,6 +39,7 @@ from danswer.file_processing.unstructured import get_unstructured_api_key
from danswer.file_processing.unstructured import unstructured_to_text
from danswer.utils.batching import batch_generator
from danswer.utils.logger import setup_logger
from danswer.utils.retry_wrapper import retry_builder
logger = setup_logger()

View File

@ -10,9 +10,9 @@ from slack_sdk import WebClient
from slack_sdk.errors import SlackApiError
from slack_sdk.web import SlackResponse
from danswer.connectors.cross_connector_utils.retry_wrapper import retry_builder
from danswer.connectors.models import BasicExpertInfo
from danswer.utils.logger import setup_logger
from danswer.utils.retry_wrapper import retry_builder
logger = setup_logger()

View File

@ -341,6 +341,8 @@ def add_credential_to_connector(
access_type: AccessType,
groups: list[int] | None,
auto_sync_options: dict | None = None,
initial_status: ConnectorCredentialPairStatus = ConnectorCredentialPairStatus.ACTIVE,
last_successful_index_time: datetime | None = None,
) -> StatusResponse:
connector = fetch_connector_by_id(connector_id, db_session)
credential = fetch_credential_by_id(credential_id, user, db_session)
@ -384,9 +386,10 @@ def add_credential_to_connector(
connector_id=connector_id,
credential_id=credential_id,
name=cc_pair_name,
status=ConnectorCredentialPairStatus.ACTIVE,
status=initial_status,
access_type=access_type,
auto_sync_options=auto_sync_options,
last_successful_index_time=last_successful_index_time,
)
db_session.add(association)
db_session.flush() # make sure the association has an id
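
The two new parameters let callers create a cc-pair that starts out paused and already reports a successful index run. A minimal sketch of such a call, mirroring how the document-seeding flow later in this commit uses it (db_session and connector_id are assumed to exist in the surrounding context):

from datetime import datetime, timezone

from danswer.db.connector_credential_pair import add_credential_to_connector
from danswer.db.credentials import PUBLIC_CREDENTIAL_ID
from danswer.db.enums import AccessType, ConnectorCredentialPairStatus

# Sketch: attach the public credential, but leave the pair paused and
# pretend it already finished an index run "now"
result = add_credential_to_connector(
    db_session=db_session,  # assumed active Session
    user=None,
    connector_id=connector_id,  # assumed existing connector
    credential_id=PUBLIC_CREDENTIAL_ID,
    cc_pair_name="Sample Use Cases",
    access_type=AccessType.PUBLIC,
    groups=None,
    initial_status=ConnectorCredentialPairStatus.PAUSED,
    last_successful_index_time=datetime.now(timezone.utc),
)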

View File

@ -40,6 +40,8 @@ CREDENTIAL_PERMISSIONS_TO_IGNORE = {
DocumentSource.MEDIAWIKI,
}
PUBLIC_CREDENTIAL_ID = 0
def _add_user_filters(
stmt: Select,
@ -384,12 +386,11 @@ def delete_credential(
def create_initial_public_credential(db_session: Session) -> None:
public_cred_id = 0
error_msg = (
"DB is not in a valid initial state."
"There must exist an empty public credential for data connectors that do not require additional Auth."
)
first_credential = fetch_credential_by_id(public_cred_id, None, db_session)
first_credential = fetch_credential_by_id(PUBLIC_CREDENTIAL_ID, None, db_session)
if first_credential is not None:
if first_credential.credential_json != {} or first_credential.user is not None:
@ -397,7 +398,7 @@ def create_initial_public_credential(db_session: Session) -> None:
return
credential = Credential(
id=public_cred_id,
id=PUBLIC_CREDENTIAL_ID,
credential_json={},
user_id=None,
)

View File

@ -1,5 +1,6 @@
from collections.abc import Sequence
from datetime import datetime
from datetime import timedelta
from datetime import timezone
from sqlalchemy import and_
@ -66,6 +67,32 @@ def create_index_attempt(
return new_attempt.id
def mock_successful_index_attempt(
connector_credential_pair_id: int,
search_settings_id: int,
docs_indexed: int,
db_session: Session,
) -> int:
"""Should not be used in any user triggered flows"""
db_time = func.now()
new_attempt = IndexAttempt(
connector_credential_pair_id=connector_credential_pair_id,
search_settings_id=search_settings_id,
from_beginning=True,
status=IndexingStatus.SUCCESS,
total_docs_indexed=docs_indexed,
new_docs_indexed=docs_indexed,
# Needs to be a convincing, non-zero duration; if it were 0,
# the derived indexing rate would come out to infinity
time_started=db_time - timedelta(seconds=1.92),
time_updated=db_time,
)
db_session.add(new_attempt)
db_session.commit()
return new_attempt.id
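
The 1.92 second offset matters because the UI derives an indexing rate from the attempt duration. Assuming the rate is simply documents divided by elapsed seconds, the arithmetic looks like this (numbers illustrative):

from datetime import timedelta

docs_indexed = 154  # illustrative count
elapsed = timedelta(seconds=1.92)

# A zero-length attempt would divide by zero ("infinity");
# the small fixed offset keeps the mocked attempt's rate finite (~80 docs/sec here).
rate = docs_indexed / elapsed.total_seconds()
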
def get_in_progress_index_attempts(
connector_id: int | None,
db_session: Session,

View File

@ -103,6 +103,9 @@ class DefaultIndexingEmbedder(IndexingEmbedder):
self,
chunks: list[DocAwareChunk],
) -> list[IndexChunk]:
"""Adds embeddings to the chunks, the title and metadata suffixes are added to the chunk as well
if they exist. If there is no space for it, it would have been thrown out at the chunking step.
"""
# All chunks at this point must have some non-empty content
flat_chunk_texts: list[str] = []
large_chunks_present = False
@ -121,6 +124,11 @@ class DefaultIndexingEmbedder(IndexingEmbedder):
flat_chunk_texts.append(chunk_text)
if chunk.mini_chunk_texts:
if chunk.large_chunk_reference_ids:
# A large chunk does not contain mini chunks; if a query matches the large chunk
# with a high score, the mini chunks would not be used anyway,
# and otherwise it should match the normal chunk instead
raise RuntimeError("Large chunk contains mini chunks")
flat_chunk_texts.extend(chunk.mini_chunk_texts)
embeddings = self.embedding_model.encode(

View File

@ -195,6 +195,8 @@ def index_doc_batch_prepare(
db_session: Session,
ignore_time_skip: bool = False,
) -> DocumentBatchPrepareContext | None:
"""This sets up the documents in the relational DB (source of truth) for permissions, metadata, etc.
This preceeds indexing it into the actual document index."""
documents = []
for document in document_batch:
empty_contents = not any(section.text.strip() for section in document.sections)

View File

@ -3,5 +3,6 @@ from danswer.key_value_store.store import PgRedisKVStore
def get_kv_store() -> KeyValueStore:
# This is the only implementation currently supported
# In the multi-tenant case, the tenant context is picked up automatically; it does not need to be passed in.
# It is read from the global thread-level variable.
return PgRedisKVStore()
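
For context, a minimal usage sketch of the store. The key name is borrowed from the seeding flow added in this commit, and the exception handling mirrors how that flow checks whether seeding already ran:

from danswer.key_value_store.factory import get_kv_store
from danswer.key_value_store.interface import KvKeyNotFoundError

kv_store = get_kv_store()  # tenant context comes from the thread-level variable
try:
    kv_store.load("documents_seeded")
except KvKeyNotFoundError:
    # First run for this tenant; do the one-time work, then record it
    kv_store.store("documents_seeded", True)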

View File

@ -14,6 +14,8 @@ class KvKeyNotFoundError(Exception):
class KeyValueStore:
# In the multi-tenant case, the tenant context is picked up automatically; it does not need to be passed in.
# It is read from the global thread-level variable.
@abc.abstractmethod
def store(self, key: str, val: JSON_ro, encrypt: bool = False) -> None:
raise NotImplementedError

View File

File diff suppressed because it is too large

View File

@ -0,0 +1,212 @@
import datetime
import json
import os
from typing import cast
from sqlalchemy.orm import Session
from danswer.access.models import default_public_access
from danswer.configs.constants import DEFAULT_BOOST
from danswer.configs.constants import DocumentSource
from danswer.configs.constants import KV_DOCUMENTS_SEEDED_KEY
from danswer.configs.model_configs import DEFAULT_DOCUMENT_ENCODER_MODEL
from danswer.connectors.models import Document
from danswer.connectors.models import IndexAttemptMetadata
from danswer.connectors.models import InputType
from danswer.connectors.models import Section
from danswer.db.connector import check_connectors_exist
from danswer.db.connector import create_connector
from danswer.db.connector_credential_pair import add_credential_to_connector
from danswer.db.credentials import PUBLIC_CREDENTIAL_ID
from danswer.db.document import check_docs_exist
from danswer.db.enums import AccessType
from danswer.db.enums import ConnectorCredentialPairStatus
from danswer.db.index_attempt import mock_successful_index_attempt
from danswer.db.search_settings import get_current_search_settings
from danswer.document_index.factory import get_default_document_index
from danswer.indexing.indexing_pipeline import index_doc_batch_prepare
from danswer.indexing.models import ChunkEmbedding
from danswer.indexing.models import DocMetadataAwareIndexChunk
from danswer.key_value_store.factory import get_kv_store
from danswer.key_value_store.interface import KvKeyNotFoundError
from danswer.server.documents.models import ConnectorBase
from danswer.utils.logger import setup_logger
from danswer.utils.retry_wrapper import retry_builder
logger = setup_logger()
def _create_indexable_chunks(
preprocessed_docs: list[dict],
) -> tuple[list[Document], list[DocMetadataAwareIndexChunk]]:
ids_to_documents = {}
chunks = []
for preprocessed_doc in preprocessed_docs:
document = Document(
id=preprocessed_doc["url"], # For Web connector, the URL is the ID
# The section is not really used past this point since we have already done the other processing
# for the chunking and embedding.
sections=[
Section(text=preprocessed_doc["content"], link=preprocessed_doc["url"])
],
source=DocumentSource.WEB,
semantic_identifier=preprocessed_doc["title"],
metadata={},
doc_updated_at=None,
primary_owners=[],
secondary_owners=[],
)
if preprocessed_doc["chunk_ind"] == 0:
ids_to_documents[document.id] = document
chunk = DocMetadataAwareIndexChunk(
chunk_id=preprocessed_doc["chunk_ind"],
blurb=preprocessed_doc["content"]
.split(".", 1)[0]
.split("!", 1)[0]
.split("?", 1)[0],
content=preprocessed_doc["content"],
source_links={0: preprocessed_doc["url"]},
section_continuation=False,
source_document=document,
title_prefix=preprocessed_doc["title"],
metadata_suffix_semantic="",
metadata_suffix_keyword="",
mini_chunk_texts=None,
large_chunk_reference_ids=[],
embeddings=ChunkEmbedding(
full_embedding=preprocessed_doc["content_embedding"],
mini_chunk_embeddings=[],
),
title_embedding=preprocessed_doc["title_embedding"],
tenant_id=None,
access=default_public_access,
document_sets=set(),
boost=DEFAULT_BOOST,
)
chunks.append(chunk)
return list(ids_to_documents.values()), chunks
def seed_initial_documents(db_session: Session) -> None:
"""
Seed initial documents so users don't start with an empty index.
Documents are only loaded if:
- This is the first setup (if the user deletes the docs, we don't load them again)
- The index is empty: there are no docs and no (non-default) connectors
- The user has not updated the embedding model
- If they have, then we would have to actually index the website
- If the embedding model was already updated on server startup, they're not a new user
Note that regardless of the search settings, the default documents are always loaded with
the predetermined chunk sizes and single-pass embedding.
Steps are as follows:
- Check whether this needs to run
- Create the connector representing the seeded documents
- Create the cc-pair (attaching the public credential), mocking values like the last successful index time
- Index the documents into Postgres
- Index the documents into Vespa
- Create a mock index attempt with mocked times
"""
logger.info("Seeding initial documents")
kv_store = get_kv_store()
try:
kv_store.load(KV_DOCUMENTS_SEEDED_KEY)
logger.info("Documents already seeded, skipping")
return
except KvKeyNotFoundError:
pass
if check_docs_exist(db_session):
logger.info("Documents already exist, skipping")
return
if check_connectors_exist(db_session):
logger.info("Connectors already exist, skipping")
return
search_settings = get_current_search_settings(db_session)
if search_settings.model_name != DEFAULT_DOCUMENT_ENCODER_MODEL:
logger.info("Embedding model has been updated, skipping")
return
document_index = get_default_document_index(
primary_index_name=search_settings.index_name, secondary_index_name=None
)
# Create a connector so the user can delete it or reindex it
# with a new search model later if they want
connector_data = ConnectorBase(
name="Sample Use Cases",
source=DocumentSource.WEB,
input_type=InputType.LOAD_STATE,
connector_specific_config={
"base_url": "https://docs.danswer.dev/more/use_cases",
"web_connector_type": "recursive",
},
refresh_freq=None, # Never refresh by default
prune_freq=None,
indexing_start=None,
)
connector = create_connector(db_session, connector_data)
connector_id = cast(int, connector.id)
last_index_time = datetime.datetime.now(datetime.timezone.utc)
result = add_credential_to_connector(
db_session=db_session,
user=None,
connector_id=connector_id,
credential_id=PUBLIC_CREDENTIAL_ID,
access_type=AccessType.PUBLIC,
cc_pair_name=connector_data.name,
groups=None,
initial_status=ConnectorCredentialPairStatus.PAUSED,
last_successful_index_time=last_index_time,
)
cc_pair_id = cast(int, result.data)
initial_docs_path = os.path.join(
os.getcwd(), "danswer", "seeding", "initial_docs.json"
)
processed_docs = json.load(open(initial_docs_path))
docs, chunks = _create_indexable_chunks(processed_docs)
index_doc_batch_prepare(
document_batch=docs,
index_attempt_metadata=IndexAttemptMetadata(
connector_id=connector_id,
credential_id=PUBLIC_CREDENTIAL_ID,
),
db_session=db_session,
ignore_time_skip=True, # Doesn't actually matter here
)
# In this case since there are no other connectors running in the background
# and this is a fresh deployment, there is no need to grab any locks
logger.info(
"Indexing seeding documents into Vespa "
"(Vespa may take a few seconds to become ready after receiving the schema)"
)
# Retries here because the index may take a few seconds to become ready
# as we just sent over the Vespa schema and there is a slight delay
index_with_retries = retry_builder()(document_index.index)
index_with_retries(chunks=chunks)
# Mock a run for the UI even though it did not actually call out to anything
mock_successful_index_attempt(
connector_credential_pair_id=cc_pair_id,
search_settings_id=search_settings.id,
docs_indexed=len(docs),
db_session=db_session,
)
kv_store.store(KV_DOCUMENTS_SEEDED_KEY, True)
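
For reference, each record in initial_docs.json follows the SeedPresaveDocument model defined by the prep script later in this commit; a single illustrative entry (embedding values truncated and made up for the sketch):

example_record = {
    "url": "https://docs.danswer.dev/more/use_cases/overview",
    "title": "Use Cases Overview",
    "content": "How to leverage Danswer in your organization ...",
    "title_embedding": [0.013, -0.044],  # truncated; real vectors have the model's full dimension
    "content_embedding": [0.021, 0.007],  # truncated
    "chunk_ind": 0,
}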

View File

@ -40,6 +40,7 @@ from danswer.natural_language_processing.search_nlp_models import warm_up_bi_enc
from danswer.natural_language_processing.search_nlp_models import warm_up_cross_encoder
from danswer.search.models import SavedSearchSettings
from danswer.search.retrieval.search_runner import download_nltk_data
from danswer.seeding.load_docs import seed_initial_documents
from danswer.server.manage.llm.models import LLMProviderUpsertRequest
from danswer.server.settings.store import load_settings
from danswer.server.settings.store import store_settings
@ -58,6 +59,12 @@ logger = setup_logger()
def setup_danswer(db_session: Session) -> None:
"""
Set up Danswer for a particular tenant. In the single-tenant case, it sets up the default schema
on server startup. In the multi-tenant case, it is called when the tenant is created.
The Tenant Service calls the tenants/create endpoint, which runs this.
"""
check_index_swap(db_session=db_session)
search_settings = get_current_search_settings(db_session)
secondary_search_settings = get_secondary_search_settings(db_session)
@ -107,7 +114,8 @@ def setup_danswer(db_session: Session) -> None:
if not MULTI_TENANT:
mark_reindex_flag(db_session)
# ensure Vespa is setup correctly
# Ensure Vespa is set up correctly. This step is relatively near the end because Vespa
# takes a bit of time to start up
logger.notice("Verifying Document Index(s) is/are available.")
document_index = get_default_document_index(
primary_index_name=search_settings.index_name,
@ -139,6 +147,8 @@ def setup_danswer(db_session: Session) -> None:
# update multipass indexing setting based on GPU availability
update_default_multipass_indexing(db_session)
seed_initial_documents(db_session)
def translate_saved_search_settings(db_session: Session) -> None:
kv_store = get_kv_store()
@ -311,11 +321,13 @@ def update_default_multipass_indexing(db_session: Session) -> None:
def setup_multitenant_danswer() -> None:
# For Managed Vespa, the schema is sent over via the Vespa Console manually.
if not MANAGED_VESPA:
setup_vespa_multitenant(SUPPORTED_EMBEDDING_MODELS)
def setup_vespa_multitenant(supported_indices: list[SupportedEmbeddingModel]) -> bool:
# This is for local testing
WAIT_SECONDS = 5
VESPA_ATTEMPTS = 5
for x in range(VESPA_ATTEMPTS):

View File

@ -22,18 +22,18 @@ def retry_builder(
jitter: tuple[float, float] | float = 1,
) -> Callable[[F], F]:
"""Builds a generic wrapper/decorator for calls to external APIs that
may fail due to rate limiting, flakes, or other reasons. Applies expontential
may fail due to rate limiting, flakes, or other reasons. Applies exponential
backoff with jitter to retry the call."""
@retry(
tries=tries,
delay=delay,
max_delay=max_delay,
backoff=backoff,
jitter=jitter,
logger=cast(Logger, logger),
)
def retry_with_default(func: F) -> F:
@retry(
tries=tries,
delay=delay,
max_delay=max_delay,
backoff=backoff,
jitter=jitter,
logger=cast(Logger, logger),
)
def wrapped_func(*args: list, **kwargs: dict[str, Any]) -> Any:
return func(*args, **kwargs)
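
For reference, both usage patterns that appear elsewhere in this commit look roughly like this. Argument values are illustrative, fetch_external_page is a made-up function, and document_index/chunks stand in for the objects used by the seeding flow:

from danswer.utils.retry_wrapper import retry_builder

# Decorator form, as the connectors use it
@retry_builder(tries=5, delay=0.5)
def fetch_external_page(url: str) -> bytes:
    ...

# Direct-wrap form, as the document seeding flow uses it
index_with_retries = retry_builder()(document_index.index)
index_with_retries(chunks=chunks)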

View File

@ -9,7 +9,6 @@ from googleapiclient.errors import HttpError # type: ignore
from sqlalchemy.orm import Session
from danswer.access.models import ExternalAccess
from danswer.connectors.cross_connector_utils.retry_wrapper import retry_builder
from danswer.connectors.factory import instantiate_connector
from danswer.connectors.google_drive.connector_auth import (
get_google_drive_creds,
@ -19,6 +18,7 @@ from danswer.connectors.models import InputType
from danswer.db.models import ConnectorCredentialPair
from danswer.db.users import batch_add_non_web_user_if_not_exists__no_commit
from danswer.utils.logger import setup_logger
from danswer.utils.retry_wrapper import retry_builder
from ee.danswer.db.document import upsert_document_external_perms__no_commit
# Google Drive APIs are quite flakey and may 500 for an

View File

@ -7,7 +7,6 @@ from googleapiclient.discovery import build # type: ignore
from googleapiclient.errors import HttpError # type: ignore
from sqlalchemy.orm import Session
from danswer.connectors.cross_connector_utils.retry_wrapper import retry_builder
from danswer.connectors.google_drive.connector_auth import (
get_google_drive_creds,
)
@ -15,6 +14,7 @@ from danswer.connectors.google_drive.constants import FETCH_GROUPS_SCOPES
from danswer.db.models import ConnectorCredentialPair
from danswer.db.users import batch_add_non_web_user_if_not_exists__no_commit
from danswer.utils.logger import setup_logger
from danswer.utils.retry_wrapper import retry_builder
from ee.danswer.db.external_perm import ExternalUserGroup
from ee.danswer.db.external_perm import replace_user__ext_group_for_cc_pair__no_commit

View File

@ -0,0 +1,240 @@
# This script preps the documents used for initially seeding the index. It handles the embedding so that the
# documents can be added to the index with minimal processing.
import json
from pydantic import BaseModel
from sentence_transformers import SentenceTransformer # type: ignore
class SeedPresaveDocument(BaseModel):
url: str
title: str
content: str
title_embedding: list[float]
content_embedding: list[float]
chunk_ind: int = 0
# Be sure to use the default embedding model
model = SentenceTransformer("nomic-ai/nomic-embed-text-v1", trust_remote_code=True)
tokenizer = model.tokenizer
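# Note (assumption): nomic-embed-text expects a task prefix on its inputs. The documents below
# are embedded with a "search_document: " prefix; query-time embeddings would use a matching
# "search_query: " prefix so that document and query vectors land in the same space.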
# This is easier than cleaning up the crawl; it needs to be updated if the sites change
overview_title = "Use Cases Overview"
overview = (
"How to leverage Danswer in your organization\n\n"
"Danswer Overview\n"
"Danswer is the AI Assistant connected to your organization's docs, apps, and people. "
"Danswer makes Generative AI more versatile for work by enabling new types of questions like "
'"What is the most common feature request we\'ve heard from customers this month". '
"Whereas other AI systems have no context of your team and are generally unhelpful with work related questions, "
"Danswer makes it possible to ask these questions in natural language and get back answers in seconds.\n\n"
"Danswer can connect to +30 different tools and the use cases are not limited to the ones in the following pages. "
"The highlighted use cases are for inspiration and come from feedback gathered from our users and customers.\n\n\n"
"Common Getting Started Questions:\n\n"
"Why are these docs connected in my Danswer deployment?\n"
"Answer: This is just an example of how connectors work in Danswer. You can connect up your own team's knowledge "
"and you will be able to ask questions unique to your organization. Danswer will keep all of the knowledge up to date "
"and in sync with your connected applications.\n\n"
"Is my data being sent anywhere when I connect it up to Danswer?\n"
"Answer: No! Danswer is built with data security as our highest priority. We open sourced it so our users can know "
"exactly what is going on with their data. By default all of the document processing happens within Danswer. "
"The only time it is sent outward is for the GenAI call to generate answers.\n\n"
"Where is the feature for auto sync-ing document level access permissions from all connected sources?\n"
"Answer: This falls under the Enterprise Edition set of Danswer features built on top of the MIT/community edition. "
"If you are on Danswer Cloud, you have access to them by default. If you're running it yourself, reach out to the "
"Danswer team to receive access."
)
enterprise_search_title = "Enterprise Search"
enterprise_search_1 = (
"Value of Enterprise Search with Danswer\n\n"
"What is Enterprise Search and why is it Important?\n"
"An Enterprise Search system gives team members a single place to access all of the disparate knowledge "
"of an organization. Critical information is saved across a host of channels like call transcripts with "
"prospects, engineering design docs, IT runbooks, customer support email exchanges, project management "
"tickets, and more. As fast moving teams scale up, information gets spread out and more disorganized.\n\n"
"Since it quickly becomes infeasible to check across every source, decisions get made on incomplete "
"information, employee satisfaction decreases, and the most valuable members of your team are tied up "
"with constant distractions as junior teammates are unable to unblock themselves. Danswer solves this "
"problem by letting anyone on the team access all of the knowledge across your organization in a "
"permissioned and secure way. Users can ask questions in natural language and get back answers and "
"documents across all of the connected sources instantly.\n\n"
"What's the real cost?\n"
"A typical knowledge worker spends over 2 hours a week on search, but more than that, the cost of "
"incomplete or incorrect information can be extremely high. Customer support/success that isn't able "
"to find the reference to similar cases could cause hours or even days of delay leading to lower "
"customer satisfaction or in the worst case - churn. An account exec not realizing that a prospect had "
"previously mentioned a specific need could lead to lost deals. An engineer not realizing a similar "
"feature had previously been built could result in weeks of wasted development time and tech debt with "
"duplicate implementation. With a lack of knowledge, your whole organization is navigating in the dark "
"- inefficient and mistake prone."
)
enterprise_search_2 = (
"More than Search\n"
"When analyzing the entire corpus of knowledge within your company is as easy as asking a question "
"in a search bar, your entire team can stay informed and up to date. Danswer also makes it trivial "
"to identify where knowledge is well documented and where it is lacking. Team members who are centers "
"of knowledge can begin to effectively document their expertise since it is no longer being thrown into "
"a black hole. All of this allows the organization to achieve higher efficiency and drive business outcomes.\n\n"
"With Generative AI, the entire user experience has evolved as well. For example, instead of just finding similar "
"cases for your customer support team to reference, Danswer breaks down the issue and explains it so that even "
"the most junior members can understand it. This in turn lets them give the most holistic and technically accurate "
"response possible to your customers. On the other end, even the super stars of your sales team will not be able "
"to review 10 hours of transcripts before hopping on that critical call, but Danswer can easily parse through it "
"in mere seconds and give crucial context to help your team close."
)
ai_platform_title = "AI Platform"
ai_platform = (
"Build AI Agents powered by the knowledge and workflows specific to your organization.\n\n"
"Beyond Answers\n"
"Agents enabled by generative AI and reasoning capable models are helping teams to automate their work. "
"Danswer is helping teams make it happen. Danswer provides out of the box user chat sessions, attaching custom tools, "
"handling LLM reasoning, code execution, data analysis, referencing internal knowledge, and much more.\n\n"
"Danswer as a platform is not a no-code agent builder. We are made by developers for developers and this gives your "
"team the full flexibility and power to create agents not constrained by blocks and simple logic paths.\n\n"
"Flexibility and Extensibility\n"
"Danswer is open source and completely whitebox. This not only gives transparency to what happens within the system "
"but also means that your team can directly modify the source code to suit your unique needs."
)
customer_support_title = "Customer Support"
customer_support = (
"Help your customer support team instantly answer any question across your entire product.\n\n"
"AI Enabled Support\n"
"Customer support agents have one of the highest breadth jobs. They field requests that cover the entire surface "
"area of the product and need to help your users find success on extremely short timelines. "
"Because they're not the same people who designed or built the system, they often lack the depth of understanding "
"needed - resulting in delays and escalations to other teams. Modern teams are leveraging AI to help their CS team "
"optimize the speed and quality of these critical customer-facing interactions.\n\n"
"The Importance of Context\n"
"There are two critical components of AI copilots for customer support. The first is that the AI system needs to be "
"connected with as much information as possible (not just support tools like Zendesk or Intercom) and that the "
"knowledge needs to be as fresh as possible. Sometimes a fix might even be in places rarely checked by CS such as "
"pull requests in a code repository. The second critical component is the ability of the AI system to break down "
"difficult concepts and convoluted processes into more digestible descriptions and for your team members to be able "
"to chat back and forth with the system to build a better understanding.\n\n"
"Danswer takes care of both of these. The system connects up to over 30+ different applications and the knowledge is "
"pulled in constantly so that the information access is always up to date."
)
sales_title = "Sales"
sales = (
"Keep your team up to date on every conversation and update so they can close.\n\n"
"Recall Every Detail\n"
"Being able to instantly revisit every detail of any call without reading transcripts is helping Sales teams provide "
"more tailored pitches, build stronger relationships, and close more deals. Instead of searching and reading through "
'hours of transcripts in preparation for a call, your team can now ask Danswer "What specific features was ACME '
"interested in seeing for the demo\". Since your team doesn't have time to read every transcript prior to a call, "
"Danswer provides a more thorough summary because it can instantly parse hundreds of pages and distill out the relevant "
"information. Even for fast lookups it becomes much more convenient - for example to brush up on connection building "
'topics by asking "What rapport building topic did we chat about in the last call with ACME".\n\n'
"Know Every Product Update\n"
"It is impossible for Sales teams to keep up with every product update. Because of this, when a prospect has a question "
"that the Sales team does not know, they have no choice but to rely on the Product and Engineering orgs to get an "
"authoritative answer. Not only is this distracting to the other teams, it also slows down the time to respond to the "
"prospect (and as we know, time is the biggest killer of deals). With Danswer, it is even possible to get answers live "
'on call because of how fast accessing information becomes. A question like "Have we shipped the Microsoft AD '
'integration yet?" can now be answered in seconds meaning that prospects can get answers while on the call instead of '
"asynchronously and sales cycles are reduced as a result."
)
operations_title = "Operations"
operations = (
"Double the productivity of your Ops teams like IT, HR, etc.\n\n"
"Automatically Resolve Tickets\n"
"Modern teams are leveraging AI to auto-resolve up to 50% of tickets. Whether it is an employee asking about benefits "
"details or how to set up the VPN for remote work, Danswer can help your team help themselves. This frees up your team to "
"do the real impactful work of landing star candidates or improving your internal processes.\n\n"
"AI Aided Onboarding\n"
"One of the periods where your team needs the most help is when they're just ramping up. Instead of feeling lost in dozens "
"of new tools, Danswer gives them a single place where they can ask about anything in natural language. Whether it's how to "
"set up their work environment or what their onboarding goals are, Danswer can walk them through every step with the help "
"of Generative AI. This lets your team feel more empowered and gives time back to the more seasoned members of your team to "
"focus on moving the needle."
)
# For simplicity, we're not adding any metadata suffix here. Generally there is none for the Web connector anyway
overview_doc = SeedPresaveDocument(
url="https://docs.danswer.dev/more/use_cases/overview",
title=overview_title,
content=overview,
title_embedding=model.encode(f"search_document: {overview_title}"),
content_embedding=model.encode(f"search_document: {overview_title}\n{overview}"),
)
enterprise_search_doc = SeedPresaveDocument(
url="https://docs.danswer.dev/more/use_cases/enterprise_search",
title=enterprise_search_title,
content=enterprise_search_1,
title_embedding=model.encode(f"search_document: {enterprise_search_title}"),
content_embedding=model.encode(
f"search_document: {enterprise_search_title}\n{enterprise_search_1}"
),
)
enterprise_search_doc_2 = SeedPresaveDocument(
url="https://docs.danswer.dev/more/use_cases/enterprise_search",
title=enterprise_search_title,
content=enterprise_search_2,
title_embedding=model.encode(f"search_document: {enterprise_search_title}"),
content_embedding=model.encode(
f"search_document: {enterprise_search_title}\n{enterprise_search_2}"
),
chunk_ind=1,
)
ai_platform_doc = SeedPresaveDocument(
url="https://docs.danswer.dev/more/use_cases/ai_platform",
title=ai_platform_title,
content=ai_platform,
title_embedding=model.encode(f"search_document: {ai_platform_title}"),
content_embedding=model.encode(
f"search_document: {ai_platform_title}\n{ai_platform}"
),
)
customer_support_doc = SeedPresaveDocument(
url="https://docs.danswer.dev/more/use_cases/customer_support",
title=customer_support_title,
content=customer_support,
title_embedding=model.encode(f"search_document: {customer_support_title}"),
content_embedding=model.encode(
f"search_document: {customer_support_title}\n{customer_support}"
),
)
sales_doc = SeedPresaveDocument(
url="https://docs.danswer.dev/more/use_cases/sales",
title=sales_title,
content=sales,
title_embedding=model.encode(f"search_document: {sales_title}"),
content_embedding=model.encode(f"search_document: {sales_title}\n{sales}"),
)
operations_doc = SeedPresaveDocument(
url="https://docs.danswer.dev/more/use_cases/operations",
title=operations_title,
content=operations,
title_embedding=model.encode(f"search_document: {operations_title}"),
content_embedding=model.encode(
f"search_document: {operations_title}\n{operations}"
),
)
documents = [
overview_doc,
enterprise_search_doc,
enterprise_search_doc_2,
ai_platform_doc,
customer_support_doc,
sales_doc,
operations_doc,
]
documents_dict = [doc.model_dump() for doc in documents]
with open("./backend/danswer/seeding/initial_docs.json", "w") as json_file:
json.dump(documents_dict, json_file, indent=4)

View File

@ -73,9 +73,7 @@ def _clear_slack_conversation_members(
if member_id == admin_user_id:
continue
try:
make_slack_api_call_w_retries(
slack_client.conversations_kick, channel=channel_id, user=member_id
)
slack_client.conversations_kick(channel=channel_id, user=member_id)
print(f"Kicked member: {member_id}")
except Exception as e:
if "cant_kick_self" in str(e):
@ -83,9 +81,7 @@ def _clear_slack_conversation_members(
print(f"Error kicking member: {e}")
print(member_id)
try:
make_slack_api_call_w_retries(
slack_client.conversations_unarchive, channel=channel_id
)
slack_client.conversations_unarchive(channel=channel_id)
channel["is_archived"] = False
except Exception:
# Channel is already unarchived
@ -98,11 +94,7 @@ def _add_slack_conversation_members(
channel_id = _get_slack_channel_id(channel)
for user_id in member_ids:
try:
make_slack_api_call_w_retries(
slack_client.conversations_invite,
channel=channel_id,
users=user_id,
)
slack_client.conversations_invite(channel=channel_id, users=user_id)
except Exception as e:
if "already_in_channel" in str(e):
continue
@ -129,11 +121,7 @@ def _delete_slack_conversation_messages(
try:
if not (ts := message.get("ts")):
raise ValueError("Message timestamp is missing")
make_slack_api_call_w_retries(
slack_client.chat_delete,
channel=channel_id,
ts=ts,
)
slack_client.chat_delete(channel=channel_id, ts=ts)
except Exception as e:
print(f"Error deleting message: {e}")
print(message)
@ -165,16 +153,12 @@ def _build_slack_channel_from_name(
)
try:
channel_response = make_slack_api_call_w_retries(
slack_client.conversations_unarchive,
channel=channel_response["channel"]["id"],
)
slack_client.conversations_unarchive(channel=channel_response["channel"]["id"])
except Exception:
# Channel is already unarchived
pass
try:
channel_response = make_slack_api_call_w_retries(
slack_client.conversations_invite,
slack_client.conversations_invite(
channel=channel_response["channel"]["id"],
users=[admin_user_id],
)
@ -302,10 +286,6 @@ class SlackManager:
# "done" in the channel name indicates that this channel is free to be used for a new test
new_name = f"done_{str(uuid4())}"
try:
make_slack_api_call_w_retries(
slack_client.conversations_rename,
channel=channel["id"],
name=new_name,
)
slack_client.conversations_rename(channel=channel["id"], name=new_name)
except SlackApiError as e:
print(f"Error renaming channel {channel['id']}: {e}")