Seeding (#2902)

* checkpoint
* k
* k
* k
* fixed slack api calls
* missed one

Co-authored-by: hagen-danswer <hagen@danswer.ai>

parent 9f50417109
commit b49a9ab171
@@ -70,3 +70,12 @@ class DocumentAccess(ExternalAccess):
            user_groups=set(user_groups),
            is_public=is_public,
        )


default_public_access = DocumentAccess(
    external_user_emails=set(),
    external_user_group_ids=set(),
    user_emails=set(),
    user_groups=set(),
    is_public=True,
)
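For orientation, the seeding code added later in this commit hands this new module-level `default_public_access` object to every pre-embedded chunk rather than computing per-document permissions. A minimal sketch of what the object encodes, using only names from the hunk above (the assertions are an assumed reading of the semantics, not part of the diff):

```python
# Sketch only: the shared default marks seeded documents as public within the
# deployment, with no per-user or per-group restrictions attached.
from danswer.access.models import DocumentAccess, default_public_access

assert default_public_access.is_public
assert default_public_access.user_emails == set()
assert default_public_access.user_groups == set()

# Building an equivalent object by hand, mirroring the fields above:
equivalent = DocumentAccess(
    external_user_emails=set(),
    external_user_group_ids=set(),
    user_emails=set(),
    user_groups=set(),
    is_public=True,
)
```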
@@ -70,6 +70,7 @@ KV_CUSTOMER_UUID_KEY = "customer_uuid"
KV_INSTANCE_DOMAIN_KEY = "instance_domain"
KV_ENTERPRISE_SETTINGS_KEY = "danswer_enterprise_settings"
KV_CUSTOM_ANALYTICS_SCRIPT_KEY = "__custom_analytics_script__"
KV_DOCUMENTS_SEEDED_KEY = "documents_seeded"

CELERY_VESPA_SYNC_BEAT_LOCK_TIMEOUT = 60
CELERY_PRIMARY_WORKER_LOCK_TIMEOUT = 120
@@ -15,7 +15,6 @@ from danswer.connectors.cross_connector_utils.miscellaneous_utils import time_st
from danswer.connectors.cross_connector_utils.rate_limit_wrapper import (
    rate_limit_builder,
)
from danswer.connectors.cross_connector_utils.retry_wrapper import retry_builder
from danswer.connectors.interfaces import GenerateDocumentsOutput
from danswer.connectors.interfaces import PollConnector
from danswer.connectors.interfaces import SecondsSinceUnixEpoch
@@ -24,6 +23,7 @@ from danswer.connectors.models import Document
from danswer.connectors.models import Section
from danswer.file_processing.html_utils import parse_html_page_basic
from danswer.utils.logger import setup_logger
from danswer.utils.retry_wrapper import retry_builder


logger = setup_logger()
@@ -10,7 +10,6 @@ from danswer.configs.constants import DocumentSource
from danswer.connectors.cross_connector_utils.rate_limit_wrapper import (
    rate_limit_builder,
)
from danswer.connectors.cross_connector_utils.retry_wrapper import retry_builder
from danswer.connectors.interfaces import GenerateDocumentsOutput
from danswer.connectors.interfaces import LoadConnector
from danswer.connectors.interfaces import PollConnector
@@ -19,6 +18,7 @@ from danswer.connectors.models import BasicExpertInfo
from danswer.connectors.models import ConnectorMissingCredentialError
from danswer.connectors.models import Document
from danswer.connectors.models import Section
from danswer.utils.retry_wrapper import retry_builder


CLICKUP_API_BASE_URL = "https://api.clickup.com/api/v2"
@@ -14,7 +14,6 @@ from danswer.connectors.cross_connector_utils.miscellaneous_utils import time_st
from danswer.connectors.cross_connector_utils.rate_limit_wrapper import (
    rate_limit_builder,
)
from danswer.connectors.cross_connector_utils.retry_wrapper import retry_builder
from danswer.connectors.interfaces import GenerateDocumentsOutput
from danswer.connectors.interfaces import PollConnector
from danswer.connectors.interfaces import SecondsSinceUnixEpoch
@@ -24,6 +23,7 @@ from danswer.connectors.models import Document
from danswer.connectors.models import Section
from danswer.file_processing.html_utils import parse_html_page_basic
from danswer.utils.logger import setup_logger
from danswer.utils.retry_wrapper import retry_builder

logger = setup_logger()

@@ -11,7 +11,6 @@ from danswer.configs.constants import DocumentSource
from danswer.connectors.cross_connector_utils.rate_limit_wrapper import (
    rate_limit_builder,
)
from danswer.connectors.cross_connector_utils.retry_wrapper import retry_builder
from danswer.connectors.document360.utils import flatten_child_categories
from danswer.connectors.interfaces import GenerateDocumentsOutput
from danswer.connectors.interfaces import LoadConnector
@@ -22,6 +21,7 @@ from danswer.connectors.models import ConnectorMissingCredentialError
from danswer.connectors.models import Document
from danswer.connectors.models import Section
from danswer.file_processing.html_utils import parse_html_page_basic
from danswer.utils.retry_wrapper import retry_builder

# Limitations and Potential Improvements
# 1. The "Categories themselves contain potentially relevant information" but they're not pulled in
@@ -19,7 +19,6 @@ from danswer.configs.app_configs import GOOGLE_DRIVE_ONLY_ORG_PUBLIC
from danswer.configs.app_configs import INDEX_BATCH_SIZE
from danswer.configs.constants import DocumentSource
from danswer.configs.constants import IGNORE_FOR_QA
from danswer.connectors.cross_connector_utils.retry_wrapper import retry_builder
from danswer.connectors.google_drive.connector_auth import get_google_drive_creds
from danswer.connectors.google_drive.constants import (
    DB_CREDENTIALS_DICT_DELEGATED_USER_KEY,
@@ -40,6 +39,7 @@ from danswer.file_processing.unstructured import get_unstructured_api_key
from danswer.file_processing.unstructured import unstructured_to_text
from danswer.utils.batching import batch_generator
from danswer.utils.logger import setup_logger
from danswer.utils.retry_wrapper import retry_builder

logger = setup_logger()

@@ -10,9 +10,9 @@ from slack_sdk import WebClient
from slack_sdk.errors import SlackApiError
from slack_sdk.web import SlackResponse

from danswer.connectors.cross_connector_utils.retry_wrapper import retry_builder
from danswer.connectors.models import BasicExpertInfo
from danswer.utils.logger import setup_logger
from danswer.utils.retry_wrapper import retry_builder

logger = setup_logger()

@@ -341,6 +341,8 @@ def add_credential_to_connector(
    access_type: AccessType,
    groups: list[int] | None,
    auto_sync_options: dict | None = None,
    initial_status: ConnectorCredentialPairStatus = ConnectorCredentialPairStatus.ACTIVE,
    last_successful_index_time: datetime | None = None,
) -> StatusResponse:
    connector = fetch_connector_by_id(connector_id, db_session)
    credential = fetch_credential_by_id(credential_id, user, db_session)
@@ -384,9 +386,10 @@ def add_credential_to_connector(
        connector_id=connector_id,
        credential_id=credential_id,
        name=cc_pair_name,
        status=ConnectorCredentialPairStatus.ACTIVE,
        status=initial_status,
        access_type=access_type,
        auto_sync_options=auto_sync_options,
        last_successful_index_time=last_successful_index_time,
    )
    db_session.add(association)
    db_session.flush()  # make sure the association has an id
@@ -40,6 +40,8 @@ CREDENTIAL_PERMISSIONS_TO_IGNORE = {
    DocumentSource.MEDIAWIKI,
}

PUBLIC_CREDENTIAL_ID = 0


def _add_user_filters(
    stmt: Select,
@@ -384,12 +386,11 @@ def delete_credential(


def create_initial_public_credential(db_session: Session) -> None:
    public_cred_id = 0
    error_msg = (
        "DB is not in a valid initial state."
        "There must exist an empty public credential for data connectors that do not require additional Auth."
    )
    first_credential = fetch_credential_by_id(public_cred_id, None, db_session)
    first_credential = fetch_credential_by_id(PUBLIC_CREDENTIAL_ID, None, db_session)

    if first_credential is not None:
        if first_credential.credential_json != {} or first_credential.user is not None:
@@ -397,7 +398,7 @@ def create_initial_public_credential(db_session: Session) -> None:
        return

    credential = Credential(
        id=public_cred_id,
        id=PUBLIC_CREDENTIAL_ID,
        credential_json={},
        user_id=None,
    )
@@ -1,5 +1,6 @@
from collections.abc import Sequence
from datetime import datetime
from datetime import timedelta
from datetime import timezone

from sqlalchemy import and_
@@ -66,6 +67,32 @@ def create_index_attempt(
    return new_attempt.id


def mock_successful_index_attempt(
    connector_credential_pair_id: int,
    search_settings_id: int,
    docs_indexed: int,
    db_session: Session,
) -> int:
    """Should not be used in any user triggered flows"""
    db_time = func.now()
    new_attempt = IndexAttempt(
        connector_credential_pair_id=connector_credential_pair_id,
        search_settings_id=search_settings_id,
        from_beginning=True,
        status=IndexingStatus.SUCCESS,
        total_docs_indexed=docs_indexed,
        new_docs_indexed=docs_indexed,
        # Need this to be some convincing random looking value and it can't be 0
        # or the indexing rate would calculate out to infinity
        time_started=db_time - timedelta(seconds=1.92),
        time_updated=db_time,
    )
    db_session.add(new_attempt)
    db_session.commit()

    return new_attempt.id


def get_in_progress_index_attempts(
    connector_id: int | None,
    db_session: Session,
@@ -103,6 +103,9 @@ class DefaultIndexingEmbedder(IndexingEmbedder):
        self,
        chunks: list[DocAwareChunk],
    ) -> list[IndexChunk]:
        """Adds embeddings to the chunks, the title and metadata suffixes are added to the chunk as well
        if they exist. If there is no space for it, it would have been thrown out at the chunking step.
        """
        # All chunks at this point must have some non-empty content
        flat_chunk_texts: list[str] = []
        large_chunks_present = False
@@ -121,6 +124,11 @@ class DefaultIndexingEmbedder(IndexingEmbedder):
            flat_chunk_texts.append(chunk_text)

            if chunk.mini_chunk_texts:
                if chunk.large_chunk_reference_ids:
                    # A large chunk does not contain mini chunks, if it matches the large chunk
                    # with a high score, then mini chunks would not be used anyway
                    # otherwise it should match the normal chunk
                    raise RuntimeError("Large chunk contains mini chunks")
                flat_chunk_texts.extend(chunk.mini_chunk_texts)

        embeddings = self.embedding_model.encode(
@@ -195,6 +195,8 @@ def index_doc_batch_prepare(
    db_session: Session,
    ignore_time_skip: bool = False,
) -> DocumentBatchPrepareContext | None:
    """This sets up the documents in the relational DB (source of truth) for permissions, metadata, etc.
    This preceeds indexing it into the actual document index."""
    documents = []
    for document in document_batch:
        empty_contents = not any(section.text.strip() for section in document.sections)
@@ -3,5 +3,6 @@ from danswer.key_value_store.store import PgRedisKVStore


def get_kv_store() -> KeyValueStore:
    # this is the only one supported currently
    # In the Multi Tenant case, the tenant context is picked up automatically, it does not need to be passed in
    # It's read from the global thread level variable
    return PgRedisKVStore()
@@ -14,6 +14,8 @@ class KvKeyNotFoundError(Exception):


class KeyValueStore:
    # In the Multi Tenant case, the tenant context is picked up automatically, it does not need to be passed in
    # It's read from the global thread level variable
    @abc.abstractmethod
    def store(self, key: str, val: JSON_ro, encrypt: bool = False) -> None:
        raise NotImplementedError
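Taken together with the new KV_DOCUMENTS_SEEDED_KEY constant, this key-value interface is what makes the seeding run exactly once: load_docs.py (added below) treats a missing key as "not yet seeded" and writes the flag after indexing succeeds. A condensed sketch of that guard, using only calls that appear in this commit (the helper names are hypothetical):

```python
from danswer.configs.constants import KV_DOCUMENTS_SEEDED_KEY
from danswer.key_value_store.factory import get_kv_store
from danswer.key_value_store.interface import KvKeyNotFoundError


def documents_already_seeded() -> bool:  # hypothetical helper, not in the diff
    try:
        get_kv_store().load(KV_DOCUMENTS_SEEDED_KEY)  # raises if the flag was never set
        return True
    except KvKeyNotFoundError:
        return False


def mark_documents_seeded() -> None:  # hypothetical helper, mirrors load_docs.py
    get_kv_store().store(KV_DOCUMENTS_SEEDED_KEY, True)
```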
0 backend/danswer/seeding/__init__.py (new file)
10824 backend/danswer/seeding/initial_docs.json (new file; file diff suppressed because it is too large)
212 backend/danswer/seeding/load_docs.py (new file)
@@ -0,0 +1,212 @@
import datetime
import json
import os
from typing import cast

from sqlalchemy.orm import Session

from danswer.access.models import default_public_access
from danswer.configs.constants import DEFAULT_BOOST
from danswer.configs.constants import DocumentSource
from danswer.configs.constants import KV_DOCUMENTS_SEEDED_KEY
from danswer.configs.model_configs import DEFAULT_DOCUMENT_ENCODER_MODEL
from danswer.connectors.models import Document
from danswer.connectors.models import IndexAttemptMetadata
from danswer.connectors.models import InputType
from danswer.connectors.models import Section
from danswer.db.connector import check_connectors_exist
from danswer.db.connector import create_connector
from danswer.db.connector_credential_pair import add_credential_to_connector
from danswer.db.credentials import PUBLIC_CREDENTIAL_ID
from danswer.db.document import check_docs_exist
from danswer.db.enums import AccessType
from danswer.db.enums import ConnectorCredentialPairStatus
from danswer.db.index_attempt import mock_successful_index_attempt
from danswer.db.search_settings import get_current_search_settings
from danswer.document_index.factory import get_default_document_index
from danswer.indexing.indexing_pipeline import index_doc_batch_prepare
from danswer.indexing.models import ChunkEmbedding
from danswer.indexing.models import DocMetadataAwareIndexChunk
from danswer.key_value_store.factory import get_kv_store
from danswer.key_value_store.interface import KvKeyNotFoundError
from danswer.server.documents.models import ConnectorBase
from danswer.utils.logger import setup_logger
from danswer.utils.retry_wrapper import retry_builder


logger = setup_logger()


def _create_indexable_chunks(
    preprocessed_docs: list[dict],
) -> tuple[list[Document], list[DocMetadataAwareIndexChunk]]:
    ids_to_documents = {}
    chunks = []
    for preprocessed_doc in preprocessed_docs:
        document = Document(
            id=preprocessed_doc["url"],  # For Web connector, the URL is the ID
            # The section is not really used past this point since we have already done the other processing
            # for the chunking and embedding.
            sections=[
                Section(text=preprocessed_doc["content"], link=preprocessed_doc["url"])
            ],
            source=DocumentSource.WEB,
            semantic_identifier=preprocessed_doc["title"],
            metadata={},
            doc_updated_at=None,
            primary_owners=[],
            secondary_owners=[],
        )
        if preprocessed_doc["chunk_ind"] == 0:
            ids_to_documents[document.id] = document

        chunk = DocMetadataAwareIndexChunk(
            chunk_id=preprocessed_doc["chunk_ind"],
            blurb=preprocessed_doc["content"]
            .split(".", 1)[0]
            .split("!", 1)[0]
            .split("?", 1)[0],
            content=preprocessed_doc["content"],
            source_links={0: preprocessed_doc["url"]},
            section_continuation=False,
            source_document=document,
            title_prefix=preprocessed_doc["title"],
            metadata_suffix_semantic="",
            metadata_suffix_keyword="",
            mini_chunk_texts=None,
            large_chunk_reference_ids=[],
            embeddings=ChunkEmbedding(
                full_embedding=preprocessed_doc["content_embedding"],
                mini_chunk_embeddings=[],
            ),
            title_embedding=preprocessed_doc["title_embedding"],
            tenant_id=None,
            access=default_public_access,
            document_sets=set(),
            boost=DEFAULT_BOOST,
        )
        chunks.append(chunk)

    return list(ids_to_documents.values()), chunks


def seed_initial_documents(db_session: Session) -> None:
    """
    Seed initial documents so users don't have an empty index to start

    Documents are only loaded if:
    - This is the first setup (if the user deletes the docs, we don't load them again)
    - The index is empty, there are no docs and no (non-default) connectors
    - The user has not updated the embedding models
        - If they do, then we have to actually index the website
        - If the embedding model is already updated on server startup, they're not a new user

    Note that regardless of any search settings, the default documents are always loaded with
    the predetermined chunk sizes and single pass embedding.

    Steps are as follows:
    - Check if this needs to run
    - Create the connector representing this
    - Create the cc-pair (attaching the public credential) and mocking values like the last success
    - Indexing the documents into Postgres
    - Indexing the documents into Vespa
    - Create a fake index attempt with fake times
    """
    logger.info("Seeding initial documents")

    kv_store = get_kv_store()
    try:
        kv_store.load(KV_DOCUMENTS_SEEDED_KEY)
        logger.info("Documents already seeded, skipping")
        return
    except KvKeyNotFoundError:
        pass

    if check_docs_exist(db_session):
        logger.info("Documents already exist, skipping")
        return

    if check_connectors_exist(db_session):
        logger.info("Connectors already exist, skipping")
        return

    search_settings = get_current_search_settings(db_session)
    if search_settings.model_name != DEFAULT_DOCUMENT_ENCODER_MODEL:
        logger.info("Embedding model has been updated, skipping")
        return

    document_index = get_default_document_index(
        primary_index_name=search_settings.index_name, secondary_index_name=None
    )

    # Create a connector so the user can delete it if they want
    # or reindex it with a new search model if they want
    connector_data = ConnectorBase(
        name="Sample Use Cases",
        source=DocumentSource.WEB,
        input_type=InputType.LOAD_STATE,
        connector_specific_config={
            "base_url": "https://docs.danswer.dev/more/use_cases",
            "web_connector_type": "recursive",
        },
        refresh_freq=None,  # Never refresh by default
        prune_freq=None,
        indexing_start=None,
    )

    connector = create_connector(db_session, connector_data)
    connector_id = cast(int, connector.id)

    last_index_time = datetime.datetime.now(datetime.timezone.utc)

    result = add_credential_to_connector(
        db_session=db_session,
        user=None,
        connector_id=connector_id,
        credential_id=PUBLIC_CREDENTIAL_ID,
        access_type=AccessType.PUBLIC,
        cc_pair_name=connector_data.name,
        groups=None,
        initial_status=ConnectorCredentialPairStatus.PAUSED,
        last_successful_index_time=last_index_time,
    )
    cc_pair_id = cast(int, result.data)

    initial_docs_path = os.path.join(
        os.getcwd(), "danswer", "seeding", "initial_docs.json"
    )
    processed_docs = json.load(open(initial_docs_path))

    docs, chunks = _create_indexable_chunks(processed_docs)

    index_doc_batch_prepare(
        document_batch=docs,
        index_attempt_metadata=IndexAttemptMetadata(
            connector_id=connector_id,
            credential_id=PUBLIC_CREDENTIAL_ID,
        ),
        db_session=db_session,
        ignore_time_skip=True,  # Doesn't actually matter here
    )

    # In this case since there are no other connectors running in the background
    # and this is a fresh deployment, there is no need to grab any locks
    logger.info(
        "Indexing seeding documents into Vespa "
        "(Vespa may take a few seconds to become ready after receiving the schema)"
    )

    # Retries here because the index may take a few seconds to become ready
    # as we just sent over the Vespa schema and there is a slight delay
    index_with_retries = retry_builder()(document_index.index)
    index_with_retries(chunks=chunks)

    # Mock a run for the UI even though it did not actually call out to anything
    mock_successful_index_attempt(
        connector_credential_pair_id=cc_pair_id,
        search_settings_id=search_settings.id,
        docs_indexed=len(docs),
        db_session=db_session,
    )

    kv_store.store(KV_DOCUMENTS_SEEDED_KEY, True)
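This function is wired into server startup in the setup.py hunk below; if you want to exercise it by hand (for example from a dev shell against a configured database), a rough sketch follows. The session helper here is an assumption about existing danswer.db.engine utilities, not something introduced by this commit:

```python
from danswer.db.engine import get_session_context_manager  # assumed existing helper
from danswer.seeding.load_docs import seed_initial_documents

# The call is idempotent: it returns early if documents, connectors, or the
# documents_seeded KV flag already exist, or if the embedding model was changed.
with get_session_context_manager() as db_session:
    seed_initial_documents(db_session)
```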
@@ -40,6 +40,7 @@ from danswer.natural_language_processing.search_nlp_models import warm_up_bi_enc
from danswer.natural_language_processing.search_nlp_models import warm_up_cross_encoder
from danswer.search.models import SavedSearchSettings
from danswer.search.retrieval.search_runner import download_nltk_data
from danswer.seeding.load_docs import seed_initial_documents
from danswer.server.manage.llm.models import LLMProviderUpsertRequest
from danswer.server.settings.store import load_settings
from danswer.server.settings.store import store_settings
@@ -58,6 +59,12 @@ logger = setup_logger()


def setup_danswer(db_session: Session) -> None:
    """
    Setup Danswer for a particular tenant. In the Single Tenant case, it will set it up for the default schema
    on server startup. In the MT case, it will be called when the tenant is created.

    The Tenant Service calls the tenants/create endpoint which runs this.
    """
    check_index_swap(db_session=db_session)
    search_settings = get_current_search_settings(db_session)
    secondary_search_settings = get_secondary_search_settings(db_session)
@@ -107,7 +114,8 @@ def setup_danswer(db_session: Session) -> None:
    if not MULTI_TENANT:
        mark_reindex_flag(db_session)

    # ensure Vespa is setup correctly
    # Ensure Vespa is setup correctly, this step is relatively near the end because Vespa
    # takes a bit of time to start up
    logger.notice("Verifying Document Index(s) is/are available.")
    document_index = get_default_document_index(
        primary_index_name=search_settings.index_name,
@@ -139,6 +147,8 @@ def setup_danswer(db_session: Session) -> None:
    # update multipass indexing setting based on GPU availability
    update_default_multipass_indexing(db_session)

    seed_initial_documents(db_session)


def translate_saved_search_settings(db_session: Session) -> None:
    kv_store = get_kv_store()
@@ -311,11 +321,13 @@ def update_default_multipass_indexing(db_session: Session) -> None:


def setup_multitenant_danswer() -> None:
    # For Managed Vespa, the schema is sent over via the Vespa Console manually.
    if not MANAGED_VESPA:
        setup_vespa_multitenant(SUPPORTED_EMBEDDING_MODELS)


def setup_vespa_multitenant(supported_indices: list[SupportedEmbeddingModel]) -> bool:
    # This is for local testing
    WAIT_SECONDS = 5
    VESPA_ATTEMPTS = 5
    for x in range(VESPA_ATTEMPTS):
@@ -22,18 +22,18 @@ def retry_builder(
    jitter: tuple[float, float] | float = 1,
) -> Callable[[F], F]:
    """Builds a generic wrapper/decorator for calls to external APIs that
    may fail due to rate limiting, flakes, or other reasons. Applies expontential
    may fail due to rate limiting, flakes, or other reasons. Applies exponential
    backoff with jitter to retry the call."""

    @retry(
        tries=tries,
        delay=delay,
        max_delay=max_delay,
        backoff=backoff,
        jitter=jitter,
        logger=cast(Logger, logger),
    )
    def retry_with_default(func: F) -> F:
        @retry(
            tries=tries,
            delay=delay,
            max_delay=max_delay,
            backoff=backoff,
            jitter=jitter,
            logger=cast(Logger, logger),
        )
        def wrapped_func(*args: list, **kwargs: dict[str, Any]) -> Any:
            return func(*args, **kwargs)
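The restructured retry_builder is used in this commit both as a decorator factory and inline (load_docs.py does retry_builder()(document_index.index) before pushing chunks to Vespa). A small usage sketch with a generic flaky HTTP call; the function and the parameter values are illustrative, not part of the diff:

```python
import requests  # assumption: available in the environment

from danswer.utils.retry_wrapper import retry_builder


@retry_builder(tries=5, delay=0.5)  # illustrative values, not the library defaults
def fetch_page(url: str) -> str:
    resp = requests.get(url, timeout=10)
    resp.raise_for_status()  # a transient 5xx here triggers the exponential backoff
    return resp.text


# Equivalent inline form, as used for Vespa indexing in load_docs.py:
# index_with_retries = retry_builder()(document_index.index)
```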
@@ -9,7 +9,6 @@ from googleapiclient.errors import HttpError  # type: ignore
from sqlalchemy.orm import Session

from danswer.access.models import ExternalAccess
from danswer.connectors.cross_connector_utils.retry_wrapper import retry_builder
from danswer.connectors.factory import instantiate_connector
from danswer.connectors.google_drive.connector_auth import (
    get_google_drive_creds,
@@ -19,6 +18,7 @@ from danswer.connectors.models import InputType
from danswer.db.models import ConnectorCredentialPair
from danswer.db.users import batch_add_non_web_user_if_not_exists__no_commit
from danswer.utils.logger import setup_logger
from danswer.utils.retry_wrapper import retry_builder
from ee.danswer.db.document import upsert_document_external_perms__no_commit

# Google Drive APIs are quite flakey and may 500 for an
@@ -7,7 +7,6 @@ from googleapiclient.discovery import build  # type: ignore
from googleapiclient.errors import HttpError  # type: ignore
from sqlalchemy.orm import Session

from danswer.connectors.cross_connector_utils.retry_wrapper import retry_builder
from danswer.connectors.google_drive.connector_auth import (
    get_google_drive_creds,
)
@@ -15,6 +14,7 @@ from danswer.connectors.google_drive.constants import FETCH_GROUPS_SCOPES
from danswer.db.models import ConnectorCredentialPair
from danswer.db.users import batch_add_non_web_user_if_not_exists__no_commit
from danswer.utils.logger import setup_logger
from danswer.utils.retry_wrapper import retry_builder
from ee.danswer.db.external_perm import ExternalUserGroup
from ee.danswer.db.external_perm import replace_user__ext_group_for_cc_pair__no_commit
240 backend/scripts/document_seeding_prep.py (new file)
@@ -0,0 +1,240 @@
# This script preps the documents used for initially seeding the index. It handles the embedding so that the
# documents can be added to the index with minimal processing.
import json

from pydantic import BaseModel
from sentence_transformers import SentenceTransformer  # type: ignore


class SeedPresaveDocument(BaseModel):
    url: str
    title: str
    content: str
    title_embedding: list[float]
    content_embedding: list[float]
    chunk_ind: int = 0


# Be sure to use the default embedding model
model = SentenceTransformer("nomic-ai/nomic-embed-text-v1", trust_remote_code=True)
tokenizer = model.tokenizer

# This is easier than cleaning up the crawl, needs to be updated if the sites are changed
overview_title = "Use Cases Overview"
overview = (
    "How to leverage Danswer in your organization\n\n"
    "Danswer Overview\n"
    "Danswer is the AI Assistant connected to your organization's docs, apps, and people. "
    "Danswer makes Generative AI more versatile for work by enabling new types of questions like "
    '"What is the most common feature request we\'ve heard from customers this month". '
    "Whereas other AI systems have no context of your team and are generally unhelpful with work related questions, "
    "Danswer makes it possible to ask these questions in natural language and get back answers in seconds.\n\n"
    "Danswer can connect to +30 different tools and the use cases are not limited to the ones in the following pages. "
    "The highlighted use cases are for inspiration and come from feedback gathered from our users and customers.\n\n\n"
    "Common Getting Started Questions:\n\n"
    "Why are these docs connected in my Danswer deployment?\n"
    "Answer: This is just an example of how connectors work in Danswer. You can connect up your own team's knowledge "
    "and you will be able to ask questions unique to your organization. Danswer will keep all of the knowledge up to date "
    "and in sync with your connected applications.\n\n"
    "Is my data being sent anywhere when I connect it up to Danswer?\n"
    "Answer: No! Danswer is built with data security as our highest priority. We open sourced it so our users can know "
    "exactly what is going on with their data. By default all of the document processing happens within Danswer. "
    "The only time it is sent outward is for the GenAI call to generate answers.\n\n"
    "Where is the feature for auto sync-ing document level access permissions from all connected sources?\n"
    "Answer: This falls under the Enterprise Edition set of Danswer features built on top of the MIT/community edition. "
    "If you are on Danswer Cloud, you have access to them by default. If you're running it yourself, reach out to the "
    "Danswer team to receive access."
)

enterprise_search_title = "Enterprise Search"
enterprise_search_1 = (
    "Value of Enterprise Search with Danswer\n\n"
    "What is Enterprise Search and why is it Important?\n"
    "An Enterprise Search system gives team members a single place to access all of the disparate knowledge "
    "of an organization. Critical information is saved across a host of channels like call transcripts with "
    "prospects, engineering design docs, IT runbooks, customer support email exchanges, project management "
    "tickets, and more. As fast moving teams scale up, information gets spread out and more disorganized.\n\n"
    "Since it quickly becomes infeasible to check across every source, decisions get made on incomplete "
    "information, employee satisfaction decreases, and the most valuable members of your team are tied up "
    "with constant distractions as junior teammates are unable to unblock themselves. Danswer solves this "
    "problem by letting anyone on the team access all of the knowledge across your organization in a "
    "permissioned and secure way. Users can ask questions in natural language and get back answers and "
    "documents across all of the connected sources instantly.\n\n"
    "What's the real cost?\n"
    "A typical knowledge worker spends over 2 hours a week on search, but more than that, the cost of "
    "incomplete or incorrect information can be extremely high. Customer support/success that isn't able "
    "to find the reference to similar cases could cause hours or even days of delay leading to lower "
    "customer satisfaction or in the worst case - churn. An account exec not realizing that a prospect had "
    "previously mentioned a specific need could lead to lost deals. An engineer not realizing a similar "
    "feature had previously been built could result in weeks of wasted development time and tech debt with "
    "duplicate implementation. With a lack of knowledge, your whole organization is navigating in the dark "
    "- inefficient and mistake prone."
)

enterprise_search_2 = (
    "More than Search\n"
    "When analyzing the entire corpus of knowledge within your company is as easy as asking a question "
    "in a search bar, your entire team can stay informed and up to date. Danswer also makes it trivial "
    "to identify where knowledge is well documented and where it is lacking. Team members who are centers "
    "of knowledge can begin to effectively document their expertise since it is no longer being thrown into "
    "a black hole. All of this allows the organization to achieve higher efficiency and drive business outcomes.\n\n"
    "With Generative AI, the entire user experience has evolved as well. For example, instead of just finding similar "
    "cases for your customer support team to reference, Danswer breaks down the issue and explains it so that even "
    "the most junior members can understand it. This in turn lets them give the most holistic and technically accurate "
    "response possible to your customers. On the other end, even the super stars of your sales team will not be able "
    "to review 10 hours of transcripts before hopping on that critical call, but Danswer can easily parse through it "
    "in mere seconds and give crucial context to help your team close."
)

ai_platform_title = "AI Platform"
ai_platform = (
    "Build AI Agents powered by the knowledge and workflows specific to your organization.\n\n"
    "Beyond Answers\n"
    "Agents enabled by generative AI and reasoning capable models are helping teams to automate their work. "
    "Danswer is helping teams make it happen. Danswer provides out of the box user chat sessions, attaching custom tools, "
    "handling LLM reasoning, code execution, data analysis, referencing internal knowledge, and much more.\n\n"
    "Danswer as a platform is not a no-code agent builder. We are made by developers for developers and this gives your "
    "team the full flexibility and power to create agents not constrained by blocks and simple logic paths.\n\n"
    "Flexibility and Extensibility\n"
    "Danswer is open source and completely whitebox. This not only gives transparency to what happens within the system "
    "but also means that your team can directly modify the source code to suit your unique needs."
)

customer_support_title = "Customer Support"
customer_support = (
    "Help your customer support team instantly answer any question across your entire product.\n\n"
    "AI Enabled Support\n"
    "Customer support agents have one of the highest breadth jobs. They field requests that cover the entire surface "
    "area of the product and need to help your users find success on extremely short timelines. "
    "Because they're not the same people who designed or built the system, they often lack the depth of understanding "
    "needed - resulting in delays and escalations to other teams. Modern teams are leveraging AI to help their CS team "
    "optimize the speed and quality of these critical customer-facing interactions.\n\n"
    "The Importance of Context\n"
    "There are two critical components of AI copilots for customer support. The first is that the AI system needs to be "
    "connected with as much information as possible (not just support tools like Zendesk or Intercom) and that the "
    "knowledge needs to be as fresh as possible. Sometimes a fix might even be in places rarely checked by CS such as "
    "pull requests in a code repository. The second critical component is the ability of the AI system to break down "
    "difficult concepts and convoluted processes into more digestible descriptions and for your team members to be able "
    "to chat back and forth with the system to build a better understanding.\n\n"
    "Danswer takes care of both of these. The system connects up to over 30+ different applications and the knowledge is "
    "pulled in constantly so that the information access is always up to date."
)

sales_title = "Sales"
sales = (
    "Keep your team up to date on every conversation and update so they can close.\n\n"
    "Recall Every Detail\n"
    "Being able to instantly revisit every detail of any call without reading transcripts is helping Sales teams provide "
    "more tailored pitches, build stronger relationships, and close more deals. Instead of searching and reading through "
    'hours of transcripts in preparation for a call, your team can now ask Danswer "What specific features was ACME '
    "interested in seeing for the demo\". Since your team doesn't have time to read every transcript prior to a call, "
    "Danswer provides a more thorough summary because it can instantly parse hundreds of pages and distill out the relevant "
    "information. Even for fast lookups it becomes much more convenient - for example to brush up on connection building "
    'topics by asking "What rapport building topic did we chat about in the last call with ACME".\n\n'
    "Know Every Product Update\n"
    "It is impossible for Sales teams to keep up with every product update. Because of this, when a prospect has a question "
    "that the Sales team does not know, they have no choice but to rely on the Product and Engineering orgs to get an "
    "authoritative answer. Not only is this distracting to the other teams, it also slows down the time to respond to the "
    "prospect (and as we know, time is the biggest killer of deals). With Danswer, it is even possible to get answers live "
    'on call because of how fast accessing information becomes. A question like "Have we shipped the Microsoft AD '
    'integration yet?" can now be answered in seconds meaning that prospects can get answers while on the call instead of '
    "asynchronously and sales cycles are reduced as a result."
)

operations_title = "Operations"
operations = (
    "Double the productivity of your Ops teams like IT, HR, etc.\n\n"
    "Automatically Resolve Tickets\n"
    "Modern teams are leveraging AI to auto-resolve up to 50% of tickets. Whether it is an employee asking about benefits "
    "details or how to set up the VPN for remote work, Danswer can help your team help themselves. This frees up your team to "
    "do the real impactful work of landing star candidates or improving your internal processes.\n\n"
    "AI Aided Onboarding\n"
    "One of the periods where your team needs the most help is when they're just ramping up. Instead of feeling lost in dozens "
    "of new tools, Danswer gives them a single place where they can ask about anything in natural language. Whether it's how to "
    "set up their work environment or what their onboarding goals are, Danswer can walk them through every step with the help "
    "of Generative AI. This lets your team feel more empowered and gives time back to the more seasoned members of your team to "
    "focus on moving the needle."
)

# For simplicity, we're not adding any metadata suffix here. Generally there is none for the Web connector anyway
overview_doc = SeedPresaveDocument(
    url="https://docs.danswer.dev/more/use_cases/overview",
    title=overview_title,
    content=overview,
    title_embedding=model.encode(f"search_document: {overview_title}"),
    content_embedding=model.encode(f"search_document: {overview_title}\n{overview}"),
)

enterprise_search_doc = SeedPresaveDocument(
    url="https://docs.danswer.dev/more/use_cases/enterprise_search",
    title=enterprise_search_title,
    content=enterprise_search_1,
    title_embedding=model.encode(f"search_document: {enterprise_search_title}"),
    content_embedding=model.encode(
        f"search_document: {enterprise_search_title}\n{enterprise_search_1}"
    ),
)

enterprise_search_doc_2 = SeedPresaveDocument(
    url="https://docs.danswer.dev/more/use_cases/enterprise_search",
    title=enterprise_search_title,
    content=enterprise_search_2,
    title_embedding=model.encode(f"search_document: {enterprise_search_title}"),
    content_embedding=model.encode(
        f"search_document: {enterprise_search_title}\n{enterprise_search_2}"
    ),
    chunk_ind=1,
)

ai_platform_doc = SeedPresaveDocument(
    url="https://docs.danswer.dev/more/use_cases/ai_platform",
    title=ai_platform_title,
    content=ai_platform,
    title_embedding=model.encode(f"search_document: {ai_platform_title}"),
    content_embedding=model.encode(
        f"search_document: {ai_platform_title}\n{ai_platform}"
    ),
)

customer_support_doc = SeedPresaveDocument(
    url="https://docs.danswer.dev/more/use_cases/customer_support",
    title=customer_support_title,
    content=customer_support,
    title_embedding=model.encode(f"search_document: {customer_support_title}"),
    content_embedding=model.encode(
        f"search_document: {customer_support_title}\n{customer_support}"
    ),
)

sales_doc = SeedPresaveDocument(
    url="https://docs.danswer.dev/more/use_cases/sales",
    title=sales_title,
    content=sales,
    title_embedding=model.encode(f"search_document: {sales_title}"),
    content_embedding=model.encode(f"search_document: {sales_title}\n{sales}"),
)

operations_doc = SeedPresaveDocument(
    url="https://docs.danswer.dev/more/use_cases/operations",
    title=operations_title,
    content=operations,
    title_embedding=model.encode(f"search_document: {operations_title}"),
    content_embedding=model.encode(
        f"search_document: {operations_title}\n{operations}"
    ),
)

documents = [
    overview_doc,
    enterprise_search_doc,
    enterprise_search_doc_2,
    ai_platform_doc,
    customer_support_doc,
    sales_doc,
    operations_doc,
]

documents_dict = [doc.model_dump() for doc in documents]

with open("./backend/danswer/seeding/initial_docs.json", "w") as json_file:
    json.dump(documents_dict, json_file, indent=4)
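The script writes one JSON object per SeedPresaveDocument, which is exactly what load_docs.py reads back at startup. A quick sketch for sanity-checking the generated file (keys follow the model above; the embedding dimension depends on the chosen model):

```python
import json

# Inspect the seed file written by the script above.
with open("./backend/danswer/seeding/initial_docs.json") as f:
    seed_chunks = json.load(f)

print(f"{len(seed_chunks)} pre-embedded chunks")
first = seed_chunks[0]
print(first["url"], first["title"], first["chunk_ind"])
print(f"{len(first['content_embedding'])} embedding dimensions")
```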
@@ -73,9 +73,7 @@ def _clear_slack_conversation_members(
        if member_id == admin_user_id:
            continue
        try:
            make_slack_api_call_w_retries(
                slack_client.conversations_kick, channel=channel_id, user=member_id
            )
            slack_client.conversations_kick(channel=channel_id, user=member_id)
            print(f"Kicked member: {member_id}")
        except Exception as e:
            if "cant_kick_self" in str(e):
@@ -83,9 +81,7 @@ def _clear_slack_conversation_members(
            print(f"Error kicking member: {e}")
            print(member_id)
    try:
        make_slack_api_call_w_retries(
            slack_client.conversations_unarchive, channel=channel_id
        )
        slack_client.conversations_unarchive(channel=channel_id)
        channel["is_archived"] = False
    except Exception:
        # Channel is already unarchived
@@ -98,11 +94,7 @@ def _add_slack_conversation_members(
    channel_id = _get_slack_channel_id(channel)
    for user_id in member_ids:
        try:
            make_slack_api_call_w_retries(
                slack_client.conversations_invite,
                channel=channel_id,
                users=user_id,
            )
            slack_client.conversations_invite(channel=channel_id, users=user_id)
        except Exception as e:
            if "already_in_channel" in str(e):
                continue
@@ -129,11 +121,7 @@ def _delete_slack_conversation_messages(
        try:
            if not (ts := message.get("ts")):
                raise ValueError("Message timestamp is missing")
            make_slack_api_call_w_retries(
                slack_client.chat_delete,
                channel=channel_id,
                ts=ts,
            )
            slack_client.chat_delete(channel=channel_id, ts=ts)
        except Exception as e:
            print(f"Error deleting message: {e}")
            print(message)
@@ -165,16 +153,12 @@ def _build_slack_channel_from_name(
        )

    try:
        channel_response = make_slack_api_call_w_retries(
            slack_client.conversations_unarchive,
            channel=channel_response["channel"]["id"],
        )
        slack_client.conversations_unarchive(channel=channel_response["channel"]["id"])
    except Exception:
        # Channel is already unarchived
        pass
    try:
        channel_response = make_slack_api_call_w_retries(
            slack_client.conversations_invite,
        slack_client.conversations_invite(
            channel=channel_response["channel"]["id"],
            users=[admin_user_id],
        )
@@ -302,10 +286,6 @@ class SlackManager:
        # "done" in the channel name indicates that this channel is free to be used for a new test
        new_name = f"done_{str(uuid4())}"
        try:
            make_slack_api_call_w_retries(
                slack_client.conversations_rename,
                channel=channel["id"],
                name=new_name,
            )
            slack_client.conversations_rename(channel=channel["id"], name=new_name)
        except SlackApiError as e:
            print(f"Error renaming channel {channel['id']}: {e}")