Seeding (#2902)

* checkpoint
* k
* k
* k
* fixed slack api calls
* missed one

Co-authored-by: hagen-danswer <hagen@danswer.ai>

parent 9f50417109
commit b49a9ab171
@@ -70,3 +70,12 @@ class DocumentAccess(ExternalAccess):
            user_groups=set(user_groups),
            is_public=is_public,
        )


default_public_access = DocumentAccess(
    external_user_emails=set(),
    external_user_group_ids=set(),
    user_emails=set(),
    user_groups=set(),
    is_public=True,
)
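For orientation, the seeding code added later in this commit hands this new module-level `default_public_access` object to every pre-embedded chunk rather than computing per-document permissions. A minimal sketch of what the object encodes, using only names from the hunk above (the assertions are an assumed reading of the semantics, not part of the diff):

```python
# Sketch only: the shared default marks seeded documents as public within the
# deployment, with no per-user or per-group restrictions attached.
from danswer.access.models import DocumentAccess, default_public_access

assert default_public_access.is_public
assert default_public_access.user_emails == set()
assert default_public_access.user_groups == set()

# Building an equivalent object by hand, mirroring the fields above:
equivalent = DocumentAccess(
    external_user_emails=set(),
    external_user_group_ids=set(),
    user_emails=set(),
    user_groups=set(),
    is_public=True,
)
```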
@@ -70,6 +70,7 @@ KV_CUSTOMER_UUID_KEY = "customer_uuid"
KV_INSTANCE_DOMAIN_KEY = "instance_domain"
KV_ENTERPRISE_SETTINGS_KEY = "danswer_enterprise_settings"
KV_CUSTOM_ANALYTICS_SCRIPT_KEY = "__custom_analytics_script__"
KV_DOCUMENTS_SEEDED_KEY = "documents_seeded"

CELERY_VESPA_SYNC_BEAT_LOCK_TIMEOUT = 60
CELERY_PRIMARY_WORKER_LOCK_TIMEOUT = 120
@@ -15,7 +15,6 @@ from danswer.connectors.cross_connector_utils.miscellaneous_utils import time_st
from danswer.connectors.cross_connector_utils.rate_limit_wrapper import (
    rate_limit_builder,
)
from danswer.connectors.cross_connector_utils.retry_wrapper import retry_builder
from danswer.connectors.interfaces import GenerateDocumentsOutput
from danswer.connectors.interfaces import PollConnector
from danswer.connectors.interfaces import SecondsSinceUnixEpoch
@@ -24,6 +23,7 @@ from danswer.connectors.models import Document
from danswer.connectors.models import Section
from danswer.file_processing.html_utils import parse_html_page_basic
from danswer.utils.logger import setup_logger
from danswer.utils.retry_wrapper import retry_builder


logger = setup_logger()
@@ -10,7 +10,6 @@ from danswer.configs.constants import DocumentSource
from danswer.connectors.cross_connector_utils.rate_limit_wrapper import (
    rate_limit_builder,
)
from danswer.connectors.cross_connector_utils.retry_wrapper import retry_builder
from danswer.connectors.interfaces import GenerateDocumentsOutput
from danswer.connectors.interfaces import LoadConnector
from danswer.connectors.interfaces import PollConnector
@@ -19,6 +18,7 @@ from danswer.connectors.models import BasicExpertInfo
from danswer.connectors.models import ConnectorMissingCredentialError
from danswer.connectors.models import Document
from danswer.connectors.models import Section
from danswer.utils.retry_wrapper import retry_builder


CLICKUP_API_BASE_URL = "https://api.clickup.com/api/v2"
@@ -14,7 +14,6 @@ from danswer.connectors.cross_connector_utils.miscellaneous_utils import time_st
from danswer.connectors.cross_connector_utils.rate_limit_wrapper import (
    rate_limit_builder,
)
from danswer.connectors.cross_connector_utils.retry_wrapper import retry_builder
from danswer.connectors.interfaces import GenerateDocumentsOutput
from danswer.connectors.interfaces import PollConnector
from danswer.connectors.interfaces import SecondsSinceUnixEpoch
@@ -24,6 +23,7 @@ from danswer.connectors.models import Document
from danswer.connectors.models import Section
from danswer.file_processing.html_utils import parse_html_page_basic
from danswer.utils.logger import setup_logger
from danswer.utils.retry_wrapper import retry_builder

logger = setup_logger()

@@ -11,7 +11,6 @@ from danswer.configs.constants import DocumentSource
from danswer.connectors.cross_connector_utils.rate_limit_wrapper import (
    rate_limit_builder,
)
from danswer.connectors.cross_connector_utils.retry_wrapper import retry_builder
from danswer.connectors.document360.utils import flatten_child_categories
from danswer.connectors.interfaces import GenerateDocumentsOutput
from danswer.connectors.interfaces import LoadConnector
@@ -22,6 +21,7 @@ from danswer.connectors.models import ConnectorMissingCredentialError
from danswer.connectors.models import Document
from danswer.connectors.models import Section
from danswer.file_processing.html_utils import parse_html_page_basic
from danswer.utils.retry_wrapper import retry_builder

# Limitations and Potential Improvements
# 1. The "Categories themselves contain potentially relevant information" but they're not pulled in
@@ -19,7 +19,6 @@ from danswer.configs.app_configs import GOOGLE_DRIVE_ONLY_ORG_PUBLIC
from danswer.configs.app_configs import INDEX_BATCH_SIZE
from danswer.configs.constants import DocumentSource
from danswer.configs.constants import IGNORE_FOR_QA
from danswer.connectors.cross_connector_utils.retry_wrapper import retry_builder
from danswer.connectors.google_drive.connector_auth import get_google_drive_creds
from danswer.connectors.google_drive.constants import (
    DB_CREDENTIALS_DICT_DELEGATED_USER_KEY,
@@ -40,6 +39,7 @@ from danswer.file_processing.unstructured import get_unstructured_api_key
from danswer.file_processing.unstructured import unstructured_to_text
from danswer.utils.batching import batch_generator
from danswer.utils.logger import setup_logger
from danswer.utils.retry_wrapper import retry_builder

logger = setup_logger()

@@ -10,9 +10,9 @@ from slack_sdk import WebClient
from slack_sdk.errors import SlackApiError
from slack_sdk.web import SlackResponse

from danswer.connectors.cross_connector_utils.retry_wrapper import retry_builder
from danswer.connectors.models import BasicExpertInfo
from danswer.utils.logger import setup_logger
from danswer.utils.retry_wrapper import retry_builder

logger = setup_logger()

@@ -341,6 +341,8 @@ def add_credential_to_connector(
    access_type: AccessType,
    groups: list[int] | None,
    auto_sync_options: dict | None = None,
    initial_status: ConnectorCredentialPairStatus = ConnectorCredentialPairStatus.ACTIVE,
    last_successful_index_time: datetime | None = None,
) -> StatusResponse:
    connector = fetch_connector_by_id(connector_id, db_session)
    credential = fetch_credential_by_id(credential_id, user, db_session)
@@ -384,9 +386,10 @@ def add_credential_to_connector(
        connector_id=connector_id,
        credential_id=credential_id,
        name=cc_pair_name,
        status=ConnectorCredentialPairStatus.ACTIVE,
        status=initial_status,
        access_type=access_type,
        auto_sync_options=auto_sync_options,
        last_successful_index_time=last_successful_index_time,
    )
    db_session.add(association)
    db_session.flush()  # make sure the association has an id
@@ -40,6 +40,8 @@ CREDENTIAL_PERMISSIONS_TO_IGNORE = {
    DocumentSource.MEDIAWIKI,
}

PUBLIC_CREDENTIAL_ID = 0


def _add_user_filters(
    stmt: Select,
@@ -384,12 +386,11 @@ def delete_credential(


def create_initial_public_credential(db_session: Session) -> None:
    public_cred_id = 0
    error_msg = (
        "DB is not in a valid initial state."
        "There must exist an empty public credential for data connectors that do not require additional Auth."
    )
    first_credential = fetch_credential_by_id(public_cred_id, None, db_session)
    first_credential = fetch_credential_by_id(PUBLIC_CREDENTIAL_ID, None, db_session)

    if first_credential is not None:
        if first_credential.credential_json != {} or first_credential.user is not None:
@@ -397,7 +398,7 @@ def create_initial_public_credential(db_session: Session) -> None:
        return

    credential = Credential(
        id=public_cred_id,
        id=PUBLIC_CREDENTIAL_ID,
        credential_json={},
        user_id=None,
    )
@@ -1,5 +1,6 @@
from collections.abc import Sequence
from datetime import datetime
from datetime import timedelta
from datetime import timezone

from sqlalchemy import and_
@@ -66,6 +67,32 @@ def create_index_attempt(
    return new_attempt.id


def mock_successful_index_attempt(
    connector_credential_pair_id: int,
    search_settings_id: int,
    docs_indexed: int,
    db_session: Session,
) -> int:
    """Should not be used in any user triggered flows"""
    db_time = func.now()
    new_attempt = IndexAttempt(
        connector_credential_pair_id=connector_credential_pair_id,
        search_settings_id=search_settings_id,
        from_beginning=True,
        status=IndexingStatus.SUCCESS,
        total_docs_indexed=docs_indexed,
        new_docs_indexed=docs_indexed,
        # Need this to be some convincing random looking value and it can't be 0
        # or the indexing rate would calculate out to infinity
        time_started=db_time - timedelta(seconds=1.92),
        time_updated=db_time,
    )
    db_session.add(new_attempt)
    db_session.commit()

    return new_attempt.id


def get_in_progress_index_attempts(
    connector_id: int | None,
    db_session: Session,
@@ -103,6 +103,9 @@ class DefaultIndexingEmbedder(IndexingEmbedder):
        self,
        chunks: list[DocAwareChunk],
    ) -> list[IndexChunk]:
        """Adds embeddings to the chunks, the title and metadata suffixes are added to the chunk as well
        if they exist. If there is no space for it, it would have been thrown out at the chunking step.
        """
        # All chunks at this point must have some non-empty content
        flat_chunk_texts: list[str] = []
        large_chunks_present = False
@@ -121,6 +124,11 @@ class DefaultIndexingEmbedder(IndexingEmbedder):
            flat_chunk_texts.append(chunk_text)

            if chunk.mini_chunk_texts:
                if chunk.large_chunk_reference_ids:
                    # A large chunk does not contain mini chunks, if it matches the large chunk
                    # with a high score, then mini chunks would not be used anyway
                    # otherwise it should match the normal chunk
                    raise RuntimeError("Large chunk contains mini chunks")
                flat_chunk_texts.extend(chunk.mini_chunk_texts)

        embeddings = self.embedding_model.encode(
@@ -195,6 +195,8 @@ def index_doc_batch_prepare(
    db_session: Session,
    ignore_time_skip: bool = False,
) -> DocumentBatchPrepareContext | None:
    """This sets up the documents in the relational DB (source of truth) for permissions, metadata, etc.
    This preceeds indexing it into the actual document index."""
    documents = []
    for document in document_batch:
        empty_contents = not any(section.text.strip() for section in document.sections)
@@ -3,5 +3,6 @@ from danswer.key_value_store.store import PgRedisKVStore


def get_kv_store() -> KeyValueStore:
    # this is the only one supported currently
    # In the Multi Tenant case, the tenant context is picked up automatically, it does not need to be passed in
    # It's read from the global thread level variable
    return PgRedisKVStore()
@@ -14,6 +14,8 @@ class KvKeyNotFoundError(Exception):


class KeyValueStore:
    # In the Multi Tenant case, the tenant context is picked up automatically, it does not need to be passed in
    # It's read from the global thread level variable
    @abc.abstractmethod
    def store(self, key: str, val: JSON_ro, encrypt: bool = False) -> None:
        raise NotImplementedError
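Taken together with the new KV_DOCUMENTS_SEEDED_KEY constant, this key-value interface is what makes the seeding run exactly once: load_docs.py (added below) treats a missing key as "not yet seeded" and writes the flag after indexing succeeds. A condensed sketch of that guard, using only calls that appear in this commit (the helper names are hypothetical):

```python
from danswer.configs.constants import KV_DOCUMENTS_SEEDED_KEY
from danswer.key_value_store.factory import get_kv_store
from danswer.key_value_store.interface import KvKeyNotFoundError


def documents_already_seeded() -> bool:  # hypothetical helper, not in the diff
    try:
        get_kv_store().load(KV_DOCUMENTS_SEEDED_KEY)  # raises if the flag was never set
        return True
    except KvKeyNotFoundError:
        return False


def mark_documents_seeded() -> None:  # hypothetical helper, mirrors load_docs.py
    get_kv_store().store(KV_DOCUMENTS_SEEDED_KEY, True)
```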
0 backend/danswer/seeding/__init__.py (new file)
10824 backend/danswer/seeding/initial_docs.json (new file; file diff suppressed because it is too large)
212 backend/danswer/seeding/load_docs.py (new file)
@@ -0,0 +1,212 @@
import datetime
import json
import os
from typing import cast

from sqlalchemy.orm import Session

from danswer.access.models import default_public_access
from danswer.configs.constants import DEFAULT_BOOST
from danswer.configs.constants import DocumentSource
from danswer.configs.constants import KV_DOCUMENTS_SEEDED_KEY
from danswer.configs.model_configs import DEFAULT_DOCUMENT_ENCODER_MODEL
from danswer.connectors.models import Document
from danswer.connectors.models import IndexAttemptMetadata
from danswer.connectors.models import InputType
from danswer.connectors.models import Section
from danswer.db.connector import check_connectors_exist
from danswer.db.connector import create_connector
from danswer.db.connector_credential_pair import add_credential_to_connector
from danswer.db.credentials import PUBLIC_CREDENTIAL_ID
from danswer.db.document import check_docs_exist
from danswer.db.enums import AccessType
from danswer.db.enums import ConnectorCredentialPairStatus
from danswer.db.index_attempt import mock_successful_index_attempt
from danswer.db.search_settings import get_current_search_settings
from danswer.document_index.factory import get_default_document_index
from danswer.indexing.indexing_pipeline import index_doc_batch_prepare
from danswer.indexing.models import ChunkEmbedding
from danswer.indexing.models import DocMetadataAwareIndexChunk
from danswer.key_value_store.factory import get_kv_store
from danswer.key_value_store.interface import KvKeyNotFoundError
from danswer.server.documents.models import ConnectorBase
from danswer.utils.logger import setup_logger
from danswer.utils.retry_wrapper import retry_builder


logger = setup_logger()


def _create_indexable_chunks(
    preprocessed_docs: list[dict],
) -> tuple[list[Document], list[DocMetadataAwareIndexChunk]]:
    ids_to_documents = {}
    chunks = []
    for preprocessed_doc in preprocessed_docs:
        document = Document(
            id=preprocessed_doc["url"],  # For Web connector, the URL is the ID
            # The section is not really used past this point since we have already done the other processing
            # for the chunking and embedding.
            sections=[
                Section(text=preprocessed_doc["content"], link=preprocessed_doc["url"])
            ],
            source=DocumentSource.WEB,
            semantic_identifier=preprocessed_doc["title"],
            metadata={},
            doc_updated_at=None,
            primary_owners=[],
            secondary_owners=[],
        )
        if preprocessed_doc["chunk_ind"] == 0:
            ids_to_documents[document.id] = document

        chunk = DocMetadataAwareIndexChunk(
            chunk_id=preprocessed_doc["chunk_ind"],
            blurb=preprocessed_doc["content"]
            .split(".", 1)[0]
            .split("!", 1)[0]
            .split("?", 1)[0],
            content=preprocessed_doc["content"],
            source_links={0: preprocessed_doc["url"]},
            section_continuation=False,
            source_document=document,
            title_prefix=preprocessed_doc["title"],
            metadata_suffix_semantic="",
            metadata_suffix_keyword="",
            mini_chunk_texts=None,
            large_chunk_reference_ids=[],
            embeddings=ChunkEmbedding(
                full_embedding=preprocessed_doc["content_embedding"],
                mini_chunk_embeddings=[],
            ),
            title_embedding=preprocessed_doc["title_embedding"],
            tenant_id=None,
            access=default_public_access,
            document_sets=set(),
            boost=DEFAULT_BOOST,
        )
        chunks.append(chunk)

    return list(ids_to_documents.values()), chunks


def seed_initial_documents(db_session: Session) -> None:
    """
    Seed initial documents so users don't have an empty index to start

    Documents are only loaded if:
    - This is the first setup (if the user deletes the docs, we don't load them again)
    - The index is empty, there are no docs and no (non-default) connectors
    - The user has not updated the embedding models
        - If they do, then we have to actually index the website
        - If the embedding model is already updated on server startup, they're not a new user

    Note that regardless of any search settings, the default documents are always loaded with
    the predetermined chunk sizes and single pass embedding.

    Steps are as follows:
    - Check if this needs to run
    - Create the connector representing this
    - Create the cc-pair (attaching the public credential) and mocking values like the last success
    - Indexing the documents into Postgres
    - Indexing the documents into Vespa
    - Create a fake index attempt with fake times
    """
    logger.info("Seeding initial documents")

    kv_store = get_kv_store()
    try:
        kv_store.load(KV_DOCUMENTS_SEEDED_KEY)
        logger.info("Documents already seeded, skipping")
        return
    except KvKeyNotFoundError:
        pass

    if check_docs_exist(db_session):
        logger.info("Documents already exist, skipping")
        return

    if check_connectors_exist(db_session):
        logger.info("Connectors already exist, skipping")
        return

    search_settings = get_current_search_settings(db_session)
    if search_settings.model_name != DEFAULT_DOCUMENT_ENCODER_MODEL:
        logger.info("Embedding model has been updated, skipping")
        return

    document_index = get_default_document_index(
        primary_index_name=search_settings.index_name, secondary_index_name=None
    )

    # Create a connector so the user can delete it if they want
    # or reindex it with a new search model if they want
    connector_data = ConnectorBase(
        name="Sample Use Cases",
        source=DocumentSource.WEB,
        input_type=InputType.LOAD_STATE,
        connector_specific_config={
            "base_url": "https://docs.danswer.dev/more/use_cases",
            "web_connector_type": "recursive",
        },
        refresh_freq=None,  # Never refresh by default
        prune_freq=None,
        indexing_start=None,
    )

    connector = create_connector(db_session, connector_data)
    connector_id = cast(int, connector.id)

    last_index_time = datetime.datetime.now(datetime.timezone.utc)

    result = add_credential_to_connector(
        db_session=db_session,
        user=None,
        connector_id=connector_id,
        credential_id=PUBLIC_CREDENTIAL_ID,
        access_type=AccessType.PUBLIC,
        cc_pair_name=connector_data.name,
        groups=None,
        initial_status=ConnectorCredentialPairStatus.PAUSED,
        last_successful_index_time=last_index_time,
    )
    cc_pair_id = cast(int, result.data)

    initial_docs_path = os.path.join(
        os.getcwd(), "danswer", "seeding", "initial_docs.json"
    )
    processed_docs = json.load(open(initial_docs_path))

    docs, chunks = _create_indexable_chunks(processed_docs)

    index_doc_batch_prepare(
        document_batch=docs,
        index_attempt_metadata=IndexAttemptMetadata(
            connector_id=connector_id,
            credential_id=PUBLIC_CREDENTIAL_ID,
        ),
        db_session=db_session,
        ignore_time_skip=True,  # Doesn't actually matter here
    )

    # In this case since there are no other connectors running in the background
    # and this is a fresh deployment, there is no need to grab any locks
    logger.info(
        "Indexing seeding documents into Vespa "
        "(Vespa may take a few seconds to become ready after receiving the schema)"
    )

    # Retries here because the index may take a few seconds to become ready
    # as we just sent over the Vespa schema and there is a slight delay
    index_with_retries = retry_builder()(document_index.index)
    index_with_retries(chunks=chunks)

    # Mock a run for the UI even though it did not actually call out to anything
    mock_successful_index_attempt(
        connector_credential_pair_id=cc_pair_id,
        search_settings_id=search_settings.id,
        docs_indexed=len(docs),
        db_session=db_session,
    )

    kv_store.store(KV_DOCUMENTS_SEEDED_KEY, True)
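This function is wired into server startup in the setup.py hunk below; if you want to exercise it by hand (for example from a dev shell against a configured database), a rough sketch follows. The session helper here is an assumption about existing danswer.db.engine utilities, not something introduced by this commit:

```python
from danswer.db.engine import get_session_context_manager  # assumed existing helper
from danswer.seeding.load_docs import seed_initial_documents

# The call is idempotent: it returns early if documents, connectors, or the
# documents_seeded KV flag already exist, or if the embedding model was changed.
with get_session_context_manager() as db_session:
    seed_initial_documents(db_session)
```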
@@ -40,6 +40,7 @@ from danswer.natural_language_processing.search_nlp_models import warm_up_bi_enc
from danswer.natural_language_processing.search_nlp_models import warm_up_cross_encoder
from danswer.search.models import SavedSearchSettings
from danswer.search.retrieval.search_runner import download_nltk_data
from danswer.seeding.load_docs import seed_initial_documents
from danswer.server.manage.llm.models import LLMProviderUpsertRequest
from danswer.server.settings.store import load_settings
from danswer.server.settings.store import store_settings
@@ -58,6 +59,12 @@ logger = setup_logger()


def setup_danswer(db_session: Session) -> None:
    """
    Setup Danswer for a particular tenant. In the Single Tenant case, it will set it up for the default schema
    on server startup. In the MT case, it will be called when the tenant is created.

    The Tenant Service calls the tenants/create endpoint which runs this.
    """
    check_index_swap(db_session=db_session)
    search_settings = get_current_search_settings(db_session)
    secondary_search_settings = get_secondary_search_settings(db_session)
@@ -107,7 +114,8 @@ def setup_danswer(db_session: Session) -> None:
    if not MULTI_TENANT:
        mark_reindex_flag(db_session)

    # ensure Vespa is setup correctly
    # Ensure Vespa is setup correctly, this step is relatively near the end because Vespa
    # takes a bit of time to start up
    logger.notice("Verifying Document Index(s) is/are available.")
    document_index = get_default_document_index(
        primary_index_name=search_settings.index_name,
@@ -139,6 +147,8 @@ def setup_danswer(db_session: Session) -> None:
    # update multipass indexing setting based on GPU availability
    update_default_multipass_indexing(db_session)

    seed_initial_documents(db_session)


def translate_saved_search_settings(db_session: Session) -> None:
    kv_store = get_kv_store()
@@ -311,11 +321,13 @@ def update_default_multipass_indexing(db_session: Session) -> None:


def setup_multitenant_danswer() -> None:
    # For Managed Vespa, the schema is sent over via the Vespa Console manually.
    if not MANAGED_VESPA:
        setup_vespa_multitenant(SUPPORTED_EMBEDDING_MODELS)


def setup_vespa_multitenant(supported_indices: list[SupportedEmbeddingModel]) -> bool:
    # This is for local testing
    WAIT_SECONDS = 5
    VESPA_ATTEMPTS = 5
    for x in range(VESPA_ATTEMPTS):
@@ -22,18 +22,18 @@ def retry_builder(
    jitter: tuple[float, float] | float = 1,
) -> Callable[[F], F]:
    """Builds a generic wrapper/decorator for calls to external APIs that
    may fail due to rate limiting, flakes, or other reasons. Applies expontential
    may fail due to rate limiting, flakes, or other reasons. Applies exponential
    backoff with jitter to retry the call."""

    @retry(
        tries=tries,
        delay=delay,
        max_delay=max_delay,
        backoff=backoff,
        jitter=jitter,
        logger=cast(Logger, logger),
    )
    def retry_with_default(func: F) -> F:
        @retry(
            tries=tries,
            delay=delay,
            max_delay=max_delay,
            backoff=backoff,
            jitter=jitter,
            logger=cast(Logger, logger),
        )
        def wrapped_func(*args: list, **kwargs: dict[str, Any]) -> Any:
            return func(*args, **kwargs)
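The restructured retry_builder is used in this commit both as a decorator factory and inline (load_docs.py does retry_builder()(document_index.index) before pushing chunks to Vespa). A small usage sketch with a generic flaky HTTP call; the function and the parameter values are illustrative, not part of the diff:

```python
import requests  # assumption: available in the environment

from danswer.utils.retry_wrapper import retry_builder


@retry_builder(tries=5, delay=0.5)  # illustrative values, not the library defaults
def fetch_page(url: str) -> str:
    resp = requests.get(url, timeout=10)
    resp.raise_for_status()  # a transient 5xx here triggers the exponential backoff
    return resp.text


# Equivalent inline form, as used for Vespa indexing in load_docs.py:
# index_with_retries = retry_builder()(document_index.index)
```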
@@ -9,7 +9,6 @@ from googleapiclient.errors import HttpError  # type: ignore
from sqlalchemy.orm import Session

from danswer.access.models import ExternalAccess
from danswer.connectors.cross_connector_utils.retry_wrapper import retry_builder
from danswer.connectors.factory import instantiate_connector
from danswer.connectors.google_drive.connector_auth import (
    get_google_drive_creds,
@@ -19,6 +18,7 @@ from danswer.connectors.models import InputType
from danswer.db.models import ConnectorCredentialPair
from danswer.db.users import batch_add_non_web_user_if_not_exists__no_commit
from danswer.utils.logger import setup_logger
from danswer.utils.retry_wrapper import retry_builder
from ee.danswer.db.document import upsert_document_external_perms__no_commit

# Google Drive APIs are quite flakey and may 500 for an
@@ -7,7 +7,6 @@ from googleapiclient.discovery import build  # type: ignore
from googleapiclient.errors import HttpError  # type: ignore
from sqlalchemy.orm import Session

from danswer.connectors.cross_connector_utils.retry_wrapper import retry_builder
from danswer.connectors.google_drive.connector_auth import (
    get_google_drive_creds,
)
@@ -15,6 +14,7 @@ from danswer.connectors.google_drive.constants import FETCH_GROUPS_SCOPES
from danswer.db.models import ConnectorCredentialPair
from danswer.db.users import batch_add_non_web_user_if_not_exists__no_commit
from danswer.utils.logger import setup_logger
from danswer.utils.retry_wrapper import retry_builder
from ee.danswer.db.external_perm import ExternalUserGroup
from ee.danswer.db.external_perm import replace_user__ext_group_for_cc_pair__no_commit
240 backend/scripts/document_seeding_prep.py (new file)
@@ -0,0 +1,240 @@
# This script preps the documents used for initially seeding the index. It handles the embedding so that the
# documents can be added to the index with minimal processing.
import json

from pydantic import BaseModel
from sentence_transformers import SentenceTransformer  # type: ignore


class SeedPresaveDocument(BaseModel):
    url: str
    title: str
    content: str
    title_embedding: list[float]
    content_embedding: list[float]
    chunk_ind: int = 0


# Be sure to use the default embedding model
model = SentenceTransformer("nomic-ai/nomic-embed-text-v1", trust_remote_code=True)
tokenizer = model.tokenizer

# This is easier than cleaning up the crawl, needs to be updated if the sites are changed
overview_title = "Use Cases Overview"
overview = (
    "How to leverage Danswer in your organization\n\n"
    "Danswer Overview\n"
    "Danswer is the AI Assistant connected to your organization's docs, apps, and people. "
    "Danswer makes Generative AI more versatile for work by enabling new types of questions like "
    '"What is the most common feature request we\'ve heard from customers this month". '
    "Whereas other AI systems have no context of your team and are generally unhelpful with work related questions, "
    "Danswer makes it possible to ask these questions in natural language and get back answers in seconds.\n\n"
    "Danswer can connect to +30 different tools and the use cases are not limited to the ones in the following pages. "
    "The highlighted use cases are for inspiration and come from feedback gathered from our users and customers.\n\n\n"
    "Common Getting Started Questions:\n\n"
    "Why are these docs connected in my Danswer deployment?\n"
    "Answer: This is just an example of how connectors work in Danswer. You can connect up your own team's knowledge "
    "and you will be able to ask questions unique to your organization. Danswer will keep all of the knowledge up to date "
    "and in sync with your connected applications.\n\n"
    "Is my data being sent anywhere when I connect it up to Danswer?\n"
    "Answer: No! Danswer is built with data security as our highest priority. We open sourced it so our users can know "
    "exactly what is going on with their data. By default all of the document processing happens within Danswer. "
    "The only time it is sent outward is for the GenAI call to generate answers.\n\n"
    "Where is the feature for auto sync-ing document level access permissions from all connected sources?\n"
    "Answer: This falls under the Enterprise Edition set of Danswer features built on top of the MIT/community edition. "
    "If you are on Danswer Cloud, you have access to them by default. If you're running it yourself, reach out to the "
    "Danswer team to receive access."
)

enterprise_search_title = "Enterprise Search"
enterprise_search_1 = (
    "Value of Enterprise Search with Danswer\n\n"
    "What is Enterprise Search and why is it Important?\n"
    "An Enterprise Search system gives team members a single place to access all of the disparate knowledge "
    "of an organization. Critical information is saved across a host of channels like call transcripts with "
    "prospects, engineering design docs, IT runbooks, customer support email exchanges, project management "
    "tickets, and more. As fast moving teams scale up, information gets spread out and more disorganized.\n\n"
    "Since it quickly becomes infeasible to check across every source, decisions get made on incomplete "
    "information, employee satisfaction decreases, and the most valuable members of your team are tied up "
    "with constant distractions as junior teammates are unable to unblock themselves. Danswer solves this "
    "problem by letting anyone on the team access all of the knowledge across your organization in a "
    "permissioned and secure way. Users can ask questions in natural language and get back answers and "
    "documents across all of the connected sources instantly.\n\n"
    "What's the real cost?\n"
    "A typical knowledge worker spends over 2 hours a week on search, but more than that, the cost of "
    "incomplete or incorrect information can be extremely high. Customer support/success that isn't able "
    "to find the reference to similar cases could cause hours or even days of delay leading to lower "
    "customer satisfaction or in the worst case - churn. An account exec not realizing that a prospect had "
    "previously mentioned a specific need could lead to lost deals. An engineer not realizing a similar "
    "feature had previously been built could result in weeks of wasted development time and tech debt with "
    "duplicate implementation. With a lack of knowledge, your whole organization is navigating in the dark "
    "- inefficient and mistake prone."
)

enterprise_search_2 = (
    "More than Search\n"
    "When analyzing the entire corpus of knowledge within your company is as easy as asking a question "
    "in a search bar, your entire team can stay informed and up to date. Danswer also makes it trivial "
    "to identify where knowledge is well documented and where it is lacking. Team members who are centers "
    "of knowledge can begin to effectively document their expertise since it is no longer being thrown into "
    "a black hole. All of this allows the organization to achieve higher efficiency and drive business outcomes.\n\n"
    "With Generative AI, the entire user experience has evolved as well. For example, instead of just finding similar "
    "cases for your customer support team to reference, Danswer breaks down the issue and explains it so that even "
    "the most junior members can understand it. This in turn lets them give the most holistic and technically accurate "
    "response possible to your customers. On the other end, even the super stars of your sales team will not be able "
    "to review 10 hours of transcripts before hopping on that critical call, but Danswer can easily parse through it "
    "in mere seconds and give crucial context to help your team close."
)

ai_platform_title = "AI Platform"
ai_platform = (
    "Build AI Agents powered by the knowledge and workflows specific to your organization.\n\n"
    "Beyond Answers\n"
    "Agents enabled by generative AI and reasoning capable models are helping teams to automate their work. "
    "Danswer is helping teams make it happen. Danswer provides out of the box user chat sessions, attaching custom tools, "
    "handling LLM reasoning, code execution, data analysis, referencing internal knowledge, and much more.\n\n"
    "Danswer as a platform is not a no-code agent builder. We are made by developers for developers and this gives your "
    "team the full flexibility and power to create agents not constrained by blocks and simple logic paths.\n\n"
    "Flexibility and Extensibility\n"
    "Danswer is open source and completely whitebox. This not only gives transparency to what happens within the system "
    "but also means that your team can directly modify the source code to suit your unique needs."
)

customer_support_title = "Customer Support"
customer_support = (
    "Help your customer support team instantly answer any question across your entire product.\n\n"
    "AI Enabled Support\n"
    "Customer support agents have one of the highest breadth jobs. They field requests that cover the entire surface "
    "area of the product and need to help your users find success on extremely short timelines. "
    "Because they're not the same people who designed or built the system, they often lack the depth of understanding "
    "needed - resulting in delays and escalations to other teams. Modern teams are leveraging AI to help their CS team "
    "optimize the speed and quality of these critical customer-facing interactions.\n\n"
    "The Importance of Context\n"
    "There are two critical components of AI copilots for customer support. The first is that the AI system needs to be "
    "connected with as much information as possible (not just support tools like Zendesk or Intercom) and that the "
    "knowledge needs to be as fresh as possible. Sometimes a fix might even be in places rarely checked by CS such as "
    "pull requests in a code repository. The second critical component is the ability of the AI system to break down "
    "difficult concepts and convoluted processes into more digestible descriptions and for your team members to be able "
    "to chat back and forth with the system to build a better understanding.\n\n"
    "Danswer takes care of both of these. The system connects up to over 30+ different applications and the knowledge is "
    "pulled in constantly so that the information access is always up to date."
)

sales_title = "Sales"
sales = (
    "Keep your team up to date on every conversation and update so they can close.\n\n"
    "Recall Every Detail\n"
    "Being able to instantly revisit every detail of any call without reading transcripts is helping Sales teams provide "
    "more tailored pitches, build stronger relationships, and close more deals. Instead of searching and reading through "
    'hours of transcripts in preparation for a call, your team can now ask Danswer "What specific features was ACME '
    "interested in seeing for the demo\". Since your team doesn't have time to read every transcript prior to a call, "
    "Danswer provides a more thorough summary because it can instantly parse hundreds of pages and distill out the relevant "
    "information. Even for fast lookups it becomes much more convenient - for example to brush up on connection building "
    'topics by asking "What rapport building topic did we chat about in the last call with ACME".\n\n'
    "Know Every Product Update\n"
    "It is impossible for Sales teams to keep up with every product update. Because of this, when a prospect has a question "
    "that the Sales team does not know, they have no choice but to rely on the Product and Engineering orgs to get an "
    "authoritative answer. Not only is this distracting to the other teams, it also slows down the time to respond to the "
    "prospect (and as we know, time is the biggest killer of deals). With Danswer, it is even possible to get answers live "
    'on call because of how fast accessing information becomes. A question like "Have we shipped the Microsoft AD '
    'integration yet?" can now be answered in seconds meaning that prospects can get answers while on the call instead of '
    "asynchronously and sales cycles are reduced as a result."
)

operations_title = "Operations"
operations = (
    "Double the productivity of your Ops teams like IT, HR, etc.\n\n"
    "Automatically Resolve Tickets\n"
    "Modern teams are leveraging AI to auto-resolve up to 50% of tickets. Whether it is an employee asking about benefits "
    "details or how to set up the VPN for remote work, Danswer can help your team help themselves. This frees up your team to "
    "do the real impactful work of landing star candidates or improving your internal processes.\n\n"
    "AI Aided Onboarding\n"
    "One of the periods where your team needs the most help is when they're just ramping up. Instead of feeling lost in dozens "
    "of new tools, Danswer gives them a single place where they can ask about anything in natural language. Whether it's how to "
    "set up their work environment or what their onboarding goals are, Danswer can walk them through every step with the help "
    "of Generative AI. This lets your team feel more empowered and gives time back to the more seasoned members of your team to "
    "focus on moving the needle."
)

# For simplicity, we're not adding any metadata suffix here. Generally there is none for the Web connector anyway
overview_doc = SeedPresaveDocument(
    url="https://docs.danswer.dev/more/use_cases/overview",
    title=overview_title,
    content=overview,
    title_embedding=model.encode(f"search_document: {overview_title}"),
    content_embedding=model.encode(f"search_document: {overview_title}\n{overview}"),
)

enterprise_search_doc = SeedPresaveDocument(
    url="https://docs.danswer.dev/more/use_cases/enterprise_search",
    title=enterprise_search_title,
    content=enterprise_search_1,
    title_embedding=model.encode(f"search_document: {enterprise_search_title}"),
    content_embedding=model.encode(
        f"search_document: {enterprise_search_title}\n{enterprise_search_1}"
    ),
)

enterprise_search_doc_2 = SeedPresaveDocument(
    url="https://docs.danswer.dev/more/use_cases/enterprise_search",
    title=enterprise_search_title,
    content=enterprise_search_2,
    title_embedding=model.encode(f"search_document: {enterprise_search_title}"),
    content_embedding=model.encode(
        f"search_document: {enterprise_search_title}\n{enterprise_search_2}"
    ),
    chunk_ind=1,
)

ai_platform_doc = SeedPresaveDocument(
    url="https://docs.danswer.dev/more/use_cases/ai_platform",
    title=ai_platform_title,
    content=ai_platform,
    title_embedding=model.encode(f"search_document: {ai_platform_title}"),
    content_embedding=model.encode(
        f"search_document: {ai_platform_title}\n{ai_platform}"
    ),
)

customer_support_doc = SeedPresaveDocument(
    url="https://docs.danswer.dev/more/use_cases/customer_support",
    title=customer_support_title,
    content=customer_support,
    title_embedding=model.encode(f"search_document: {customer_support_title}"),
    content_embedding=model.encode(
        f"search_document: {customer_support_title}\n{customer_support}"
    ),
)

sales_doc = SeedPresaveDocument(
    url="https://docs.danswer.dev/more/use_cases/sales",
    title=sales_title,
    content=sales,
    title_embedding=model.encode(f"search_document: {sales_title}"),
    content_embedding=model.encode(f"search_document: {sales_title}\n{sales}"),
)

operations_doc = SeedPresaveDocument(
    url="https://docs.danswer.dev/more/use_cases/operations",
    title=operations_title,
    content=operations,
    title_embedding=model.encode(f"search_document: {operations_title}"),
    content_embedding=model.encode(
        f"search_document: {operations_title}\n{operations}"
    ),
)

documents = [
    overview_doc,
    enterprise_search_doc,
    enterprise_search_doc_2,
    ai_platform_doc,
    customer_support_doc,
    sales_doc,
    operations_doc,
]

documents_dict = [doc.model_dump() for doc in documents]

with open("./backend/danswer/seeding/initial_docs.json", "w") as json_file:
    json.dump(documents_dict, json_file, indent=4)
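The script writes one JSON object per SeedPresaveDocument, which is exactly what load_docs.py reads back at startup. A quick sketch for sanity-checking the generated file (keys follow the model above; the embedding dimension depends on the chosen model):

```python
import json

# Inspect the seed file written by the script above.
with open("./backend/danswer/seeding/initial_docs.json") as f:
    seed_chunks = json.load(f)

print(f"{len(seed_chunks)} pre-embedded chunks")
first = seed_chunks[0]
print(first["url"], first["title"], first["chunk_ind"])
print(f"{len(first['content_embedding'])} embedding dimensions")
```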
@@ -73,9 +73,7 @@ def _clear_slack_conversation_members(
        if member_id == admin_user_id:
            continue
        try:
            make_slack_api_call_w_retries(
                slack_client.conversations_kick, channel=channel_id, user=member_id
            )
            slack_client.conversations_kick(channel=channel_id, user=member_id)
            print(f"Kicked member: {member_id}")
        except Exception as e:
            if "cant_kick_self" in str(e):
@@ -83,9 +81,7 @@ def _clear_slack_conversation_members(
            print(f"Error kicking member: {e}")
            print(member_id)
    try:
        make_slack_api_call_w_retries(
            slack_client.conversations_unarchive, channel=channel_id
        )
        slack_client.conversations_unarchive(channel=channel_id)
        channel["is_archived"] = False
    except Exception:
        # Channel is already unarchived
@@ -98,11 +94,7 @@ def _add_slack_conversation_members(
    channel_id = _get_slack_channel_id(channel)
    for user_id in member_ids:
        try:
            make_slack_api_call_w_retries(
                slack_client.conversations_invite,
                channel=channel_id,
                users=user_id,
            )
            slack_client.conversations_invite(channel=channel_id, users=user_id)
        except Exception as e:
            if "already_in_channel" in str(e):
                continue
@@ -129,11 +121,7 @@ def _delete_slack_conversation_messages(
        try:
            if not (ts := message.get("ts")):
                raise ValueError("Message timestamp is missing")
            make_slack_api_call_w_retries(
                slack_client.chat_delete,
                channel=channel_id,
                ts=ts,
            )
            slack_client.chat_delete(channel=channel_id, ts=ts)
        except Exception as e:
            print(f"Error deleting message: {e}")
            print(message)
@@ -165,16 +153,12 @@ def _build_slack_channel_from_name(
        )

    try:
        channel_response = make_slack_api_call_w_retries(
            slack_client.conversations_unarchive,
            channel=channel_response["channel"]["id"],
        )
        slack_client.conversations_unarchive(channel=channel_response["channel"]["id"])
    except Exception:
        # Channel is already unarchived
        pass
    try:
        channel_response = make_slack_api_call_w_retries(
            slack_client.conversations_invite,
        slack_client.conversations_invite(
            channel=channel_response["channel"]["id"],
            users=[admin_user_id],
        )
@@ -302,10 +286,6 @@ class SlackManager:
        # "done" in the channel name indicates that this channel is free to be used for a new test
        new_name = f"done_{str(uuid4())}"
        try:
            make_slack_api_call_w_retries(
                slack_client.conversations_rename,
                channel=channel["id"],
                name=new_name,
            )
            slack_client.conversations_rename(channel=channel["id"], name=new_name)
        except SlackApiError as e:
            print(f"Error renaming channel {channel['id']}: {e}")