"""
|
|
launch:
|
|
- api server
|
|
- postgres
|
|
- vespa
|
|
- model server (this is only needed so the api server can startup, no embedding is done)
|
|
|
|
Run this script to seed the database with dummy documents.
|
|
Then run test_query_times.py to test query times.
|
|
"""
|
|
import random
from datetime import datetime

from onyx.access.models import DocumentAccess
from onyx.configs.constants import DocumentSource
from onyx.connectors.models import Document
from onyx.db.engine import get_session_context_manager
from onyx.db.search_settings import get_current_search_settings
from onyx.document_index.document_index_utils import get_multipass_config
from onyx.document_index.vespa.index import VespaIndex
from onyx.indexing.indexing_pipeline import IndexBatchParams
from onyx.indexing.models import ChunkEmbedding
from onyx.indexing.models import DocMetadataAwareIndexChunk
from onyx.indexing.models import IndexChunk
from onyx.utils.timing import log_function_time
from shared_configs.configs import POSTGRES_DEFAULT_SCHEMA
from shared_configs.model_server_models import Embedding

TOTAL_DOC_SETS = 8
TOTAL_ACL_ENTRIES_PER_CATEGORY = 80


def generate_random_embedding(dim: int) -> Embedding:
    return [random.uniform(-1, 1) for _ in range(dim)]


def generate_random_identifier() -> str:
    return f"dummy_doc_{random.randint(1, 1000)}"
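

# Build one chunk with random embeddings. Every chunk is tagged with all
# `number_of_document_sets` document sets and `number_of_acl_entries` entries per
# ACL category, so query-time document-set and ACL filtering is exercised with a
# realistically large fan-out.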
def generate_dummy_chunk(
    doc_id: str,
    chunk_id: int,
    embedding_dim: int,
    number_of_acl_entries: int,
    number_of_document_sets: int,
) -> DocMetadataAwareIndexChunk:
    document = Document(
        id=doc_id,
        source=DocumentSource.GOOGLE_DRIVE,
        sections=[],
        metadata={},
        semantic_identifier=generate_random_identifier(),
    )

    chunk = IndexChunk(
        chunk_id=chunk_id,
        blurb=f"Blurb for chunk {chunk_id} of document {doc_id}.",
        content=f"Content for chunk {chunk_id} of document {doc_id}. This is dummy text for testing purposes.",
        source_links={},
        section_continuation=False,
        source_document=document,
        title_prefix=f"Title prefix for doc {doc_id}",
        metadata_suffix_semantic="",
        metadata_suffix_keyword="",
        mini_chunk_texts=None,
        embeddings=ChunkEmbedding(
            full_embedding=generate_random_embedding(embedding_dim),
            mini_chunk_embeddings=[],
        ),
        title_embedding=generate_random_embedding(embedding_dim),
        large_chunk_id=None,
        large_chunk_reference_ids=[],
        image_file_name=None,
    )

    document_set_names = []
    for i in range(number_of_document_sets):
        document_set_names.append(f"Document Set {i}")

    user_emails: set[str | None] = set()
    user_groups: set[str] = set()
    external_user_emails: set[str] = set()
    external_user_group_ids: set[str] = set()
    for i in range(number_of_acl_entries):
        user_emails.add(f"user_{i}@example.com")
        user_groups.add(f"group_{i}")
        external_user_emails.add(f"external_user_{i}@example.com")
        external_user_group_ids.add(f"external_group_{i}")

    return DocMetadataAwareIndexChunk.from_index_chunk(
        index_chunk=chunk,
        access=DocumentAccess(
            user_emails=user_emails,
            user_groups=user_groups,
            external_user_emails=external_user_emails,
            external_user_group_ids=external_user_group_ids,
            is_public=random.choice([True, False]),
        ),
        document_sets={document_set for document_set in document_set_names},
        boost=random.randint(-1, 1),
        aggregated_chunk_boost_factor=random.random(),
        tenant_id=POSTGRES_DEFAULT_SCHEMA,
    )
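

# Index one batch of chunks into Vespa and report how many documents were newly
# created vs. updated in place.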
@log_function_time()
def do_insertion(
    vespa_index: VespaIndex, all_chunks: list[DocMetadataAwareIndexChunk]
) -> None:
    insertion_records = vespa_index.index(
        chunks=all_chunks,
        index_batch_params=IndexBatchParams(
            doc_id_to_previous_chunk_cnt={},
            doc_id_to_new_chunk_cnt={},
            tenant_id=POSTGRES_DEFAULT_SCHEMA,
            large_chunks_enabled=False,
        ),
    )
    print(f"Indexed {len(insertion_records)} documents.")
    print(
        f"New documents: {sum(1 for record in insertion_records if not record.already_existed)}"
    )
    print(
        f"Existing documents updated: {sum(1 for record in insertion_records if record.already_existed)}"
    )
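

# Generate `num_docs` dummy documents with `chunks_per_doc` chunks each and index
# them in batches of `batch_size` documents (chunks_per_doc * batch_size chunks
# per insertion).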
@log_function_time()
def seed_dummy_docs(
    number_of_document_sets: int,
    number_of_acl_entries: int,
    num_docs: int = 1000,
    chunks_per_doc: int = 5,
    batch_size: int = 100,
) -> None:
    with get_session_context_manager() as db_session:
        search_settings = get_current_search_settings(db_session)
        multipass_config = get_multipass_config(search_settings)
        index_name = search_settings.index_name
        embedding_dim = search_settings.final_embedding_dim
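
    # Target the live index from the current search settings; large-chunk handling
    # follows the multipass configuration.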
    vespa_index = VespaIndex(
        index_name=index_name,
        secondary_index_name=None,
        large_chunks_enabled=multipass_config.enable_large_chunks,
        secondary_large_chunks_enabled=None,
    )
    print(index_name)

    all_chunks = []
    chunk_count = 0
    for doc_num in range(num_docs):
        doc_id = f"dummy_doc_{doc_num}_{datetime.now().isoformat()}"
        for chunk_num in range(chunks_per_doc):
            chunk = generate_dummy_chunk(
                doc_id=doc_id,
                chunk_id=chunk_num,
                embedding_dim=embedding_dim,
                number_of_acl_entries=number_of_acl_entries,
                number_of_document_sets=number_of_document_sets,
            )
            all_chunks.append(chunk)
            chunk_count += 1

        if len(all_chunks) >= chunks_per_doc * batch_size:
            do_insertion(vespa_index, all_chunks)
            print(
                f"Indexed {chunk_count} chunks out of {num_docs * chunks_per_doc}."
            )
            print(
                f"percentage: {chunk_count / (num_docs * chunks_per_doc) * 100:.2f}% \n"
            )
            all_chunks = []

    if all_chunks:
        do_insertion(vespa_index, all_chunks)
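

# Default run: 100,000 documents x 5 chunks = 500,000 chunks, flushed to Vespa
# 5,000 chunks (1,000 documents) at a time.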
if __name__ == "__main__":
    seed_dummy_docs(
        number_of_document_sets=TOTAL_DOC_SETS,
        number_of_acl_entries=TOTAL_ACL_ENTRIES_PER_CATEGORY,
        num_docs=100000,
        chunks_per_doc=5,
        batch_size=1000,
    )