Refactor/default indexing embedder (#2073)

* refactor embedding model instantiation

* remove unused UNCERTAINTY_PAT constant

* typo fixes

* fix mypy typing issues

* more typing fixes

* log attempt.id on dispatch

* unnecessary check removed after fixing type
This commit is contained in:
rkuo-danswer 2024-08-13 14:01:34 -07:00 committed by GitHub
parent f15d6d2b59
commit 20369fc451
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 23 additions and 25 deletions

View File

@@ -121,13 +121,8 @@ def _run_indexing(
primary_index_name=index_name, secondary_index_name=None
)
embedding_model = DefaultIndexingEmbedder(
model_name=db_embedding_model.model_name,
normalize=db_embedding_model.normalize,
query_prefix=db_embedding_model.query_prefix,
passage_prefix=db_embedding_model.passage_prefix,
api_key=db_embedding_model.api_key,
provider_type=db_embedding_model.provider_type,
embedding_model = DefaultIndexingEmbedder.from_db_embedding_model(
db_embedding_model
)
indexing_pipeline = build_indexing_pipeline(

View File

@@ -355,6 +355,7 @@ def kickoff_indexing_jobs(
secondary_str = " (secondary index)" if use_secondary_index else ""
logger.info(
f"Indexing dispatched{secondary_str}: "
f"attempt_id={attempt.id} "
f"connector='{attempt.connector_credential_pair.connector.name}' "
f"config='{attempt.connector_credential_pair.connector.connector_specific_config}' "
f"credentials='{attempt.connector_credential_pair.credential_id}'"

View File

@@ -169,7 +169,7 @@ class BlobStorageConnector(LoadConnector, PollConnector):
end: datetime,
) -> GenerateDocumentsOutput:
if self.s3_client is None:
raise ConnectorMissingCredentialError("Blog storage")
raise ConnectorMissingCredentialError("Blob storage")
paginator = self.s3_client.get_paginator("list_objects_v2")
pages = paginator.paginate(Bucket=self.bucket_name, Prefix=self.prefix)
@@ -230,7 +230,7 @@ class BlobStorageConnector(LoadConnector, PollConnector):
self, start: SecondsSinceUnixEpoch, end: SecondsSinceUnixEpoch
) -> GenerateDocumentsOutput:
if self.s3_client is None:
raise ConnectorMissingCredentialError("Blog storage")
raise ConnectorMissingCredentialError("Blob storage")
start_datetime = datetime.fromtimestamp(start, tz=timezone.utc)
end_datetime = datetime.fromtimestamp(end, tz=timezone.utc)

View File

@@ -163,6 +163,19 @@ class DefaultIndexingEmbedder(IndexingEmbedder):
return embedded_chunks
@classmethod
def from_db_embedding_model(
cls, embedding_model: DbEmbeddingModel
) -> "DefaultIndexingEmbedder":
return cls(
model_name=embedding_model.model_name,
normalize=embedding_model.normalize,
query_prefix=embedding_model.query_prefix,
passage_prefix=embedding_model.passage_prefix,
provider_type=embedding_model.provider_type,
api_key=embedding_model.api_key,
)
def get_embedding_model_from_db_embedding_model(
db_session: Session, index_model_status: IndexModelStatus = IndexModelStatus.PRESENT

View File

@@ -267,7 +267,7 @@ def build_indexing_pipeline(
chunker: Chunker | None = None,
ignore_time_skip: bool = False,
) -> IndexingPipelineProtocol:
"""Builds a pipline which takes in a list (batch) of docs and indexes them."""
"""Builds a pipeline which takes in a list (batch) of docs and indexes them."""
search_settings = get_search_settings()
multipass = (
search_settings.multipass_indexing

View File

@@ -7,7 +7,6 @@ THOUGHT_PAT = "Thought:"
ANSWER_PAT = "Answer:"
ANSWERABLE_PAT = "Answerable:"
FINAL_ANSWER_PAT = "Final Answer:"
UNCERTAINTY_PAT = "?"
QUOTE_PAT = "Quote:"
QUOTES_PAT_PLURAL = "Quotes:"
INVALID_PAT = "Invalid:"

View File

@@ -92,13 +92,8 @@ def upsert_ingestion_doc(
db_embedding_model = get_current_db_embedding_model(db_session)
index_embedding_model = DefaultIndexingEmbedder(
model_name=db_embedding_model.model_name,
normalize=db_embedding_model.normalize,
query_prefix=db_embedding_model.query_prefix,
passage_prefix=db_embedding_model.passage_prefix,
api_key=db_embedding_model.api_key,
provider_type=db_embedding_model.provider_type,
index_embedding_model = DefaultIndexingEmbedder.from_db_embedding_model(
db_embedding_model
)
indexing_pipeline = build_indexing_pipeline(
@@ -130,13 +125,8 @@ def upsert_ingestion_doc(
"Secondary index exists but no embedding model configured"
)
new_index_embedding_model = DefaultIndexingEmbedder(
model_name=sec_db_embedding_model.model_name,
normalize=sec_db_embedding_model.normalize,
query_prefix=sec_db_embedding_model.query_prefix,
passage_prefix=sec_db_embedding_model.passage_prefix,
api_key=sec_db_embedding_model.api_key,
provider_type=sec_db_embedding_model.provider_type,
new_index_embedding_model = DefaultIndexingEmbedder.from_db_embedding_model(
sec_db_embedding_model
)
sec_ind_pipeline = build_indexing_pipeline(