mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-04-09 12:30:49 +02:00
Refactor/default indexing embedder (#2073)
* refactor embedding model instantiation
* remove unused UNCERTAINTY_PAT constant
* typo fixes
* fix mypy typing issues
* more typing fixes
* log attempt.id on dispatch
* unnecessary check removed after fixing type
This commit is contained in:
parent
f15d6d2b59
commit
20369fc451
@ -121,13 +121,8 @@ def _run_indexing(
|
||||
primary_index_name=index_name, secondary_index_name=None
|
||||
)
|
||||
|
||||
embedding_model = DefaultIndexingEmbedder(
|
||||
model_name=db_embedding_model.model_name,
|
||||
normalize=db_embedding_model.normalize,
|
||||
query_prefix=db_embedding_model.query_prefix,
|
||||
passage_prefix=db_embedding_model.passage_prefix,
|
||||
api_key=db_embedding_model.api_key,
|
||||
provider_type=db_embedding_model.provider_type,
|
||||
embedding_model = DefaultIndexingEmbedder.from_db_embedding_model(
|
||||
db_embedding_model
|
||||
)
|
||||
|
||||
indexing_pipeline = build_indexing_pipeline(
|
||||
|
@ -355,6 +355,7 @@ def kickoff_indexing_jobs(
|
||||
secondary_str = " (secondary index)" if use_secondary_index else ""
|
||||
logger.info(
|
||||
f"Indexing dispatched{secondary_str}: "
|
||||
f"attempt_id={attempt.id} "
|
||||
f"connector='{attempt.connector_credential_pair.connector.name}' "
|
||||
f"config='{attempt.connector_credential_pair.connector.connector_specific_config}' "
|
||||
f"credentials='{attempt.connector_credential_pair.credential_id}'"
|
||||
|
@ -169,7 +169,7 @@ class BlobStorageConnector(LoadConnector, PollConnector):
|
||||
end: datetime,
|
||||
) -> GenerateDocumentsOutput:
|
||||
if self.s3_client is None:
|
||||
raise ConnectorMissingCredentialError("Blog storage")
|
||||
raise ConnectorMissingCredentialError("Blob storage")
|
||||
|
||||
paginator = self.s3_client.get_paginator("list_objects_v2")
|
||||
pages = paginator.paginate(Bucket=self.bucket_name, Prefix=self.prefix)
|
||||
@ -230,7 +230,7 @@ class BlobStorageConnector(LoadConnector, PollConnector):
|
||||
self, start: SecondsSinceUnixEpoch, end: SecondsSinceUnixEpoch
|
||||
) -> GenerateDocumentsOutput:
|
||||
if self.s3_client is None:
|
||||
raise ConnectorMissingCredentialError("Blog storage")
|
||||
raise ConnectorMissingCredentialError("Blob storage")
|
||||
|
||||
start_datetime = datetime.fromtimestamp(start, tz=timezone.utc)
|
||||
end_datetime = datetime.fromtimestamp(end, tz=timezone.utc)
|
||||
|
@ -163,6 +163,19 @@ class DefaultIndexingEmbedder(IndexingEmbedder):
|
||||
|
||||
return embedded_chunks
|
||||
|
||||
@classmethod
|
||||
def from_db_embedding_model(
|
||||
cls, embedding_model: DbEmbeddingModel
|
||||
) -> "DefaultIndexingEmbedder":
|
||||
return cls(
|
||||
model_name=embedding_model.model_name,
|
||||
normalize=embedding_model.normalize,
|
||||
query_prefix=embedding_model.query_prefix,
|
||||
passage_prefix=embedding_model.passage_prefix,
|
||||
provider_type=embedding_model.provider_type,
|
||||
api_key=embedding_model.api_key,
|
||||
)
|
||||
|
||||
|
||||
def get_embedding_model_from_db_embedding_model(
|
||||
db_session: Session, index_model_status: IndexModelStatus = IndexModelStatus.PRESENT
|
||||
|
@ -267,7 +267,7 @@ def build_indexing_pipeline(
|
||||
chunker: Chunker | None = None,
|
||||
ignore_time_skip: bool = False,
|
||||
) -> IndexingPipelineProtocol:
|
||||
"""Builds a pipline which takes in a list (batch) of docs and indexes them."""
|
||||
"""Builds a pipeline which takes in a list (batch) of docs and indexes them."""
|
||||
search_settings = get_search_settings()
|
||||
multipass = (
|
||||
search_settings.multipass_indexing
|
||||
|
@ -7,7 +7,6 @@ THOUGHT_PAT = "Thought:"
|
||||
ANSWER_PAT = "Answer:"
|
||||
ANSWERABLE_PAT = "Answerable:"
|
||||
FINAL_ANSWER_PAT = "Final Answer:"
|
||||
UNCERTAINTY_PAT = "?"
|
||||
QUOTE_PAT = "Quote:"
|
||||
QUOTES_PAT_PLURAL = "Quotes:"
|
||||
INVALID_PAT = "Invalid:"
|
||||
|
@ -92,13 +92,8 @@ def upsert_ingestion_doc(
|
||||
|
||||
db_embedding_model = get_current_db_embedding_model(db_session)
|
||||
|
||||
index_embedding_model = DefaultIndexingEmbedder(
|
||||
model_name=db_embedding_model.model_name,
|
||||
normalize=db_embedding_model.normalize,
|
||||
query_prefix=db_embedding_model.query_prefix,
|
||||
passage_prefix=db_embedding_model.passage_prefix,
|
||||
api_key=db_embedding_model.api_key,
|
||||
provider_type=db_embedding_model.provider_type,
|
||||
index_embedding_model = DefaultIndexingEmbedder.from_db_embedding_model(
|
||||
db_embedding_model
|
||||
)
|
||||
|
||||
indexing_pipeline = build_indexing_pipeline(
|
||||
@ -130,13 +125,8 @@ def upsert_ingestion_doc(
|
||||
"Secondary index exists but no embedding model configured"
|
||||
)
|
||||
|
||||
new_index_embedding_model = DefaultIndexingEmbedder(
|
||||
model_name=sec_db_embedding_model.model_name,
|
||||
normalize=sec_db_embedding_model.normalize,
|
||||
query_prefix=sec_db_embedding_model.query_prefix,
|
||||
passage_prefix=sec_db_embedding_model.passage_prefix,
|
||||
api_key=sec_db_embedding_model.api_key,
|
||||
provider_type=sec_db_embedding_model.provider_type,
|
||||
new_index_embedding_model = DefaultIndexingEmbedder.from_db_embedding_model(
|
||||
sec_db_embedding_model
|
||||
)
|
||||
|
||||
sec_ind_pipeline = build_indexing_pipeline(
|
||||
|
Loading…
x
Reference in New Issue
Block a user