Rework tokenizer (#1957)

hagen-danswer
2024-07-29 23:01:49 -07:00
committed by GitHub
parent 7932e764d6
commit 3938a053aa
23 changed files with 318 additions and 188 deletions

@@ -89,8 +89,18 @@ class CloudEmbedding:
         # OpenAI does not seem to provide a truncation option, however
         # the context lengths used by Danswer currently are smaller than the max token length
         # for OpenAI embeddings so it's not a big deal
-        response = self.client.embeddings.create(input=texts, model=model)
-        return [embedding.embedding for embedding in response.data]
+        try:
+            response = self.client.embeddings.create(input=texts, model=model)
+            return [embedding.embedding for embedding in response.data]
+        except Exception as e:
+            error_string = (
+                f"Error embedding text with OpenAI: {str(e)} \n"
+                f"Model: {model} \n"
+                f"Provider: {self.provider} \n"
+                f"Texts: {texts}"
+            )
+            logger.error(error_string)
+            raise RuntimeError(error_string)
 
     def _embed_cohere(
         self, texts: list[str], model: str | None, embedding_type: str
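
The change above wraps the OpenAI embedding call so that a failure is logged together with the model, provider, and input texts before being re-raised as a RuntimeError. Below is a minimal, self-contained sketch of that same pattern; the class name, the `_embed_openai` method name, and the stubbed client are assumptions for illustration, since the hunk only shows the body of the call.

```python
import logging

logger = logging.getLogger(__name__)


class FakeEmbeddingsClient:
    """Stand-in for the OpenAI SDK's embeddings endpoint; only here to make the sketch runnable."""

    def create(self, input: list[str], model: str):  # "input" mirrors the OpenAI SDK kwarg
        raise ConnectionError("simulated provider outage")


class CloudEmbeddingSketch:
    """Hypothetical class illustrating the try/except wrapping added in this commit."""

    def __init__(self, provider: str = "openai") -> None:
        self.provider = provider
        # Minimal object exposing .embeddings.create(...), standing in for the real client.
        self.client = type("Client", (), {"embeddings": FakeEmbeddingsClient()})()

    def _embed_openai(self, texts: list[str], model: str | None) -> list[list[float]]:
        try:
            response = self.client.embeddings.create(input=texts, model=model)
            return [embedding.embedding for embedding in response.data]
        except Exception as e:
            # Log enough context (error, model, provider, inputs) to debug the failure,
            # then surface a single exception type to callers.
            error_string = (
                f"Error embedding text with OpenAI: {str(e)} \n"
                f"Model: {model} \n"
                f"Provider: {self.provider} \n"
                f"Texts: {texts}"
            )
            logger.error(error_string)
            raise RuntimeError(error_string)


if __name__ == "__main__":
    try:
        CloudEmbeddingSketch()._embed_openai(["hello world"], model="text-embedding-3-small")
    except RuntimeError as err:
        print(f"caught: {err}")
```

Re-raising as RuntimeError gives callers one exception type to handle regardless of which provider SDK failed; a `raise RuntimeError(error_string) from e` variant would additionally preserve the original traceback chain.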