Handle Empty Titles (#1891)

This commit is contained in:
Yuhong Sun
2024-07-21 14:59:23 -07:00
committed by GitHub
parent 581ffde35a
commit 5cfed45cef
3 changed files with 22 additions and 9 deletions

View File

@@ -100,15 +100,16 @@ class DefaultIndexingEmbedder(IndexingEmbedder):
# Drop any None or empty strings
chunk_titles_list = [title for title in chunk_titles if title]
-title_embeddings = self.embedding_model.encode(
-chunk_titles_list, text_type=EmbedTextType.PASSAGE
-)
-title_embed_dict.update(
-{
-title: vector
-for title, vector in zip(chunk_titles_list, title_embeddings)
-}
-)
+if chunk_titles_list:
+title_embeddings = self.embedding_model.encode(
+chunk_titles_list, text_type=EmbedTextType.PASSAGE
+)
+title_embed_dict.update(
+{
+title: vector
+for title, vector in zip(chunk_titles_list, title_embeddings)
+}
+)
# Mapping embeddings to chunks
embedding_ind_start = 0

View File

@@ -112,6 +112,10 @@ class EmbeddingModel:
text_type: EmbedTextType,
batch_size: int = BATCH_SIZE_ENCODE_CHUNKS,
) -> list[list[float]]:
if not texts:
logger.warning("No texts to be embedded")
return []
if self.provider_type:
embed_request = EmbedRequest(
model_name=self.model_name,

View File

@@ -284,6 +284,9 @@ def calc_sim_scores(query: str, docs: list[str]) -> list[list[float]]:
async def process_embed_request(
embed_request: EmbedRequest,
) -> EmbedResponse:
if not embed_request.texts:
raise HTTPException(status_code=400, detail="No texts to be embedded")
try:
if embed_request.text_type == EmbedTextType.QUERY:
prefix = embed_request.manual_query_prefix
@@ -315,6 +318,11 @@ async def process_rerank_request(embed_request: RerankRequest) -> RerankResponse
if INDEXING_ONLY:
raise RuntimeError("Indexing model server should not call intent endpoint")
if not embed_request.documents or not embed_request.query:
raise HTTPException(
status_code=400, detail="No documents or query to be reranked"
)
try:
sim_scores = calc_sim_scores(
query=embed_request.query, docs=embed_request.documents