diff --git a/backend/danswer/search/preprocessing/danswer_helper.py b/backend/danswer/search/preprocessing/danswer_helper.py
index 88e465dacb53..1802f9eac983 100644
--- a/backend/danswer/search/preprocessing/danswer_helper.py
+++ b/backend/danswer/search/preprocessing/danswer_helper.py
@@ -15,7 +15,7 @@ if TYPE_CHECKING:
 
 
 def count_unk_tokens(text: str, tokenizer: "AutoTokenizer") -> int:
-    """Unclear if the wordpiece tokenizer used is actually tokenizing anything as the [UNK] token
+    """Unclear if the wordpiece/sentencepiece tokenizer used is actually tokenizing anything as the [UNK] token
     It splits up even foreign characters and unicode emojis without using UNK"""
     tokenized_text = tokenizer.tokenize(text)
     num_unk_tokens = len(
@@ -73,6 +73,7 @@ def recommend_search_flow(
     non_stopword_percent = len(non_stopwords) / len(words)
 
     # UNK tokens -> suggest Keyword (still may be valid QA)
+    # TODO do a better job with the classifier model and retire the heuristics
     if count_unk_tokens(query, get_default_tokenizer(model_name=model_name)) > 0:
         if not keyword:
             heuristic_search_type = SearchType.KEYWORD
diff --git a/backend/danswer/search/search_nlp_models.py b/backend/danswer/search/search_nlp_models.py
index a88e82f2d359..c1a9f91c73da 100644
--- a/backend/danswer/search/search_nlp_models.py
+++ b/backend/danswer/search/search_nlp_models.py
@@ -40,25 +40,22 @@ def clean_model_name(model_str: str) -> str:
     return model_str.replace("/", "_").replace("-", "_").replace(".", "_")
 
 
-# NOTE: If None is used, it may not be using the "correct" tokenizer, for cases
-# where this is more important, be sure to refresh with the actual model name
-def get_default_tokenizer(model_name: str | None = None) -> "AutoTokenizer":
+# NOTE: If no model_name is specified, it may not be using the "correct" tokenizer
+# for cases where this is more important, be sure to refresh with the actual model name
+# One case where it is not particularly important is in the document chunking flow,
+# they're basically all using the sentencepiece tokenizer and whether it's cased or
+# uncased does not really matter, they'll all generally end up with the same chunk lengths.
+def get_default_tokenizer(model_name: str = DOCUMENT_ENCODER_MODEL) -> "AutoTokenizer":
     # NOTE: doing a local import here to avoid reduce memory usage caused by
     # processes importing this file despite not using any of this
     from transformers import AutoTokenizer  # type: ignore
 
     global _TOKENIZER
-    if _TOKENIZER[0] is None or (
-        _TOKENIZER[1] is not None and _TOKENIZER[1] != model_name
-    ):
+    if _TOKENIZER[0] is None or _TOKENIZER[1] != model_name:
         if _TOKENIZER[0] is not None:
             del _TOKENIZER
             gc.collect()
 
-        if model_name is None:
-            # This could be inaccurate
-            model_name = DOCUMENT_ENCODER_MODEL
-
         _TOKENIZER = (AutoTokenizer.from_pretrained(model_name), model_name)
 
         if hasattr(_TOKENIZER[0], "is_fast") and _TOKENIZER[0].is_fast:
@@ -184,6 +181,7 @@ def warm_up_encoders(
            "https://docs.danswer.dev/quickstart"
        )
 
+    # May not be the exact same tokenizer used for the indexing flow
     get_default_tokenizer(model_name=model_name)(warm_up_str)
 
     embed_model = EmbeddingModel(
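
Note on the tokenizer-cache change above: with the new signature, callers that omit model_name fall back to DOCUMENT_ENCODER_MODEL through the default argument rather than a None check inside the function body, and the cache-refresh condition reduces to a plain model-name comparison. Below is a minimal, standalone sketch of that caching pattern under stated assumptions; it is not the repo module itself, the DOCUMENT_ENCODER_MODEL value is an illustrative stand-in, and the Any annotations are placeholders.

import gc
from typing import Any

DOCUMENT_ENCODER_MODEL = "sentence-transformers/all-MiniLM-L6-v2"  # illustrative stand-in, not the repo's constant

# (tokenizer, model_name) cached at module level so repeated calls reuse one instance
_TOKENIZER: tuple[Any, str | None] = (None, None)


def get_default_tokenizer(model_name: str = DOCUMENT_ENCODER_MODEL) -> Any:
    # Local import so processes that never tokenize don't pay the transformers import cost
    from transformers import AutoTokenizer  # type: ignore

    global _TOKENIZER
    # Reload only when nothing is cached yet or a different model is requested
    if _TOKENIZER[0] is None or _TOKENIZER[1] != model_name:
        if _TOKENIZER[0] is not None:
            # Drop the previously cached tokenizer before loading the new one
            del _TOKENIZER
            gc.collect()

        _TOKENIZER = (AutoTokenizer.from_pretrained(model_name), model_name)

    return _TOKENIZER[0]


# Usage: the second call hits the cache; a different model name triggers a reload.
tok = get_default_tokenizer()
assert get_default_tokenizer() is tok

The trade-off, called out in both the NOTE and the new warm-up comment, is that the cached tokenizer may not match the one the indexing flow actually uses unless callers pass the real model name.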