From 148c2a7375165eaaa90af2f9368545b439ad5e2c Mon Sep 17 00:00:00 2001
From: Yuhong Sun
Date: Sun, 8 Sep 2024 12:34:09 -0700
Subject: [PATCH] Remove wordnet (#2365)

---
 backend/Dockerfile                            |  2 +-
 .../danswer/search/retrieval/search_runner.py | 22 +++++++++----------
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/backend/Dockerfile b/backend/Dockerfile
index 17e0be8c2..fc7bcc586 100644
--- a/backend/Dockerfile
+++ b/backend/Dockerfile
@@ -75,8 +75,8 @@ Tokenizer.from_pretrained('nomic-ai/nomic-embed-text-v1')"
 # Pre-downloading NLTK for setups with limited egress
 RUN python -c "import nltk; \
 nltk.download('stopwords', quiet=True); \
-nltk.download('wordnet', quiet=True); \
 nltk.download('punkt', quiet=True);"
+# nltk.download('wordnet', quiet=True); introduce this back if lemmatization is needed
 
 # Set up application files
 WORKDIR /app
diff --git a/backend/danswer/search/retrieval/search_runner.py b/backend/danswer/search/retrieval/search_runner.py
index 31582f908..30347464f 100644
--- a/backend/danswer/search/retrieval/search_runner.py
+++ b/backend/danswer/search/retrieval/search_runner.py
@@ -3,7 +3,6 @@ from collections.abc import Callable
 
 import nltk  # type:ignore
 from nltk.corpus import stopwords  # type:ignore
-from nltk.stem import WordNetLemmatizer  # type:ignore
 from nltk.tokenize import word_tokenize  # type:ignore
 from sqlalchemy.orm import Session
 
@@ -40,7 +39,7 @@ logger = setup_logger()
 def download_nltk_data() -> None:
     resources = {
         "stopwords": "corpora/stopwords",
-        "wordnet": "corpora/wordnet",
+        # "wordnet": "corpora/wordnet",  # Not in use
         "punkt": "tokenizers/punkt",
     }
 
@@ -58,15 +57,16 @@ def download_nltk_data() -> None:
 
 
 def lemmatize_text(keywords: list[str]) -> list[str]:
-    try:
-        query = " ".join(keywords)
-        lemmatizer = WordNetLemmatizer()
-        word_tokens = word_tokenize(query)
-        lemmatized_words = [lemmatizer.lemmatize(word) for word in word_tokens]
-        combined_keywords = list(set(keywords + lemmatized_words))
-        return combined_keywords
-    except Exception:
-        return keywords
+    raise NotImplementedError("Lemmatization should not be used currently")
+    # try:
+    #     query = " ".join(keywords)
+    #     lemmatizer = WordNetLemmatizer()
+    #     word_tokens = word_tokenize(query)
+    #     lemmatized_words = [lemmatizer.lemmatize(word) for word in word_tokens]
+    #     combined_keywords = list(set(keywords + lemmatized_words))
+    #     return combined_keywords
+    # except Exception:
+    #     return keywords
 
 
 def remove_stop_words_and_punctuation(keywords: list[str]) -> list[str]:
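
For reference, the behavior stubbed out above reduces to the short standalone sketch below. It is not part of the patch: lemmatize_keywords is a hypothetical name chosen to avoid clashing with the now-stubbed lemmatize_text, and it assumes the NLTK wordnet and punkt data are available, e.g. by restoring the commented-out nltk.download('wordnet', ...) line in the Dockerfile.

    # Minimal sketch of the disabled lemmatization step; not part of this patch.
    # Assumes the NLTK 'wordnet' and 'punkt' data can be downloaded or are
    # already present (see the commented-out Dockerfile line above).
    import nltk  # type:ignore
    from nltk.stem import WordNetLemmatizer  # type:ignore
    from nltk.tokenize import word_tokenize  # type:ignore

    nltk.download("wordnet", quiet=True)
    nltk.download("punkt", quiet=True)


    def lemmatize_keywords(keywords: list[str]) -> list[str]:
        # Tokenize the joined keywords, lemmatize each token, and return the
        # union of the original keywords and their lemmatized forms.
        lemmatizer = WordNetLemmatizer()
        word_tokens = word_tokenize(" ".join(keywords))
        lemmatized = [lemmatizer.lemmatize(token) for token in word_tokens]
        return list(set(keywords + lemmatized))


    print(lemmatize_keywords(["dogs", "running"]))  # e.g. ['dogs', 'dog', 'running']

Returning the union of original and lemmatized forms preserves exact keyword matches while adding lemmatized variants, which is presumably why the function could be stubbed with a raise rather than removed: callers can swap the real implementation back in without interface changes.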