Remove wordnet (#2365)

Yuhong Sun 2024-09-08 12:34:09 -07:00 committed by GitHub
parent 1555ac9dab
commit 148c2a7375
2 changed files with 12 additions and 12 deletions


@@ -75,8 +75,8 @@ Tokenizer.from_pretrained('nomic-ai/nomic-embed-text-v1')"
 # Pre-downloading NLTK for setups with limited egress
 RUN python -c "import nltk; \
     nltk.download('stopwords', quiet=True); \
-    nltk.download('wordnet', quiet=True); \
     nltk.download('punkt', quiet=True);"
+# nltk.download('wordnet', quiet=True); introduce this back if lemmatization is needed

 # Set up application files
 WORKDIR /app
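
For readability, the inline python -c snippet in the RUN step above expands to the following. The commented line is exactly what the trailing diff comment would restore if lemmatization comes back:

# Expanded equivalent of the Dockerfile RUN step above.
import nltk

nltk.download("stopwords", quiet=True)
# nltk.download("wordnet", quiet=True)  # re-enable if lemmatization is needed
nltk.download("punkt", quiet=True)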


@@ -3,7 +3,6 @@ from collections.abc import Callable

 import nltk  # type:ignore
 from nltk.corpus import stopwords  # type:ignore
-from nltk.stem import WordNetLemmatizer  # type:ignore
 from nltk.tokenize import word_tokenize  # type:ignore
 from sqlalchemy.orm import Session

@@ -40,7 +39,7 @@ logger = setup_logger()

 def download_nltk_data() -> None:
     resources = {
         "stopwords": "corpora/stopwords",
-        "wordnet": "corpora/wordnet",
+        # "wordnet": "corpora/wordnet",  # Not in use
         "punkt": "tokenizers/punkt",
     }
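
The rest of download_nltk_data is elided from this hunk. As a hypothetical sketch (an assumption, not the commit's verbatim code), a guarded loop over such a dict typically checks whether each resource is already present before fetching it:

# Hypothetical sketch -- the loop body is not shown in the hunk above.
import nltk

def download_nltk_data() -> None:
    resources = {
        "stopwords": "corpora/stopwords",
        # "wordnet": "corpora/wordnet",  # Not in use
        "punkt": "tokenizers/punkt",
    }
    for resource_name, resource_path in resources.items():
        try:
            # nltk.data.find raises LookupError when a resource is absent
            nltk.data.find(resource_path)
        except LookupError:
            nltk.download(resource_name, quiet=True)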
@@ -58,15 +57,16 @@ def download_nltk_data() -> None:


 def lemmatize_text(keywords: list[str]) -> list[str]:
-    try:
-        query = " ".join(keywords)
-        lemmatizer = WordNetLemmatizer()
-        word_tokens = word_tokenize(query)
-        lemmatized_words = [lemmatizer.lemmatize(word) for word in word_tokens]
-        combined_keywords = list(set(keywords + lemmatized_words))
-        return combined_keywords
-    except Exception:
-        return keywords
+    raise NotImplementedError("Lemmatization should not be used currently")
+    # try:
+    #     query = " ".join(keywords)
+    #     lemmatizer = WordNetLemmatizer()
+    #     word_tokens = word_tokenize(query)
+    #     lemmatized_words = [lemmatizer.lemmatize(word) for word in word_tokens]
+    #     combined_keywords = list(set(keywords + lemmatized_words))
+    #     return combined_keywords
+    # except Exception:
+    #     return keywords


 def remove_stop_words_and_punctuation(keywords: list[str]) -> list[str]:
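
Should lemmatization ever be reintroduced, a self-contained version of the disabled path, mirroring the commented-out body above, would look like the sketch below. It assumes the wordnet entry is restored in both the Dockerfile and the resources dict:

# Sketch of the lemmatization path this commit disables; mirrors the
# commented-out body above. Requires the "wordnet" and "punkt" resources.
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

def lemmatize_text(keywords: list[str]) -> list[str]:
    try:
        query = " ".join(keywords)
        lemmatizer = WordNetLemmatizer()
        word_tokens = word_tokenize(query)
        lemmatized_words = [lemmatizer.lemmatize(word) for word in word_tokens]
        # Keep the original keywords as well so exact matches still work
        return list(set(keywords + lemmatized_words))
    except Exception:
        # Fall back to the unmodified keywords on any failure
        return keywords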