Mirror of https://github.com/danswer-ai/danswer.git, synced 2025-06-22 22:11:03 +02:00
Remove wordnet (#2365)

parent 1555ac9dab
commit 148c2a7375
@@ -75,8 +75,8 @@ Tokenizer.from_pretrained('nomic-ai/nomic-embed-text-v1')"

 # Pre-downloading NLTK for setups with limited egress
 RUN python -c "import nltk; \
 nltk.download('stopwords', quiet=True); \
-nltk.download('wordnet', quiet=True); \
 nltk.download('punkt', quiet=True);"
+# nltk.download('wordnet', quiet=True); introduce this back if lemmatization is needed

 # Set up application files
 WORKDIR /app
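Since images built for limited-egress environments cannot fetch NLTK data at runtime, it can be worth verifying that the pre-downloaded resources actually landed on disk. A minimal sketch of such a check (a hypothetical helper, not part of this commit) using nltk.data.find, which raises LookupError for missing resources:

import nltk

# Paths mirror the resources the Dockerfile pre-downloads.
for path in ("corpora/stopwords", "tokenizers/punkt"):
    try:
        nltk.data.find(path)  # raises LookupError if the resource is absent
        print(f"{path}: OK")
    except LookupError:
        print(f"{path}: MISSING")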
@@ -3,7 +3,6 @@ from collections.abc import Callable

 import nltk  # type:ignore
 from nltk.corpus import stopwords  # type:ignore
-from nltk.stem import WordNetLemmatizer  # type:ignore
 from nltk.tokenize import word_tokenize  # type:ignore
 from sqlalchemy.orm import Session

@@ -40,7 +39,7 @@ logger = setup_logger()

 def download_nltk_data() -> None:
     resources = {
         "stopwords": "corpora/stopwords",
-        "wordnet": "corpora/wordnet",
+        # "wordnet": "corpora/wordnet",  # Not in use
         "punkt": "tokenizers/punkt",
     }

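The body of download_nltk_data is not shown in this hunk; a plausible check-then-download loop over the resources mapping, written here as an assumption rather than the repository's actual code, could look like:

import nltk

def download_nltk_data() -> None:
    resources = {
        "stopwords": "corpora/stopwords",
        # "wordnet": "corpora/wordnet",  # Not in use
        "punkt": "tokenizers/punkt",
    }
    for name, path in resources.items():
        try:
            # Skip the download when the resource already exists locally,
            # e.g. because the Docker image pre-downloaded it.
            nltk.data.find(path)
        except LookupError:
            nltk.download(name, quiet=True)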
@@ -58,15 +57,16 @@ def download_nltk_data() -> None:


 def lemmatize_text(keywords: list[str]) -> list[str]:
-    try:
-        query = " ".join(keywords)
-        lemmatizer = WordNetLemmatizer()
-        word_tokens = word_tokenize(query)
-        lemmatized_words = [lemmatizer.lemmatize(word) for word in word_tokens]
-        combined_keywords = list(set(keywords + lemmatized_words))
-        return combined_keywords
-    except Exception:
-        return keywords
+    raise NotImplementedError("Lemmatization should not be used currently")
+    # try:
+    #     query = " ".join(keywords)
+    #     lemmatizer = WordNetLemmatizer()
+    #     word_tokens = word_tokenize(query)
+    #     lemmatized_words = [lemmatizer.lemmatize(word) for word in word_tokens]
+    #     combined_keywords = list(set(keywords + lemmatized_words))
+    #     return combined_keywords
+    # except Exception:
+    #     return keywords


 def remove_stop_words_and_punctuation(keywords: list[str]) -> list[str]:
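For reference, re-enabling the commented-out path would mean restoring the WordNetLemmatizer import and the wordnet download removed above. A standalone sketch of the disabled behavior (the example keywords are illustrative, not from the repo):

import nltk
from nltk.stem import WordNetLemmatizer  # re-add this import if re-enabled
from nltk.tokenize import word_tokenize

# wordnet (and punkt, for tokenization) must be downloaded again for this to run.
nltk.download("wordnet", quiet=True)
nltk.download("punkt", quiet=True)

lemmatizer = WordNetLemmatizer()
keywords = ["databases", "indexing", "queries"]
tokens = word_tokenize(" ".join(keywords))
lemmatized = [lemmatizer.lemmatize(tok) for tok in tokens]
# Union of originals and lemmas, as the old combined_keywords logic did.
print(sorted(set(keywords + lemmatized)))
# e.g. ['database', 'databases', 'indexing', 'queries', 'query']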