Added HTML support to file text extractor (#1611)

This commit is contained in:
hagen-danswer 2024-06-10 15:46:05 -04:00 committed by GitHub
parent e8306b0fa5
commit 180b592afe
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@@ -47,6 +47,7 @@ VALID_FILE_EXTENSIONS = PLAIN_TEXT_FILE_EXTENSIONS + [
".xlsx",
".eml",
".epub",
".html",
]
@@ -287,5 +288,8 @@ def extract_file_text(
elif extension == ".epub":
return epub_to_text(file)
elif extension == ".html":
return parse_html_page_basic(file)
else:
return file_io_to_text(file)