Added HTML support to file text extractor (#1611)

This commit is contained in:
hagen-danswer
2024-06-10 15:46:05 -04:00
committed by GitHub
parent e8306b0fa5
commit 180b592afe

View File

@@ -47,6 +47,7 @@ VALID_FILE_EXTENSIONS = PLAIN_TEXT_FILE_EXTENSIONS + [
".xlsx", ".xlsx",
".eml", ".eml",
".epub", ".epub",
".html",
] ]
@@ -287,5 +288,8 @@ def extract_file_text(
elif extension == ".epub": elif extension == ".epub":
return epub_to_text(file) return epub_to_text(file)
elif extension == ".html":
return parse_html_page_basic(file)
else: else:
return file_io_to_text(file) return file_io_to_text(file)