Add HTML support to the file text extractor (#1611)

2025-10-09 20:55:06 +02:00 · 2024-06-10 15:46:05 -04:00
parent e8306b0fa5
commit 180b592afe
1 changed file with 4 additions and 0 deletions
--- a/backend/danswer/file_processing/extract_file_text.py
+++ b/backend/danswer/file_processing/extract_file_text.py
@@ -47,6 +47,7 @@ VALID_FILE_EXTENSIONS = PLAIN_TEXT_FILE_EXTENSIONS + [
    ".xlsx",
    ".eml",
    ".epub",
    ".html",
 ]
@@ -287,5 +288,8 @@ def extract_file_text(
    elif extension == ".epub":
        return epub_to_text(file)
    elif extension == ".html":
        return parse_html_page_basic(file)
    else:
        return file_io_to_text(file)