From c91a5d8b1fd36827f1b6e45ffc7ef9d36780a280 Mon Sep 17 00:00:00 2001 From: Doug Danat Date: Mon, 25 Mar 2024 11:26:18 +0100 Subject: [PATCH] switch to using BeautifulSoup HTML loader so title is also captured --- backend/apps/rag/main.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/backend/apps/rag/main.py b/backend/apps/rag/main.py index 163f1b0fa..1e50ef20b 100644 --- a/backend/apps/rag/main.py +++ b/backend/apps/rag/main.py @@ -21,7 +21,7 @@ from langchain_community.document_loaders import ( TextLoader, PyPDFLoader, CSVLoader, - UnstructuredHTMLLoader, + BSHTMLLoader, Docx2txtLoader, UnstructuredEPubLoader, UnstructuredWordDocumentLoader, @@ -404,7 +404,7 @@ def get_loader(filename: str, file_content_type: str, file_path: str): elif file_ext == "xml": loader = UnstructuredXMLLoader(file_path) elif file_ext in ["htm", "html"]: - loader = UnstructuredHTMLLoader(file_path) + loader = BSHTMLLoader(file_path) elif file_ext == "md": loader = UnstructuredMarkdownLoader(file_path) elif file_content_type == "application/epub+zip":