From 784a6ec85e18b9b798fb7292acf9015beae7fada Mon Sep 17 00:00:00 2001
From: Doug Danat <douglass-lewis.danat@s-markt-mehrwert.de>
Date: Mon, 25 Mar 2024 09:50:53 +0100
Subject: [PATCH 1/5] include LangChain HTML loader for RAG

---
 backend/apps/rag/main.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/backend/apps/rag/main.py b/backend/apps/rag/main.py
index 48ca61666..163f1b0fa 100644
--- a/backend/apps/rag/main.py
+++ b/backend/apps/rag/main.py
@@ -21,6 +21,7 @@ from langchain_community.document_loaders import (
     TextLoader,
     PyPDFLoader,
     CSVLoader,
+    UnstructuredHTMLLoader,
     Docx2txtLoader,
     UnstructuredEPubLoader,
     UnstructuredWordDocumentLoader,
@@ -402,6 +403,8 @@ def get_loader(filename: str, file_content_type: str, file_path: str):
         loader = UnstructuredRSTLoader(file_path, mode="elements")
     elif file_ext == "xml":
         loader = UnstructuredXMLLoader(file_path)
+    elif file_ext in ["htm", "html"]:
+        loader = UnstructuredHTMLLoader(file_path)
     elif file_ext == "md":
         loader = UnstructuredMarkdownLoader(file_path)
     elif file_content_type == "application/epub+zip":

From 77f4ffddc1ce8cc57ce7227999fc87049c401605 Mon Sep 17 00:00:00 2001
From: Doug Danat <douglass-lewis.danat@s-markt-mehrwert.de>
Date: Mon, 25 Mar 2024 11:21:34 +0100
Subject: [PATCH 2/5] add htm/html to supported extensions in UI

---
 src/lib/constants.ts | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/lib/constants.ts b/src/lib/constants.ts
index bdd9c64e9..5adefe561 100644
--- a/src/lib/constants.ts
+++ b/src/lib/constants.ts
@@ -22,6 +22,7 @@ export const SUPPORTED_FILE_TYPE = [
 	'text/plain',
 	'text/csv',
 	'text/xml',
+	'text/html',
 	'text/x-python',
 	'text/css',
 	'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
@@ -50,6 +51,8 @@ export const SUPPORTED_FILE_EXTENSIONS = [
 	'h',
 	'c',
 	'cs',
+	'htm',
+	'html',
 	'sql',
 	'log',
 	'ini',

From c91a5d8b1fd36827f1b6e45ffc7ef9d36780a280 Mon Sep 17 00:00:00 2001
From: Doug Danat <douglass-lewis.danat@s-markt-mehrwert.de>
Date: Mon, 25 Mar 2024 11:26:18 +0100
Subject: [PATCH 3/5] switch to using BeautifulSoup HTML loader so title is
 also captured

---
 backend/apps/rag/main.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/backend/apps/rag/main.py b/backend/apps/rag/main.py
index 163f1b0fa..1e50ef20b 100644
--- a/backend/apps/rag/main.py
+++ b/backend/apps/rag/main.py
@@ -21,7 +21,7 @@ from langchain_community.document_loaders import (
     TextLoader,
     PyPDFLoader,
     CSVLoader,
-    UnstructuredHTMLLoader,
+    BSHTMLLoader,
     Docx2txtLoader,
     UnstructuredEPubLoader,
     UnstructuredWordDocumentLoader,
@@ -404,7 +404,7 @@ def get_loader(filename: str, file_content_type: str, file_path: str):
     elif file_ext == "xml":
         loader = UnstructuredXMLLoader(file_path)
     elif file_ext in ["htm", "html"]:
-        loader = UnstructuredHTMLLoader(file_path)
+        loader = BSHTMLLoader(file_path)
     elif file_ext == "md":
         loader = UnstructuredMarkdownLoader(file_path)
     elif file_content_type == "application/epub+zip":

From 6307adfba1048c01a9954723f8d16b02fe984470 Mon Sep 17 00:00:00 2001
From: "Timothy J. Baek" <timothyjrbeck@gmail.com>
Date: Mon, 25 Mar 2024 23:47:08 -0700
Subject: [PATCH 4/5] feat: better error handling

---
 backend/apps/rag/main.py | 93 +++++++++++++++++++++++-----------------
 backend/constants.py     |  2 +
 2 files changed, 55 insertions(+), 40 deletions(-)

diff --git a/backend/apps/rag/main.py b/backend/apps/rag/main.py
index 1e50ef20b..d87f7bc73 100644
--- a/backend/apps/rag/main.py
+++ b/backend/apps/rag/main.py
@@ -115,6 +115,7 @@ class CollectionNameForm(BaseModel):
 class StoreWebForm(CollectionNameForm):
     url: str
 
+
 @app.get("/")
 async def get_status():
     return {
@@ -297,13 +298,18 @@ def store_web(form_data: StoreWebForm, user=Depends(get_current_user)):
 
 
 def store_data_in_vector_db(data, collection_name, overwrite: bool = False) -> bool:
+    """Split `data` into chunks and store them; returns the bool from store_docs_in_vector_db."""
     text_splitter = RecursiveCharacterTextSplitter(
         chunk_size=app.state.CHUNK_SIZE,
         chunk_overlap=app.state.CHUNK_OVERLAP,
         add_start_index=True,
     )
     docs = text_splitter.split_documents(data)
-    return store_docs_in_vector_db(docs, collection_name, overwrite)
+    # No chunks means nothing to index: raise so callers can surface EMPTY_CONTENT.
+    if len(docs) > 0:
+        return store_docs_in_vector_db(docs, collection_name, overwrite)
+    else:
+        raise ValueError(ERROR_MESSAGES.EMPTY_CONTENT)
 
 
 def store_text_in_vector_db(
@@ -319,6 +325,7 @@ def store_text_in_vector_db(
 
 
 def store_docs_in_vector_db(docs, collection_name, overwrite: bool = False) -> bool:
+
     texts = [doc.page_content for doc in docs]
     metadatas = [doc.metadata for doc in docs]
 
@@ -455,19 +462,21 @@ def store_doc(
 
         loader, known_type = get_loader(file.filename, file.content_type, file_path)
         data = loader.load()
-        result = store_data_in_vector_db(data, collection_name)
 
-        if result:
-            return {
-                "status": True,
-                "collection_name": collection_name,
-                "filename": filename,
-                "known_type": known_type,
-            }
-        else:
+        try:
+            result = store_data_in_vector_db(data, collection_name)
+
+            if result:
+                return {
+                    "status": True,
+                    "collection_name": collection_name,
+                    "filename": filename,
+                    "known_type": known_type,
+                }
+        except Exception as e:
             raise HTTPException(
                 status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
-                detail=ERROR_MESSAGES.DEFAULT(),
+                detail=e,
             )
     except Exception as e:
         log.exception(e)
@@ -532,38 +541,42 @@ def scan_docs_dir(user=Depends(get_admin_user)):
                 )
                 data = loader.load()
 
-                result = store_data_in_vector_db(data, collection_name)
+                try:
+                    result = store_data_in_vector_db(data, collection_name)
 
-                if result:
-                    sanitized_filename = sanitize_filename(filename)
-                    doc = Documents.get_doc_by_name(sanitized_filename)
+                    if result:
+                        sanitized_filename = sanitize_filename(filename)
+                        doc = Documents.get_doc_by_name(sanitized_filename)
 
-                    if doc == None:
-                        doc = Documents.insert_new_doc(
-                            user.id,
-                            DocumentForm(
-                                **{
-                                    "name": sanitized_filename,
-                                    "title": filename,
-                                    "collection_name": collection_name,
-                                    "filename": filename,
-                                    "content": (
-                                        json.dumps(
-                                            {
-                                                "tags": list(
-                                                    map(
-                                                        lambda name: {"name": name},
-                                                        tags,
+                        if doc == None:
+                            doc = Documents.insert_new_doc(
+                                user.id,
+                                DocumentForm(
+                                    **{
+                                        "name": sanitized_filename,
+                                        "title": filename,
+                                        "collection_name": collection_name,
+                                        "filename": filename,
+                                        "content": (
+                                            json.dumps(
+                                                {
+                                                    "tags": list(
+                                                        map(
+                                                            lambda name: {"name": name},
+                                                            tags,
+                                                        )
                                                     )
-                                                )
-                                            }
-                                        )
-                                        if len(tags)
-                                        else "{}"
-                                    ),
-                                }
-                            ),
-                        )
+                                                }
+                                            )
+                                            if len(tags)
+                                            else "{}"
+                                        ),
+                                    }
+                                ),
+                            )
+                except Exception as e:
+                    print(e)
+                    pass
 
         except Exception as e:
             log.exception(e)
diff --git a/backend/constants.py b/backend/constants.py
index 42c5c85eb..8bcdd0789 100644
--- a/backend/constants.py
+++ b/backend/constants.py
@@ -60,3 +60,5 @@ class ERROR_MESSAGES(str, Enum):
     MODEL_NOT_FOUND = lambda name="": f"Model '{name}' was not found"
     OPENAI_NOT_FOUND = lambda name="": f"OpenAI API was not found"
     OLLAMA_NOT_FOUND = "WebUI could not connect to Ollama"
+
+    EMPTY_CONTENT = "The content provided is empty. Please ensure that there is text or data present before proceeding."

From 3688955c776c5c03afd94aa86636f1f8f80de738 Mon Sep 17 00:00:00 2001
From: "Timothy J. Baek" <timothyjrbeck@gmail.com>
Date: Mon, 25 Mar 2024 23:50:52 -0700
Subject: [PATCH 5/5] fix: encoding issue

---
 backend/apps/rag/main.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/backend/apps/rag/main.py b/backend/apps/rag/main.py
index d87f7bc73..da7bb307d 100644
--- a/backend/apps/rag/main.py
+++ b/backend/apps/rag/main.py
@@ -411,7 +411,7 @@ def get_loader(filename: str, file_content_type: str, file_path: str):
     elif file_ext == "xml":
         loader = UnstructuredXMLLoader(file_path)
     elif file_ext in ["htm", "html"]:
-        loader = BSHTMLLoader(file_path)
+        loader = BSHTMLLoader(file_path, open_encoding="unicode_escape")
     elif file_ext == "md":
         loader = UnstructuredMarkdownLoader(file_path)
     elif file_content_type == "application/epub+zip":