From 784a6ec85e18b9b798fb7292acf9015beae7fada Mon Sep 17 00:00:00 2001 From: Doug Danat Date: Mon, 25 Mar 2024 09:50:53 +0100 Subject: [PATCH 1/5] include html langchain loader for RAG --- backend/apps/rag/main.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/backend/apps/rag/main.py b/backend/apps/rag/main.py index 48ca61666..163f1b0fa 100644 --- a/backend/apps/rag/main.py +++ b/backend/apps/rag/main.py @@ -21,6 +21,7 @@ from langchain_community.document_loaders import ( TextLoader, PyPDFLoader, CSVLoader, + UnstructuredHTMLLoader, Docx2txtLoader, UnstructuredEPubLoader, UnstructuredWordDocumentLoader, @@ -402,6 +403,8 @@ def get_loader(filename: str, file_content_type: str, file_path: str): loader = UnstructuredRSTLoader(file_path, mode="elements") elif file_ext == "xml": loader = UnstructuredXMLLoader(file_path) + elif file_ext in ["htm", "html"]: + loader = UnstructuredHTMLLoader(file_path) elif file_ext == "md": loader = UnstructuredMarkdownLoader(file_path) elif file_content_type == "application/epub+zip": From 77f4ffddc1ce8cc57ce7227999fc87049c401605 Mon Sep 17 00:00:00 2001 From: Doug Danat Date: Mon, 25 Mar 2024 11:21:34 +0100 Subject: [PATCH 2/5] add htm/html to supported extensions in ui --- src/lib/constants.ts | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/lib/constants.ts b/src/lib/constants.ts index bdd9c64e9..5adefe561 100644 --- a/src/lib/constants.ts +++ b/src/lib/constants.ts @@ -22,6 +22,7 @@ export const SUPPORTED_FILE_TYPE = [ 'text/plain', 'text/csv', 'text/xml', + 'text/html', 'text/x-python', 'text/css', 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', @@ -50,6 +51,8 @@ export const SUPPORTED_FILE_EXTENSIONS = [ 'h', 'c', 'cs', + 'htm', + 'html', 'sql', 'log', 'ini', From c91a5d8b1fd36827f1b6e45ffc7ef9d36780a280 Mon Sep 17 00:00:00 2001 From: Doug Danat Date: Mon, 25 Mar 2024 11:26:18 +0100 Subject: [PATCH 3/5] switch to using BeautifulSoup HTML loader so title is also captured --- backend/apps/rag/main.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/backend/apps/rag/main.py b/backend/apps/rag/main.py index 163f1b0fa..1e50ef20b 100644 --- a/backend/apps/rag/main.py +++ b/backend/apps/rag/main.py @@ -21,7 +21,7 @@ from langchain_community.document_loaders import ( TextLoader, PyPDFLoader, CSVLoader, - UnstructuredHTMLLoader, + BSHTMLLoader, Docx2txtLoader, UnstructuredEPubLoader, UnstructuredWordDocumentLoader, @@ -404,7 +404,7 @@ def get_loader(filename: str, file_content_type: str, file_path: str): elif file_ext == "xml": loader = UnstructuredXMLLoader(file_path) elif file_ext in ["htm", "html"]: - loader = UnstructuredHTMLLoader(file_path) + loader = BSHTMLLoader(file_path) elif file_ext == "md": loader = UnstructuredMarkdownLoader(file_path) elif file_content_type == "application/epub+zip": From 6307adfba1048c01a9954723f8d16b02fe984470 Mon Sep 17 00:00:00 2001 From: "Timothy J. Baek" Date: Mon, 25 Mar 2024 23:47:08 -0700 Subject: [PATCH 4/5] feat: better error handling --- backend/apps/rag/main.py | 93 +++++++++++++++++++++++----------------- backend/constants.py | 2 + 2 files changed, 55 insertions(+), 40 deletions(-) diff --git a/backend/apps/rag/main.py b/backend/apps/rag/main.py index 1e50ef20b..d87f7bc73 100644 --- a/backend/apps/rag/main.py +++ b/backend/apps/rag/main.py @@ -115,6 +115,7 @@ class CollectionNameForm(BaseModel): class StoreWebForm(CollectionNameForm): url: str + @app.get("/") async def get_status(): return { @@ -297,13 +298,18 @@ def store_web(form_data: StoreWebForm, user=Depends(get_current_user)): def store_data_in_vector_db(data, collection_name, overwrite: bool = False) -> bool: + text_splitter = RecursiveCharacterTextSplitter( chunk_size=app.state.CHUNK_SIZE, chunk_overlap=app.state.CHUNK_OVERLAP, add_start_index=True, ) docs = text_splitter.split_documents(data) - return store_docs_in_vector_db(docs, collection_name, overwrite) + + if len(docs) > 0: + return store_docs_in_vector_db(docs, collection_name, overwrite), None + else: + raise ValueError(ERROR_MESSAGES.EMPTY_CONTENT) def store_text_in_vector_db( @@ -319,6 +325,7 @@ def store_text_in_vector_db( def store_docs_in_vector_db(docs, collection_name, overwrite: bool = False) -> bool: + texts = [doc.page_content for doc in docs] metadatas = [doc.metadata for doc in docs] @@ -455,19 +462,21 @@ def store_doc( loader, known_type = get_loader(file.filename, file.content_type, file_path) data = loader.load() - result = store_data_in_vector_db(data, collection_name) - if result: - return { - "status": True, - "collection_name": collection_name, - "filename": filename, - "known_type": known_type, - } - else: + try: + result = store_data_in_vector_db(data, collection_name) + + if result: + return { + "status": True, + "collection_name": collection_name, + "filename": filename, + "known_type": known_type, + } + except Exception as e: raise HTTPException( status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, - detail=ERROR_MESSAGES.DEFAULT(), + detail=e, ) except Exception as e: log.exception(e) @@ -532,38 +541,42 @@ def scan_docs_dir(user=Depends(get_admin_user)): ) data = loader.load() - result = store_data_in_vector_db(data, collection_name) + try: + result = store_data_in_vector_db(data, collection_name) - if result: - sanitized_filename = sanitize_filename(filename) - doc = Documents.get_doc_by_name(sanitized_filename) + if result: + sanitized_filename = sanitize_filename(filename) + doc = Documents.get_doc_by_name(sanitized_filename) - if doc == None: - doc = Documents.insert_new_doc( - user.id, - DocumentForm( - **{ - "name": sanitized_filename, - "title": filename, - "collection_name": collection_name, - "filename": filename, - "content": ( - json.dumps( - { - "tags": list( - map( - lambda name: {"name": name}, - tags, + if doc == None: + doc = Documents.insert_new_doc( + user.id, + DocumentForm( + **{ + "name": sanitized_filename, + "title": filename, + "collection_name": collection_name, + "filename": filename, + "content": ( + json.dumps( + { + "tags": list( + map( + lambda name: {"name": name}, + tags, + ) ) - ) - } - ) - if len(tags) - else "{}" - ), - } - ), - ) + } + ) + if len(tags) + else "{}" + ), + } + ), + ) + except Exception as e: + print(e) + pass except Exception as e: log.exception(e) diff --git a/backend/constants.py b/backend/constants.py index 42c5c85eb..8bcdd0789 100644 --- a/backend/constants.py +++ b/backend/constants.py @@ -60,3 +60,5 @@ class ERROR_MESSAGES(str, Enum): MODEL_NOT_FOUND = lambda name="": f"Model '{name}' was not found" OPENAI_NOT_FOUND = lambda name="": f"OpenAI API was not found" OLLAMA_NOT_FOUND = "WebUI could not connect to Ollama" + + EMPTY_CONTENT = "The content provided is empty. Please ensure that there is text or data present before proceeding." From 3688955c776c5c03afd94aa86636f1f8f80de738 Mon Sep 17 00:00:00 2001 From: "Timothy J. Baek" Date: Mon, 25 Mar 2024 23:50:52 -0700 Subject: [PATCH 5/5] fix: encoding issue --- backend/apps/rag/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/apps/rag/main.py b/backend/apps/rag/main.py index d87f7bc73..da7bb307d 100644 --- a/backend/apps/rag/main.py +++ b/backend/apps/rag/main.py @@ -411,7 +411,7 @@ def get_loader(filename: str, file_content_type: str, file_path: str): elif file_ext == "xml": loader = UnstructuredXMLLoader(file_path) elif file_ext in ["htm", "html"]: - loader = BSHTMLLoader(file_path) + loader = BSHTMLLoader(file_path, open_encoding="unicode_escape") elif file_ext == "md": loader = UnstructuredMarkdownLoader(file_path) elif file_content_type == "application/epub+zip":