From 8e5e11a55413a945a579d8e9c0d1945ad69281f0 Mon Sep 17 00:00:00 2001 From: Yuhong Sun Date: Mon, 20 Nov 2023 19:56:06 -0800 Subject: [PATCH] Add md files to File Connector (#749) --- CONTRIBUTING.md | 2 ++ README.md | 2 ++ backend/alembic/README.md | 7 ++-- backend/danswer/connectors/README.md | 2 ++ .../cross_connector_utils/file_utils.py | 32 ++++++++++++++++--- backend/danswer/connectors/file/connector.py | 6 ++-- backend/danswer/connectors/file/utils.py | 2 +- deployment/README.md | 2 ++ web/README.md | 2 ++ 9 files changed, 48 insertions(+), 9 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 966fd09b0b..6f14fe5e19 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,3 +1,5 @@ + + # Contributing to Danswer Hey there! We are so excited that you're interested in Danswer. diff --git a/README.md b/README.md index 9f4a01eff2..90047573a7 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,5 @@ + +

diff --git a/backend/alembic/README.md b/backend/alembic/README.md index ef9b50f569..3337cb4f23 100644 --- a/backend/alembic/README.md +++ b/backend/alembic/README.md @@ -1,4 +1,8 @@ -Generic single-database configuration with an async dbapi. + + +# Alembic DB Migrations +These files are for creating/updating the tables in the Relational DB (Postgres). +Danswer migrations use a generic single-database configuration with an async dbapi. ## To generate new migrations: run from danswer/backend: @@ -7,7 +11,6 @@ run from danswer/backend: More info can be found here: https://alembic.sqlalchemy.org/en/latest/autogenerate.html ## Running migrations - To run all un-applied migrations: `alembic upgrade head` diff --git a/backend/danswer/connectors/README.md b/backend/danswer/connectors/README.md index 315f11cb63..b50232fa25 100644 --- a/backend/danswer/connectors/README.md +++ b/backend/danswer/connectors/README.md @@ -1,3 +1,5 @@ + + # Writing a new Danswer Connector This README covers how to contribute a new Connector for Danswer. It includes an overview of the design, interfaces, and required changes. 
diff --git a/backend/danswer/connectors/cross_connector_utils/file_utils.py b/backend/danswer/connectors/cross_connector_utils/file_utils.py index 75c92111d6..a86c5aa6f7 100644 --- a/backend/danswer/connectors/cross_connector_utils/file_utils.py +++ b/backend/danswer/connectors/cross_connector_utils/file_utils.py @@ -1,5 +1,6 @@ import json import os +import re import zipfile from collections.abc import Generator from pathlib import Path @@ -13,7 +14,25 @@ from danswer.utils.logger import setup_logger logger = setup_logger() -_METADATA_FLAG = "#DANSWER_METADATA=" + +def extract_metadata(line: str) -> dict | None: + html_comment_pattern = r"<!--\s*DANSWER_METADATA=\{(.*?)\}\s*-->" + hashtag_pattern = r"#DANSWER_METADATA=\{(.*?)\}" + + html_comment_match = re.search(html_comment_pattern, line) + hashtag_match = re.search(hashtag_pattern, line) + + if html_comment_match: + json_str = html_comment_match.group(1) + elif hashtag_match: + json_str = hashtag_match.group(1) + else: + return None + + try: + return json.loads("{" + json_str + "}") + except json.JSONDecodeError: + return None def read_pdf_file(file: IO[Any], file_name: str, pdf_pass: str | None = None) -> str: @@ -66,7 +85,7 @@ def load_files_from_zip( yield file_info, file -def read_file(file_reader: IO[Any]) -> tuple[str, dict[str, Any]]: +def read_file(file_reader: IO[Any]) -> tuple[str, dict]: metadata = {} file_content_raw = "" for ind, line in enumerate(file_reader): @@ -74,8 +93,13 @@ def read_file(file_reader: IO[Any]) -> tuple[str, dict[str, Any]]: line = line.decode("utf-8") line = str(line) - if ind == 0 and line.startswith(_METADATA_FLAG): - metadata = json.loads(line.replace(_METADATA_FLAG, "", 1).strip()) + if ind == 0: + metadata_or_none = extract_metadata(line) + if metadata_or_none is not None: + metadata = metadata_or_none + else: + file_content_raw += line + else: file_content_raw += line diff --git a/backend/danswer/connectors/file/connector.py b/backend/danswer/connectors/file/connector.py index 72550f8595..d8daf5ce87 100644 
--- a/backend/danswer/connectors/file/connector.py +++ b/backend/danswer/connectors/file/connector.py @@ -31,7 +31,7 @@ def _open_files_at_location( if extension == ".zip": for file_info, file in load_files_from_zip(file_path, ignore_dirs=True): yield file_info.filename, file - elif extension == ".txt" or extension == ".pdf": + elif extension in [".txt", ".pdf", ".md", ".mdx"]: mode = "r" if extension == ".pdf": mode = "rb" @@ -64,7 +64,9 @@ def _process_file( return [ Document( id=file_name, - sections=[Section(link=metadata.get("link", ""), text=file_content_raw)], + sections=[ + Section(link=metadata.get("link", ""), text=file_content_raw.strip()) + ], source=DocumentSource.FILE, semantic_identifier=file_name, doc_updated_at=time_updated, diff --git a/backend/danswer/connectors/file/utils.py b/backend/danswer/connectors/file/utils.py index 5bd3641bab..cb1f26f638 100644 --- a/backend/danswer/connectors/file/utils.py +++ b/backend/danswer/connectors/file/utils.py @@ -8,7 +8,7 @@ from typing import IO from danswer.configs.app_configs import FILE_CONNECTOR_TMP_STORAGE_PATH -_VALID_FILE_EXTENSIONS = [".txt", ".zip", ".pdf"] +_VALID_FILE_EXTENSIONS = [".txt", ".zip", ".pdf", ".md", ".mdx"] def get_file_ext(file_path_or_name: str | Path) -> str: diff --git a/deployment/README.md b/deployment/README.md index a9dcbc429a..e3de372682 100644 --- a/deployment/README.md +++ b/deployment/README.md @@ -1,3 +1,5 @@ + + # Deploying Danswer The two options provided here are the easiest ways to get Danswer up and running. diff --git a/web/README.md b/web/README.md index ae7903b0f4..be2932bdc9 100644 --- a/web/README.md +++ b/web/README.md @@ -1,3 +1,5 @@ + + This is a [Next.js](https://nextjs.org/) project bootstrapped with [`create-next-app`](https://github.com/vercel/next.js/tree/canary/packages/create-next-app). ## Getting Started