From 8e5e11a55413a945a579d8e9c0d1945ad69281f0 Mon Sep 17 00:00:00 2001
From: Yuhong Sun <yuhongsun96@gmail.com>
Date: Mon, 20 Nov 2023 19:56:06 -0800
Subject: [PATCH] Add md files to File Connector (#749)

---
 CONTRIBUTING.md                               |  2 ++
 README.md                                     |  2 ++
 backend/alembic/README.md                     |  7 ++--
 backend/danswer/connectors/README.md          |  2 ++
 .../cross_connector_utils/file_utils.py       | 32 ++++++++++++++++---
 backend/danswer/connectors/file/connector.py  |  6 ++--
 backend/danswer/connectors/file/utils.py      |  2 +-
 deployment/README.md                          |  2 ++
 web/README.md                                 |  2 ++
 9 files changed, 48 insertions(+), 9 deletions(-)
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 966fd09b0b..6f14fe5e19 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -1,3 +1,5 @@
+<!-- DANSWER_METADATA={"link": "https://github.com/danswer-ai/danswer/blob/main/CONTRIBUTING.md"} -->
+
 # Contributing to Danswer
 Hey there! We are so excited that you're interested in Danswer.
 
diff --git a/README.md b/README.md
index 9f4a01eff2..90047573a7 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,5 @@
+<!-- DANSWER_METADATA={"link": "https://github.com/danswer-ai/danswer/blob/main/README.md"} -->
+
 <h2 align="center">
 <a href="https://www.danswer.ai/"> <img width="50%" src="https://github.com/danswer-owners/danswer/blob/1fabd9372d66cd54238847197c33f091a724803b/DanswerWithName.png?raw=true)" /></a>
 </h2>
diff --git a/backend/alembic/README.md b/backend/alembic/README.md
index ef9b50f569..3337cb4f23 100644
--- a/backend/alembic/README.md
+++ b/backend/alembic/README.md
@@ -1,4 +1,8 @@
-Generic single-database configuration with an async dbapi.
+<!-- DANSWER_METADATA={"link": "https://github.com/danswer-ai/danswer/blob/main/backend/alembic/README.md"} -->
+
+# Alembic DB Migrations
+These files create and update the tables in the relational database (Postgres).
+Danswer migrations use a generic single-database configuration with an async dbapi.
 
 ## To generate new migrations: 
 run from danswer/backend:
@@ -7,7 +11,6 @@ run from danswer/backend:
 More info can be found here: https://alembic.sqlalchemy.org/en/latest/autogenerate.html
 
 ## Running migrations
-
 To run all un-applied migrations:
 `alembic upgrade head`
 
diff --git a/backend/danswer/connectors/README.md b/backend/danswer/connectors/README.md
index 315f11cb63..b50232fa25 100644
--- a/backend/danswer/connectors/README.md
+++ b/backend/danswer/connectors/README.md
@@ -1,3 +1,5 @@
+<!-- DANSWER_METADATA={"link": "https://github.com/danswer-ai/danswer/blob/main/backend/danswer/connectors/README.md"} -->
+
 # Writing a new Danswer Connector
 This README covers how to contribute a new Connector for Danswer. It includes an overview of the design, interfaces,
 and required changes.
diff --git a/backend/danswer/connectors/cross_connector_utils/file_utils.py b/backend/danswer/connectors/cross_connector_utils/file_utils.py
index 75c92111d6..a86c5aa6f7 100644
--- a/backend/danswer/connectors/cross_connector_utils/file_utils.py
+++ b/backend/danswer/connectors/cross_connector_utils/file_utils.py
@@ -1,5 +1,6 @@
 import json
 import os
+import re
 import zipfile
 from collections.abc import Generator
 from pathlib import Path
@@ -13,7 +14,25 @@ from danswer.utils.logger import setup_logger
 
 logger = setup_logger()
 
-_METADATA_FLAG = "#DANSWER_METADATA="
+
+def extract_metadata(line: str) -> dict | None:  # Parse a DANSWER_METADATA tag found in `line`; None if absent or not valid JSON.
+    html_comment_pattern = r"<!--\s*DANSWER_METADATA=\{(.*?)\}\s*-->"  # form: <!-- DANSWER_METADATA={...} -->
+    hashtag_pattern = r"#DANSWER_METADATA=\{(.*?)\}"  # NOTE(review): lazy .*? ends at the first "}", so a nested object or a "}" inside a string is cut short

+    html_comment_match = re.search(html_comment_pattern, line)  # search() matches anywhere in the line, not only at its start
+    hashtag_match = re.search(hashtag_pattern, line)

+    if html_comment_match:  # the HTML-comment form takes precedence when both forms match
+        json_str = html_comment_match.group(1)  # group(1) is the text between the outer braces
+    elif hashtag_match:
+        json_str = hashtag_match.group(1)
+    else:
+        return None  # no metadata tag on this line

+    try:
+        return json.loads("{" + json_str + "}")  # re-add the braces that the capture group leaves out
+    except json.JSONDecodeError:
+        return None  # malformed JSON is treated as "no metadata" rather than raised
 
 
 def read_pdf_file(file: IO[Any], file_name: str, pdf_pass: str | None = None) -> str:
@@ -66,7 +85,7 @@ def load_files_from_zip(
                 yield file_info, file
 
 
-def read_file(file_reader: IO[Any]) -> tuple[str, dict[str, Any]]:
+def read_file(file_reader: IO[Any]) -> tuple[str, dict]:
     metadata = {}
     file_content_raw = ""
     for ind, line in enumerate(file_reader):
@@ -74,8 +93,13 @@ def read_file(file_reader: IO[Any]) -> tuple[str, dict[str, Any]]:
             line = line.decode("utf-8")
         line = str(line)
 
-        if ind == 0 and line.startswith(_METADATA_FLAG):
-            metadata = json.loads(line.replace(_METADATA_FLAG, "", 1).strip())
+        if ind == 0:
+            metadata_or_none = extract_metadata(line)
+            if metadata_or_none is not None:
+                metadata = metadata_or_none
+            else:
+                file_content_raw += line
+
         else:
             file_content_raw += line
 
diff --git a/backend/danswer/connectors/file/connector.py b/backend/danswer/connectors/file/connector.py
index 72550f8595..d8daf5ce87 100644
--- a/backend/danswer/connectors/file/connector.py
+++ b/backend/danswer/connectors/file/connector.py
@@ -31,7 +31,7 @@ def _open_files_at_location(
     if extension == ".zip":
         for file_info, file in load_files_from_zip(file_path, ignore_dirs=True):
             yield file_info.filename, file
-    elif extension == ".txt" or extension == ".pdf":
+    elif extension in [".txt", ".pdf", ".md", ".mdx"]:
         mode = "r"
         if extension == ".pdf":
             mode = "rb"
@@ -64,7 +64,9 @@ def _process_file(
     return [
         Document(
             id=file_name,
-            sections=[Section(link=metadata.get("link", ""), text=file_content_raw)],
+            sections=[
+                Section(link=metadata.get("link", ""), text=file_content_raw.strip())
+            ],
             source=DocumentSource.FILE,
             semantic_identifier=file_name,
             doc_updated_at=time_updated,
diff --git a/backend/danswer/connectors/file/utils.py b/backend/danswer/connectors/file/utils.py
index 5bd3641bab..cb1f26f638 100644
--- a/backend/danswer/connectors/file/utils.py
+++ b/backend/danswer/connectors/file/utils.py
@@ -8,7 +8,7 @@ from typing import IO
 
 from danswer.configs.app_configs import FILE_CONNECTOR_TMP_STORAGE_PATH
 
-_VALID_FILE_EXTENSIONS = [".txt", ".zip", ".pdf"]
+_VALID_FILE_EXTENSIONS = [".txt", ".zip", ".pdf", ".md", ".mdx"]
 
 
 def get_file_ext(file_path_or_name: str | Path) -> str:
diff --git a/deployment/README.md b/deployment/README.md
index a9dcbc429a..e3de372682 100644
--- a/deployment/README.md
+++ b/deployment/README.md
@@ -1,3 +1,5 @@
+<!-- DANSWER_METADATA={"link": "https://github.com/danswer-ai/danswer/blob/main/deployment/README.md"} -->
+
 # Deploying Danswer
 The two options provided here are the easiest ways to get Danswer up and running.
 
diff --git a/web/README.md b/web/README.md
index ae7903b0f4..be2932bdc9 100644
--- a/web/README.md
+++ b/web/README.md
@@ -1,3 +1,5 @@
+<!-- DANSWER_METADATA={"link": "https://github.com/danswer-ai/danswer/blob/main/web/README.md"} -->
+
 This is a [Next.js](https://nextjs.org/) project bootstrapped with [`create-next-app`](https://github.com/vercel/next.js/tree/canary/packages/create-next-app).
 
 ## Getting Started