Add md files to File Connector (#749)

This commit is contained in:
Yuhong Sun
2023-11-20 19:56:06 -08:00
committed by GitHub
parent 57f0323f52
commit 8e5e11a554
9 changed files with 48 additions and 9 deletions

View File

@ -1,3 +1,5 @@
<!-- DANSWER_METADATA={"link": "https://github.com/danswer-ai/danswer/blob/main/CONTRIBUTING.md"} -->
# Contributing to Danswer # Contributing to Danswer
Hey there! We are so excited that you're interested in Danswer. Hey there! We are so excited that you're interested in Danswer.

View File

@ -1,3 +1,5 @@
<!-- DANSWER_METADATA={"link": "https://github.com/danswer-ai/danswer/blob/main/README.md"} -->
<h2 align="center"> <h2 align="center">
<a href="https://www.danswer.ai/"> <img width="50%" src="https://github.com/danswer-owners/danswer/blob/1fabd9372d66cd54238847197c33f091a724803b/DanswerWithName.png?raw=true)" /></a> <a href="https://www.danswer.ai/"> <img width="50%" src="https://github.com/danswer-owners/danswer/blob/1fabd9372d66cd54238847197c33f091a724803b/DanswerWithName.png?raw=true)" /></a>
</h2> </h2>

View File

@ -1,4 +1,8 @@
Generic single-database configuration with an async dbapi. <!-- DANSWER_METADATA={"link": "https://github.com/danswer-ai/danswer/blob/main/backend/alembic/README.md"} -->
# Alembic DB Migrations
These files are for creating/updating the tables in the Relational DB (Postgres).
Danswer migrations use a generic single-database configuration with an async dbapi.
## To generate new migrations: ## To generate new migrations:
run from danswer/backend: run from danswer/backend:
@ -7,7 +11,6 @@ run from danswer/backend:
More info can be found here: https://alembic.sqlalchemy.org/en/latest/autogenerate.html More info can be found here: https://alembic.sqlalchemy.org/en/latest/autogenerate.html
## Running migrations ## Running migrations
To run all un-applied migrations: To run all un-applied migrations:
`alembic upgrade head` `alembic upgrade head`

View File

@ -1,3 +1,5 @@
<!-- DANSWER_METADATA={"link": "https://github.com/danswer-ai/danswer/blob/main/backend/danswer/connectors/README.md"} -->
# Writing a new Danswer Connector # Writing a new Danswer Connector
This README covers how to contribute a new Connector for Danswer. It includes an overview of the design, interfaces, This README covers how to contribute a new Connector for Danswer. It includes an overview of the design, interfaces,
and required changes. and required changes.

View File

@ -1,5 +1,6 @@
import json import json
import os import os
import re
import zipfile import zipfile
from collections.abc import Generator from collections.abc import Generator
from pathlib import Path from pathlib import Path
@ -13,7 +14,25 @@ from danswer.utils.logger import setup_logger
logger = setup_logger() logger = setup_logger()
_METADATA_FLAG = "#DANSWER_METADATA="
def extract_metadata(line: str) -> dict | None:
html_comment_pattern = r"<!--\s*DANSWER_METADATA=\{(.*?)\}\s*-->"
hashtag_pattern = r"#DANSWER_METADATA=\{(.*?)\}"
html_comment_match = re.search(html_comment_pattern, line)
hashtag_match = re.search(hashtag_pattern, line)
if html_comment_match:
json_str = html_comment_match.group(1)
elif hashtag_match:
json_str = hashtag_match.group(1)
else:
return None
try:
return json.loads("{" + json_str + "}")
except json.JSONDecodeError:
return None
def read_pdf_file(file: IO[Any], file_name: str, pdf_pass: str | None = None) -> str: def read_pdf_file(file: IO[Any], file_name: str, pdf_pass: str | None = None) -> str:
@ -66,7 +85,7 @@ def load_files_from_zip(
yield file_info, file yield file_info, file
def read_file(file_reader: IO[Any]) -> tuple[str, dict[str, Any]]: def read_file(file_reader: IO[Any]) -> tuple[str, dict]:
metadata = {} metadata = {}
file_content_raw = "" file_content_raw = ""
for ind, line in enumerate(file_reader): for ind, line in enumerate(file_reader):
@ -74,8 +93,13 @@ def read_file(file_reader: IO[Any]) -> tuple[str, dict[str, Any]]:
line = line.decode("utf-8") line = line.decode("utf-8")
line = str(line) line = str(line)
if ind == 0 and line.startswith(_METADATA_FLAG): if ind == 0:
metadata = json.loads(line.replace(_METADATA_FLAG, "", 1).strip()) metadata_or_none = extract_metadata(line)
if metadata_or_none is not None:
metadata = metadata_or_none
else:
file_content_raw += line
else: else:
file_content_raw += line file_content_raw += line

View File

@ -31,7 +31,7 @@ def _open_files_at_location(
if extension == ".zip": if extension == ".zip":
for file_info, file in load_files_from_zip(file_path, ignore_dirs=True): for file_info, file in load_files_from_zip(file_path, ignore_dirs=True):
yield file_info.filename, file yield file_info.filename, file
elif extension == ".txt" or extension == ".pdf": elif extension in [".txt", ".pdf", ".md", ".mdx"]:
mode = "r" mode = "r"
if extension == ".pdf": if extension == ".pdf":
mode = "rb" mode = "rb"
@ -64,7 +64,9 @@ def _process_file(
return [ return [
Document( Document(
id=file_name, id=file_name,
sections=[Section(link=metadata.get("link", ""), text=file_content_raw)], sections=[
Section(link=metadata.get("link", ""), text=file_content_raw.strip())
],
source=DocumentSource.FILE, source=DocumentSource.FILE,
semantic_identifier=file_name, semantic_identifier=file_name,
doc_updated_at=time_updated, doc_updated_at=time_updated,

View File

@ -8,7 +8,7 @@ from typing import IO
from danswer.configs.app_configs import FILE_CONNECTOR_TMP_STORAGE_PATH from danswer.configs.app_configs import FILE_CONNECTOR_TMP_STORAGE_PATH
_VALID_FILE_EXTENSIONS = [".txt", ".zip", ".pdf"] _VALID_FILE_EXTENSIONS = [".txt", ".zip", ".pdf", ".md", ".mdx"]
def get_file_ext(file_path_or_name: str | Path) -> str: def get_file_ext(file_path_or_name: str | Path) -> str:

View File

@ -1,3 +1,5 @@
<!-- DANSWER_METADATA={"link": "https://github.com/danswer-ai/danswer/blob/main/deployment/README.md"} -->
# Deploying Danswer # Deploying Danswer
The two options provided here are the easiest ways to get Danswer up and running. The two options provided here are the easiest ways to get Danswer up and running.

View File

@ -1,3 +1,5 @@
<!-- DANSWER_METADATA={"link": "https://github.com/danswer-ai/danswer/blob/main/web/README.md"} -->
This is a [Next.js](https://nextjs.org/) project bootstrapped with [`create-next-app`](https://github.com/vercel/next.js/tree/canary/packages/create-next-app). This is a [Next.js](https://nextjs.org/) project bootstrapped with [`create-next-app`](https://github.com/vercel/next.js/tree/canary/packages/create-next-app).
## Getting Started ## Getting Started