mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-06-29 17:20:44 +02:00
Add md files to File Connector (#749)
This commit is contained in:
@ -1,3 +1,5 @@
|
|||||||
|
<!-- DANSWER_METADATA={"link": "https://github.com/danswer-ai/danswer/blob/main/CONTRIBUTING.md"} -->
|
||||||
|
|
||||||
# Contributing to Danswer
|
# Contributing to Danswer
|
||||||
Hey there! We are so excited that you're interested in Danswer.
|
Hey there! We are so excited that you're interested in Danswer.
|
||||||
|
|
||||||
|
@ -1,3 +1,5 @@
|
|||||||
|
<!-- DANSWER_METADATA={"link": "https://github.com/danswer-ai/danswer/blob/main/README.md"} -->
|
||||||
|
|
||||||
<h2 align="center">
|
<h2 align="center">
|
||||||
<a href="https://www.danswer.ai/"> <img width="50%" src="https://github.com/danswer-owners/danswer/blob/1fabd9372d66cd54238847197c33f091a724803b/DanswerWithName.png?raw=true" /></a>
|
<a href="https://www.danswer.ai/"> <img width="50%" src="https://github.com/danswer-owners/danswer/blob/1fabd9372d66cd54238847197c33f091a724803b/DanswerWithName.png?raw=true" /></a>
|
||||||
</h2>
|
</h2>
|
||||||
|
@ -1,4 +1,8 @@
|
|||||||
Generic single-database configuration with an async dbapi.
|
<!-- DANSWER_METADATA={"link": "https://github.com/danswer-ai/danswer/blob/main/backend/alembic/README.md"} -->
|
||||||
|
|
||||||
|
# Alembic DB Migrations
|
||||||
|
These files are for creating/updating the tables in the Relational DB (Postgres).
|
||||||
|
Danswer migrations use a generic single-database configuration with an async dbapi.
|
||||||
|
|
||||||
## To generate new migrations:
|
## To generate new migrations:
|
||||||
run from danswer/backend:
|
run from danswer/backend:
|
||||||
@ -7,7 +11,6 @@ run from danswer/backend:
|
|||||||
More info can be found here: https://alembic.sqlalchemy.org/en/latest/autogenerate.html
|
More info can be found here: https://alembic.sqlalchemy.org/en/latest/autogenerate.html
|
||||||
|
|
||||||
## Running migrations
|
## Running migrations
|
||||||
|
|
||||||
To run all un-applied migrations:
|
To run all un-applied migrations:
|
||||||
`alembic upgrade head`
|
`alembic upgrade head`
|
||||||
|
|
||||||
|
@ -1,3 +1,5 @@
|
|||||||
|
<!-- DANSWER_METADATA={"link": "https://github.com/danswer-ai/danswer/blob/main/backend/danswer/connectors/README.md"} -->
|
||||||
|
|
||||||
# Writing a new Danswer Connector
|
# Writing a new Danswer Connector
|
||||||
This README covers how to contribute a new Connector for Danswer. It includes an overview of the design, interfaces,
|
This README covers how to contribute a new Connector for Danswer. It includes an overview of the design, interfaces,
|
||||||
and required changes.
|
and required changes.
|
||||||
|
@ -1,5 +1,6 @@
|
|||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
|
import re
|
||||||
import zipfile
|
import zipfile
|
||||||
from collections.abc import Generator
|
from collections.abc import Generator
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
@ -13,7 +14,25 @@ from danswer.utils.logger import setup_logger
|
|||||||
|
|
||||||
logger = setup_logger()
|
logger = setup_logger()
|
||||||
|
|
||||||
_METADATA_FLAG = "#DANSWER_METADATA="
|
|
||||||
|
def extract_metadata(line: str) -> dict | None:
|
||||||
|
html_comment_pattern = r"<!--\s*DANSWER_METADATA=\{(.*?)\}\s*-->"
|
||||||
|
hashtag_pattern = r"#DANSWER_METADATA=\{(.*?)\}"
|
||||||
|
|
||||||
|
html_comment_match = re.search(html_comment_pattern, line)
|
||||||
|
hashtag_match = re.search(hashtag_pattern, line)
|
||||||
|
|
||||||
|
if html_comment_match:
|
||||||
|
json_str = html_comment_match.group(1)
|
||||||
|
elif hashtag_match:
|
||||||
|
json_str = hashtag_match.group(1)
|
||||||
|
else:
|
||||||
|
return None
|
||||||
|
|
||||||
|
try:
|
||||||
|
return json.loads("{" + json_str + "}")
|
||||||
|
except json.JSONDecodeError:
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
def read_pdf_file(file: IO[Any], file_name: str, pdf_pass: str | None = None) -> str:
|
def read_pdf_file(file: IO[Any], file_name: str, pdf_pass: str | None = None) -> str:
|
||||||
@ -66,7 +85,7 @@ def load_files_from_zip(
|
|||||||
yield file_info, file
|
yield file_info, file
|
||||||
|
|
||||||
|
|
||||||
def read_file(file_reader: IO[Any]) -> tuple[str, dict[str, Any]]:
|
def read_file(file_reader: IO[Any]) -> tuple[str, dict]:
|
||||||
metadata = {}
|
metadata = {}
|
||||||
file_content_raw = ""
|
file_content_raw = ""
|
||||||
for ind, line in enumerate(file_reader):
|
for ind, line in enumerate(file_reader):
|
||||||
@ -74,8 +93,13 @@ def read_file(file_reader: IO[Any]) -> tuple[str, dict[str, Any]]:
|
|||||||
line = line.decode("utf-8")
|
line = line.decode("utf-8")
|
||||||
line = str(line)
|
line = str(line)
|
||||||
|
|
||||||
if ind == 0 and line.startswith(_METADATA_FLAG):
|
if ind == 0:
|
||||||
metadata = json.loads(line.replace(_METADATA_FLAG, "", 1).strip())
|
metadata_or_none = extract_metadata(line)
|
||||||
|
if metadata_or_none is not None:
|
||||||
|
metadata = metadata_or_none
|
||||||
|
else:
|
||||||
|
file_content_raw += line
|
||||||
|
|
||||||
else:
|
else:
|
||||||
file_content_raw += line
|
file_content_raw += line
|
||||||
|
|
||||||
|
@ -31,7 +31,7 @@ def _open_files_at_location(
|
|||||||
if extension == ".zip":
|
if extension == ".zip":
|
||||||
for file_info, file in load_files_from_zip(file_path, ignore_dirs=True):
|
for file_info, file in load_files_from_zip(file_path, ignore_dirs=True):
|
||||||
yield file_info.filename, file
|
yield file_info.filename, file
|
||||||
elif extension == ".txt" or extension == ".pdf":
|
elif extension in [".txt", ".pdf", ".md", ".mdx"]:
|
||||||
mode = "r"
|
mode = "r"
|
||||||
if extension == ".pdf":
|
if extension == ".pdf":
|
||||||
mode = "rb"
|
mode = "rb"
|
||||||
@ -64,7 +64,9 @@ def _process_file(
|
|||||||
return [
|
return [
|
||||||
Document(
|
Document(
|
||||||
id=file_name,
|
id=file_name,
|
||||||
sections=[Section(link=metadata.get("link", ""), text=file_content_raw)],
|
sections=[
|
||||||
|
Section(link=metadata.get("link", ""), text=file_content_raw.strip())
|
||||||
|
],
|
||||||
source=DocumentSource.FILE,
|
source=DocumentSource.FILE,
|
||||||
semantic_identifier=file_name,
|
semantic_identifier=file_name,
|
||||||
doc_updated_at=time_updated,
|
doc_updated_at=time_updated,
|
||||||
|
@ -8,7 +8,7 @@ from typing import IO
|
|||||||
|
|
||||||
from danswer.configs.app_configs import FILE_CONNECTOR_TMP_STORAGE_PATH
|
from danswer.configs.app_configs import FILE_CONNECTOR_TMP_STORAGE_PATH
|
||||||
|
|
||||||
_VALID_FILE_EXTENSIONS = [".txt", ".zip", ".pdf"]
|
_VALID_FILE_EXTENSIONS = [".txt", ".zip", ".pdf", ".md", ".mdx"]
|
||||||
|
|
||||||
|
|
||||||
def get_file_ext(file_path_or_name: str | Path) -> str:
|
def get_file_ext(file_path_or_name: str | Path) -> str:
|
||||||
|
@ -1,3 +1,5 @@
|
|||||||
|
<!-- DANSWER_METADATA={"link": "https://github.com/danswer-ai/danswer/blob/main/deployment/README.md"} -->
|
||||||
|
|
||||||
# Deploying Danswer
|
# Deploying Danswer
|
||||||
The two options provided here are the easiest ways to get Danswer up and running.
|
The two options provided here are the easiest ways to get Danswer up and running.
|
||||||
|
|
||||||
|
@ -1,3 +1,5 @@
|
|||||||
|
<!-- DANSWER_METADATA={"link": "https://github.com/danswer-ai/danswer/blob/main/web/README.md"} -->
|
||||||
|
|
||||||
This is a [Next.js](https://nextjs.org/) project bootstrapped with [`create-next-app`](https://github.com/vercel/next.js/tree/canary/packages/create-next-app).
|
This is a [Next.js](https://nextjs.org/) project bootstrapped with [`create-next-app`](https://github.com/vercel/next.js/tree/canary/packages/create-next-app).
|
||||||
|
|
||||||
## Getting Started
|
## Getting Started
|
||||||
|
Reference in New Issue
Block a user