Mirror of https://github.com/danswer-ai/danswer.git, synced 2025-10-09 12:47:13 +02:00
@@ -23,7 +23,7 @@ def _check_should_run(current_time: int, last_pull: int, pull_frequency: int) ->
     return current_time - last_pull > pull_frequency * 60
 
 
-async def run_update():
+async def run_update() -> None:
     logger.info("Running update")
     # TODO (chris): implement a more generic way to run updates
    # so we don't need to edit this file for future connectors
@@ -35,6 +35,7 @@ GOOGLE_DRIVE_CREDENTIAL_JSON = os.environ.get("GOOGLE_DRIVE_CREDENTIAL_JSON", ""
 GOOGLE_DRIVE_TOKENS_JSON = os.environ.get("GOOGLE_DRIVE_TOKENS_JSON", "")
 GOOGLE_DRIVE_INCLUDE_SHARED = False
 
+GITHUB_ACCESS_TOKEN = os.environ.get("GITHUB_ACCESS_TOKEN", "")
 
 #####
 # Query Configs
@@ -17,3 +17,4 @@ class DocumentSource(str, Enum):
     SLACK = "slack"
     WEB = "web"
     GOOGLE_DRIVE = "google_drive"
+    GITHUB = "github"
backend/danswer/connectors/github/__init__.py (new file, 0 lines)
backend/danswer/connectors/github/batch.py (new file, 61 lines)
@@ -0,0 +1,61 @@
+import itertools
+from collections.abc import Generator
+
+from danswer.configs.app_configs import GITHUB_ACCESS_TOKEN
+from danswer.configs.app_configs import INDEX_BATCH_SIZE
+from danswer.configs.constants import DocumentSource
+from danswer.connectors.models import Document
+from danswer.connectors.models import Section
+from danswer.connectors.type_aliases import BatchLoader
+from danswer.utils.logging import setup_logger
+from github import Github
+
+logger = setup_logger()
+
+github_client = Github(GITHUB_ACCESS_TOKEN)
+
+
+def get_pr_batches(pull_requests, batch_size):
+    it = iter(pull_requests)
+    while True:
+        batch = list(itertools.islice(it, batch_size))
+        if not batch:
+            break
+        yield batch
+
+
+class BatchGithubLoader(BatchLoader):
+    def __init__(
+        self,
+        repo_owner: str,
+        repo_name: str,
+        batch_size: int = INDEX_BATCH_SIZE,
+        state_filter: str = "all",
+    ) -> None:
+        self.repo_owner = repo_owner
+        self.repo_name = repo_name
+        self.batch_size = batch_size
+        self.state_filter = state_filter
+
+    def load(self) -> Generator[list[Document], None, None]:
+        repo = github_client.get_repo(f"{self.repo_owner}/{self.repo_name}")
+        pull_requests = repo.get_pulls(state=self.state_filter)
+        for pr_batch in get_pr_batches(pull_requests, self.batch_size):
+            doc_batch = []
+            for pull_request in pr_batch:
+                full_context = f"Pull-Request {pull_request.title} {pull_request.body}"
+                doc_batch.append(
+                    Document(
+                        id=pull_request.url,
+                        sections=[Section(link=pull_request.url, text=full_context)],
+                        source=DocumentSource.GITHUB,
+                        semantic_identifier=pull_request.title,
+                        metadata={
+                            "last_modified": pull_request.last_modified,
+                            "merged": pull_request.merged,
+                            "state": pull_request.state,
+                        },
+                    )
+                )
+
+            yield doc_batch
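
Note: a minimal usage sketch of the new loader, outside the diff itself. It assumes GITHUB_ACCESS_TOKEN is set in the environment (the module-level Github client reads it at import time); the owner/repo values here are only illustrative.

    from danswer.connectors.github.batch import BatchGithubLoader

    loader = BatchGithubLoader(repo_owner="danswer-ai", repo_name="danswer", batch_size=10)
    for doc_batch in loader.load():
        for doc in doc_batch:
            print(doc.id)  # each document id is the pull request URL
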
@@ -107,7 +107,7 @@ def retrieve_ranked_documents(
         return None
     ranked_chunks = semantic_reranking(query, top_chunks, filtered_result_set_size)
 
-    top_docs = [ranked_chunk.document_id for ranked_chunk in ranked_chunks]
+    top_docs = [ranked_chunk.source_links["0"] for ranked_chunk in ranked_chunks]
     files_log_msg = f"Top links from semantic search: {', '.join(top_docs)}"
     logger.info(files_log_msg)
 
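
Note: this hunk switches the logged values from internal document ids to user-facing links. A tiny sketch of the assumption behind source_links["0"] (a dict keyed by stringified chunk offset, where "0" holds the first link; the URL below is a made-up placeholder):

    chunk_source_links = {"0": "https://example.com/doc"}  # hypothetical shape
    top_docs = [chunk_source_links["0"]]
    print(f"Top links from semantic search: {', '.join(top_docs)}")
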
@@ -34,11 +34,6 @@ def modify_slack_config(slack_config: SlackConfig):
     update_slack_config(slack_config)
 
 
-@router.post("/connectors/slack/auth")
-def modify_slack_config(slack_config: SlackConfig):
-    update_slack_config(slack_config)
-
-
 class WebIndexAttemptRequest(BaseModel):
     url: str
 
@@ -15,6 +15,7 @@ Mako==1.2.4
 openai==0.27.6
 playwright==1.32.1
 pydantic==1.10.7
+PyGithub==1.58.2
 PyPDF2==3.0.1
 pytest-playwright==0.3.2
 qdrant-client==1.1.0
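
Note: PyGithub==1.58.2 is the dependency behind the new connector. A short sketch of the PyGithub calls the loader relies on (the token and repo name are placeholders):

    from github import Github

    client = Github("<personal-access-token>")  # placeholder token
    repo = client.get_repo("danswer-ai/danswer")
    for pr in repo.get_pulls(state="all")[:3]:  # PaginatedList supports slicing
        print(pr.title, pr.state, pr.merged)
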
@@ -5,6 +5,7 @@ from danswer.chunking.chunk import Chunker
 from danswer.chunking.chunk import DefaultChunker
 from danswer.configs.app_configs import INDEX_BATCH_SIZE
 from danswer.configs.app_configs import QDRANT_DEFAULT_COLLECTION
+from danswer.connectors.github.batch import BatchGithubLoader
 from danswer.connectors.google_drive.batch import BatchGoogleDriveLoader
 from danswer.connectors.slack.batch import BatchSlackLoader
 from danswer.connectors.type_aliases import BatchLoader
@@ -78,9 +79,23 @@ def load_google_drive_batch(qdrant_collection: str):
     )
 
 
+def load_github_batch(owner: str, repo: str, qdrant_collection: str):
+    logger.info("Loading documents from Github.")
+    load_batch(
+        BatchGithubLoader(
+            repo_owner=owner, repo_name=repo, batch_size=INDEX_BATCH_SIZE
+        ),
+        DefaultChunker(),
+        DefaultEmbedder(),
+        QdrantDatastore(collection=qdrant_collection),
+    )
+
+
 class BatchLoadingArgs(argparse.Namespace):
-    slack_export_dir: str
     website_url: str
+    github_owner: str
+    github_repo: str
+    slack_export_dir: str
     qdrant_collection: str
     rebuild_index: bool
 
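
Note: load_github_batch wires the new loader into the same chunk/embed/store pipeline as the other connectors. A hedged example call (the collection name is illustrative):

    load_github_batch(owner="danswer-ai", repo="danswer", qdrant_collection="danswer_index")
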
@@ -91,6 +106,14 @@ if __name__ == "__main__":
         "--website-url",
         default="https://docs.github.com/en/actions",
     )
+    parser.add_argument(
+        "--github-owner",
+        default="danswer-ai",
+    )
+    parser.add_argument(
+        "--github-repo",
+        default="danswer",
+    )
     parser.add_argument(
         "--slack-export-dir",
         default="/Users/chrisweaver/Downloads/test-slack-export",
@@ -109,6 +132,7 @@ if __name__ == "__main__":
     if args.rebuild_index:
         recreate_collection(args.qdrant_collection)
 
-    # load_slack_batch(args.slack_export_dir, args.qdrant_collection)
+    load_slack_batch(args.slack_export_dir, args.qdrant_collection)
     load_web_batch(args.website_url, args.qdrant_collection)
-    # load_google_drive_batch(args.qdrant_collection)
+    load_google_drive_batch(args.qdrant_collection)
+    load_github_batch(args.github_owner, args.github_repo, args.qdrant_collection)