DAN-54 Github PR Connector (#29)

Also fixes some mypy typing issues.
This commit is contained in:
Yuhong Sun 2023-05-11 18:47:32 -07:00 committed by GitHub
parent 279c5e0eb1
commit 0b610502e0
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 93 additions and 10 deletions

View File

@ -23,7 +23,7 @@ def _check_should_run(current_time: int, last_pull: int, pull_frequency: int) ->
return current_time - last_pull > pull_frequency * 60
async def run_update():
async def run_update() -> None:
logger.info("Running update")
# TODO (chris): implement a more generic way to run updates
# so we don't need to edit this file for future connectors

View File

@ -35,6 +35,7 @@ GOOGLE_DRIVE_CREDENTIAL_JSON = os.environ.get("GOOGLE_DRIVE_CREDENTIAL_JSON", ""
GOOGLE_DRIVE_TOKENS_JSON = os.environ.get("GOOGLE_DRIVE_TOKENS_JSON", "")
GOOGLE_DRIVE_INCLUDE_SHARED = False
GITHUB_ACCESS_TOKEN = os.environ.get("GITHUB_ACCESS_TOKEN", "")
#####
# Query Configs

View File

@ -17,3 +17,4 @@ class DocumentSource(str, Enum):
SLACK = "slack"
WEB = "web"
GOOGLE_DRIVE = "google_drive"
GITHUB = "github"

View File

@ -0,0 +1,61 @@
import itertools
from collections.abc import Generator
from collections.abc import Iterable
from typing import Any

from github import Github

from danswer.configs.app_configs import GITHUB_ACCESS_TOKEN
from danswer.configs.app_configs import INDEX_BATCH_SIZE
from danswer.configs.constants import DocumentSource
from danswer.connectors.models import Document
from danswer.connectors.models import Section
from danswer.connectors.type_aliases import BatchLoader
from danswer.utils.logging import setup_logger
logger = setup_logger()
# Module-level shared client built from GITHUB_ACCESS_TOKEN; the config
# defaults the token to "" when unset — presumably that yields anonymous,
# rate-limited access, TODO confirm against PyGithub docs.
github_client = Github(GITHUB_ACCESS_TOKEN)
def get_pr_batches(
    pull_requests: Iterable[Any], batch_size: int
) -> Generator[list[Any], None, None]:
    """Yield successive lists of at most ``batch_size`` pull requests.

    Works on any iterable (including PyGithub's lazy ``PaginatedList``)
    without materializing it; the final batch may be shorter. Yields
    nothing for an empty iterable or a non-positive ``batch_size``.
    """
    it = iter(pull_requests)
    while True:
        # islice consumes at most batch_size items per pass.
        batch = list(itertools.islice(it, batch_size))
        if not batch:
            break
        yield batch
class BatchGithubLoader(BatchLoader):
    """Loads pull requests from a GitHub repository as batches of Documents."""

    def __init__(
        self,
        repo_owner: str,
        repo_name: str,
        batch_size: int = INDEX_BATCH_SIZE,
        state_filter: str = "all",
    ) -> None:
        # state_filter is passed straight through to the GitHub API
        # ("open" / "closed" / "all").
        self.repo_owner = repo_owner
        self.repo_name = repo_name
        self.batch_size = batch_size
        self.state_filter = state_filter

    def load(self) -> Generator[list[Document], None, None]:
        """Yield lists of Documents, one per pull request, batch_size at a time."""
        repo = github_client.get_repo(f"{self.repo_owner}/{self.repo_name}")
        pull_requests = repo.get_pulls(state=self.state_filter)
        for batch in get_pr_batches(pull_requests, self.batch_size):
            yield [
                Document(
                    id=pr.url,
                    sections=[
                        Section(
                            link=pr.url,
                            text=f"Pull-Request {pr.title} {pr.body}",
                        )
                    ],
                    source=DocumentSource.GITHUB,
                    semantic_identifier=pr.title,
                    metadata={
                        "last_modified": pr.last_modified,
                        "merged": pr.merged,
                        "state": pr.state,
                    },
                )
                for pr in batch
            ]

View File

@ -107,7 +107,7 @@ def retrieve_ranked_documents(
return None
ranked_chunks = semantic_reranking(query, top_chunks, filtered_result_set_size)
top_docs = [ranked_chunk.document_id for ranked_chunk in ranked_chunks]
top_docs = [ranked_chunk.source_links["0"] for ranked_chunk in ranked_chunks]
files_log_msg = f"Top links from semantic search: {', '.join(top_docs)}"
logger.info(files_log_msg)

View File

@ -34,11 +34,6 @@ def modify_slack_config(slack_config: SlackConfig):
update_slack_config(slack_config)
@router.post("/connectors/slack/auth")
def modify_slack_config(slack_config: SlackConfig):
update_slack_config(slack_config)
class WebIndexAttemptRequest(BaseModel):
url: str

View File

@ -15,6 +15,7 @@ Mako==1.2.4
openai==0.27.6
playwright==1.32.1
pydantic==1.10.7
PyGithub==1.58.2
PyPDF2==3.0.1
pytest-playwright==0.3.2
qdrant-client==1.1.0

View File

@ -5,6 +5,7 @@ from danswer.chunking.chunk import Chunker
from danswer.chunking.chunk import DefaultChunker
from danswer.configs.app_configs import INDEX_BATCH_SIZE
from danswer.configs.app_configs import QDRANT_DEFAULT_COLLECTION
from danswer.connectors.github.batch import BatchGithubLoader
from danswer.connectors.google_drive.batch import BatchGoogleDriveLoader
from danswer.connectors.slack.batch import BatchSlackLoader
from danswer.connectors.type_aliases import BatchLoader
@ -78,9 +79,23 @@ def load_google_drive_batch(qdrant_collection: str):
)
def load_github_batch(owner: str, repo: str, qdrant_collection: str) -> None:
    """Index all pull requests from the given GitHub repo into Qdrant.

    Mirrors the other load_*_batch helpers: builds the connector, then
    chunks, embeds, and stores into the named Qdrant collection.
    (Return annotation added for consistency with the other mypy fixes.)
    """
    logger.info("Loading documents from Github.")
    load_batch(
        BatchGithubLoader(
            repo_owner=owner, repo_name=repo, batch_size=INDEX_BATCH_SIZE
        ),
        DefaultChunker(),
        DefaultEmbedder(),
        QdrantDatastore(collection=qdrant_collection),
    )
class BatchLoadingArgs(argparse.Namespace):
    """Typed view of the CLI arguments parsed in __main__.

    Fix: ``slack_export_dir`` was annotated twice; the duplicate (a
    diff-merge artifact) is removed — a repeated annotation is harmless
    at runtime but redundant and confusing.
    """

    slack_export_dir: str
    website_url: str
    github_owner: str
    github_repo: str
    qdrant_collection: str
    rebuild_index: bool
@ -91,6 +106,14 @@ if __name__ == "__main__":
"--website-url",
default="https://docs.github.com/en/actions",
)
parser.add_argument(
"--github-owner",
default="danswer-ai",
)
parser.add_argument(
"--github-repo",
default="danswer",
)
parser.add_argument(
"--slack-export-dir",
default="/Users/chrisweaver/Downloads/test-slack-export",
@ -109,6 +132,7 @@ if __name__ == "__main__":
if args.rebuild_index:
recreate_collection(args.qdrant_collection)
# load_slack_batch(args.slack_export_dir, args.qdrant_collection)
load_slack_batch(args.slack_export_dir, args.qdrant_collection)
load_web_batch(args.website_url, args.qdrant_collection)
# load_google_drive_batch(args.qdrant_collection)
load_google_drive_batch(args.qdrant_collection)
load_github_batch(args.github_owner, args.github_repo, args.qdrant_collection)