Mirror of https://github.com/danswer-ai/danswer.git (synced 2025-04-09 04:18:32 +02:00)
Parent: 279c5e0eb1
Commit: 0b610502e0
@@ -23,7 +23,7 @@ def _check_should_run(current_time: int, last_pull: int, pull_frequency: int) ->
     return current_time - last_pull > pull_frequency * 60


-async def run_update():
+async def run_update() -> None:
     logger.info("Running update")
     # TODO (chris): implement a more generic way to run updates
     # so we don't need to edit this file for future connectors
@@ -35,6 +35,7 @@ GOOGLE_DRIVE_CREDENTIAL_JSON = os.environ.get("GOOGLE_DRIVE_CREDENTIAL_JSON", ""
 GOOGLE_DRIVE_TOKENS_JSON = os.environ.get("GOOGLE_DRIVE_TOKENS_JSON", "")
 GOOGLE_DRIVE_INCLUDE_SHARED = False

+GITHUB_ACCESS_TOKEN = os.environ.get("GITHUB_ACCESS_TOKEN", "")

 #####
 # Query Configs
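The new setting follows the same environment-variable pattern as the other connector credentials: it is read once at import time and handed to the PyGithub client. A minimal sketch of that flow, assuming GITHUB_ACCESS_TOKEN is exported before the backend starts (the token value is a placeholder):

# Sketch only: reads the token the same way the config module does and builds the
# PyGithub client this commit introduces; the token shown is a placeholder.
import os
from github import Github  # provided by the PyGithub dependency added below

token = os.environ.get("GITHUB_ACCESS_TOKEN", "")  # e.g. export GITHUB_ACCESS_TOKEN=ghp_...
client = Github(token)
repo = client.get_repo("danswer-ai/danswer")  # owner/repo defaults used later in this commit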
@@ -17,3 +17,4 @@ class DocumentSource(str, Enum):
     SLACK = "slack"
     WEB = "web"
     GOOGLE_DRIVE = "google_drive"
+    GITHUB = "github"
New files:
backend/danswer/connectors/github/__init__.py (0 lines)
backend/danswer/connectors/github/batch.py (61 lines)
@@ -0,0 +1,61 @@
+import itertools
+from collections.abc import Generator
+
+from danswer.configs.app_configs import GITHUB_ACCESS_TOKEN
+from danswer.configs.app_configs import INDEX_BATCH_SIZE
+from danswer.configs.constants import DocumentSource
+from danswer.connectors.models import Document
+from danswer.connectors.models import Section
+from danswer.connectors.type_aliases import BatchLoader
+from danswer.utils.logging import setup_logger
+from github import Github
+
+logger = setup_logger()
+
+github_client = Github(GITHUB_ACCESS_TOKEN)
+
+
+def get_pr_batches(pull_requests, batch_size):
+    it = iter(pull_requests)
+    while True:
+        batch = list(itertools.islice(it, batch_size))
+        if not batch:
+            break
+        yield batch
+
+
+class BatchGithubLoader(BatchLoader):
+    def __init__(
+        self,
+        repo_owner: str,
+        repo_name: str,
+        batch_size: int = INDEX_BATCH_SIZE,
+        state_filter: str = "all",
+    ) -> None:
+        self.repo_owner = repo_owner
+        self.repo_name = repo_name
+        self.batch_size = batch_size
+        self.state_filter = state_filter
+
+    def load(self) -> Generator[list[Document], None, None]:
+        repo = github_client.get_repo(f"{self.repo_owner}/{self.repo_name}")
+        pull_requests = repo.get_pulls(state=self.state_filter)
+        for pr_batch in get_pr_batches(pull_requests, self.batch_size):
+            doc_batch = []
+            for pull_request in pr_batch:
+                full_context = f"Pull-Request {pull_request.title} {pull_request.body}"
+                doc_batch.append(
+                    Document(
+                        id=pull_request.url,
+                        sections=[Section(link=pull_request.url, text=full_context)],
+                        source=DocumentSource.GITHUB,
+                        semantic_identifier=pull_request.title,
+                        metadata={
+                            "last_modified": pull_request.last_modified,
+                            "merged": pull_request.merged,
+                            "state": pull_request.state,
+                        },
+                    )
+                )
+
+            yield doc_batch
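For orientation, a minimal usage sketch for the new loader; the owner/repo values are the defaults this commit registers elsewhere, and GITHUB_ACCESS_TOKEN is assumed to be set so the module-level client can authenticate:

# Sketch only: iterate the connector's batches and print a couple of fields.
from danswer.connectors.github.batch import BatchGithubLoader

loader = BatchGithubLoader(repo_owner="danswer-ai", repo_name="danswer", batch_size=10)
for doc_batch in loader.load():
    for doc in doc_batch:  # each Document wraps one pull request
        print(doc.semantic_identifier, doc.metadata["state"])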
@@ -107,7 +107,7 @@ def retrieve_ranked_documents(
         return None
     ranked_chunks = semantic_reranking(query, top_chunks, filtered_result_set_size)

-    top_docs = [ranked_chunk.document_id for ranked_chunk in ranked_chunks]
+    top_docs = [ranked_chunk.source_links["0"] for ranked_chunk in ranked_chunks]
     files_log_msg = f"Top links from semantic search: {', '.join(top_docs)}"
     logger.info(files_log_msg)

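The change above swaps document IDs for each chunk's first source link, so the log now lists links (for GitHub documents, pull-request URLs). A hedged illustration of the assumed shape of source_links (string offsets mapped to links, with "0" pointing at the link for the start of the chunk):

# Illustration only: the mapping shape is inferred from the diff, not a verified
# schema, and the URL is a placeholder.
source_links = {"0": "https://github.com/danswer-ai/danswer/pull/1"}
top_docs = [source_links["0"]]
print(f"Top links from semantic search: {', '.join(top_docs)}")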
@@ -34,11 +34,6 @@ def modify_slack_config(slack_config: SlackConfig):
     update_slack_config(slack_config)


-@router.post("/connectors/slack/auth")
-def modify_slack_config(slack_config: SlackConfig):
-    update_slack_config(slack_config)
-
-
 class WebIndexAttemptRequest(BaseModel):
     url: str

@@ -15,6 +15,7 @@ Mako==1.2.4
 openai==0.27.6
 playwright==1.32.1
 pydantic==1.10.7
+PyGithub==1.58.2
 PyPDF2==3.0.1
 pytest-playwright==0.3.2
 qdrant-client==1.1.0
@@ -5,6 +5,7 @@ from danswer.chunking.chunk import Chunker
 from danswer.chunking.chunk import DefaultChunker
 from danswer.configs.app_configs import INDEX_BATCH_SIZE
 from danswer.configs.app_configs import QDRANT_DEFAULT_COLLECTION
+from danswer.connectors.github.batch import BatchGithubLoader
 from danswer.connectors.google_drive.batch import BatchGoogleDriveLoader
 from danswer.connectors.slack.batch import BatchSlackLoader
 from danswer.connectors.type_aliases import BatchLoader
@@ -78,9 +79,23 @@ def load_google_drive_batch(qdrant_collection: str):
     )


+def load_github_batch(owner: str, repo: str, qdrant_collection: str):
+    logger.info("Loading documents from Github.")
+    load_batch(
+        BatchGithubLoader(
+            repo_owner=owner, repo_name=repo, batch_size=INDEX_BATCH_SIZE
+        ),
+        DefaultChunker(),
+        DefaultEmbedder(),
+        QdrantDatastore(collection=qdrant_collection),
+    )
+
+
 class BatchLoadingArgs(argparse.Namespace):
-    slack_export_dir: str
     website_url: str
+    github_owner: str
+    github_repo: str
+    slack_export_dir: str
     qdrant_collection: str
     rebuild_index: bool

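load_github_batch mirrors the existing Slack and Drive helpers: it wires the new loader to the default chunker, embedder, and Qdrant datastore. A sketch of driving it directly, using the owner/repo defaults the argparse additions below register and the existing QDRANT_DEFAULT_COLLECTION config value:

# Sketch only: calls the helper added above outside the argparse flow.
from danswer.configs.app_configs import QDRANT_DEFAULT_COLLECTION

load_github_batch(
    owner="danswer-ai",
    repo="danswer",
    qdrant_collection=QDRANT_DEFAULT_COLLECTION,
)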
@@ -91,6 +106,14 @@ if __name__ == "__main__":
         "--website-url",
         default="https://docs.github.com/en/actions",
     )
+    parser.add_argument(
+        "--github-owner",
+        default="danswer-ai",
+    )
+    parser.add_argument(
+        "--github-repo",
+        default="danswer",
+    )
     parser.add_argument(
         "--slack-export-dir",
         default="/Users/chrisweaver/Downloads/test-slack-export",
@@ -109,6 +132,7 @@ if __name__ == "__main__":
     if args.rebuild_index:
         recreate_collection(args.qdrant_collection)

-    # load_slack_batch(args.slack_export_dir, args.qdrant_collection)
+    load_slack_batch(args.slack_export_dir, args.qdrant_collection)
     load_web_batch(args.website_url, args.qdrant_collection)
-    # load_google_drive_batch(args.qdrant_collection)
+    load_google_drive_batch(args.qdrant_collection)
+    load_github_batch(args.github_owner, args.github_repo, args.qdrant_collection)