From 17e2008027654d8cbce52ac84fefb6d964621685 Mon Sep 17 00:00:00 2001
From: Yuhong Sun
Date: Sat, 29 Jul 2023 12:35:38 -0700
Subject: [PATCH] Add TODOs and minor style changes to web connector (#254)

---
 backend/danswer/configs/app_configs.py      | 17 +++++----
 backend/danswer/connectors/web/connector.py | 38 +++++++++++++--------
 backend/requirements/default.txt            |  2 ++
 backend/requirements/dev.txt                |  1 +
 4 files changed, 36 insertions(+), 22 deletions(-)

diff --git a/backend/danswer/configs/app_configs.py b/backend/danswer/configs/app_configs.py
index 8a1c6dd127e4..d099aa2cf8ae 100644
--- a/backend/danswer/configs/app_configs.py
+++ b/backend/danswer/configs/app_configs.py
@@ -90,13 +90,16 @@ GOOGLE_DRIVE_INCLUDE_SHARED = False
 FILE_CONNECTOR_TMP_STORAGE_PATH = os.environ.get(
     "FILE_CONNECTOR_TMP_STORAGE_PATH", "/home/file_connector_storage"
 )
-WEB_CONNECTOR_IGNORED_CLASSES = os.environ.get("WEB_CONNECTOR_IGNORED_CLASSES",
-                                               "sidebar,header,footer").split(",")
-WEB_CONNECTOR_IGNORED_ELEMENTS = os.environ.get("WEB_CONNECTOR_IGNORED_ELEMENTS",
-                                                "nav,header,footer,meta,script,style,symbol,aside").split(",")
-WEB_CONNECTOR_OAUTH_CLIENT_ID = os.environ.get("WEB_CONNECTOR_OAUTH_CLIENT_ID", False)
-WEB_CONNECTOR_OAUTH_CLIENT_SECRET = os.environ.get("WEB_CONNECTOR_OAUTH_CLIENT_SECRET", False)
-WEB_CONNECTOR_OAUTH_TOKEN_URL = os.environ.get("WEB_CONNECTOR_OAUTH_TOKEN_URL", False)
+# TODO these should be available for frontend configuration, via advanced options expandable
+WEB_CONNECTOR_IGNORED_CLASSES = os.environ.get(
+    "WEB_CONNECTOR_IGNORED_CLASSES", "sidebar,header,footer"
+).split(",")
+WEB_CONNECTOR_IGNORED_ELEMENTS = os.environ.get(
+    "WEB_CONNECTOR_IGNORED_ELEMENTS", "nav,header,footer,meta,script,style,symbol,aside"
+).split(",")
+WEB_CONNECTOR_OAUTH_CLIENT_ID = os.environ.get("WEB_CONNECTOR_OAUTH_CLIENT_ID")
+WEB_CONNECTOR_OAUTH_CLIENT_SECRET = os.environ.get("WEB_CONNECTOR_OAUTH_CLIENT_SECRET")
+WEB_CONNECTOR_OAUTH_TOKEN_URL = os.environ.get("WEB_CONNECTOR_OAUTH_TOKEN_URL")
 
 #####
 # Query Configs
diff --git a/backend/danswer/connectors/web/connector.py b/backend/danswer/connectors/web/connector.py
index 37a8c80d4a15..a335d7f6d4aa 100644
--- a/backend/danswer/connectors/web/connector.py
+++ b/backend/danswer/connectors/web/connector.py
@@ -10,24 +10,24 @@ from urllib.parse import urlparse
 import bs4
 import requests
 from bs4 import BeautifulSoup
-from playwright.sync_api import BrowserContext
-from playwright.sync_api import Playwright
-from playwright.sync_api import sync_playwright
-from PyPDF2 import PdfReader
-
-from oauthlib.oauth2 import BackendApplicationClient
-from requests_oauthlib import OAuth2Session
-
-from danswer.configs.app_configs import INDEX_BATCH_SIZE, WEB_CONNECTOR_OAUTH_CLIENT_ID, \
-    WEB_CONNECTOR_OAUTH_CLIENT_SECRET, WEB_CONNECTOR_OAUTH_TOKEN_URL
+from danswer.configs.app_configs import INDEX_BATCH_SIZE
 from danswer.configs.app_configs import WEB_CONNECTOR_IGNORED_CLASSES
 from danswer.configs.app_configs import WEB_CONNECTOR_IGNORED_ELEMENTS
+from danswer.configs.app_configs import WEB_CONNECTOR_OAUTH_CLIENT_ID
+from danswer.configs.app_configs import WEB_CONNECTOR_OAUTH_CLIENT_SECRET
+from danswer.configs.app_configs import WEB_CONNECTOR_OAUTH_TOKEN_URL
 from danswer.configs.constants import DocumentSource
 from danswer.connectors.interfaces import GenerateDocumentsOutput
 from danswer.connectors.interfaces import LoadConnector
 from danswer.connectors.models import Document
 from danswer.connectors.models import Section
 from danswer.utils.logger import setup_logger
+from oauthlib.oauth2 import BackendApplicationClient
+from playwright.sync_api import BrowserContext
+from playwright.sync_api import Playwright
+from playwright.sync_api import sync_playwright
+from PyPDF2 import PdfReader
+from requests_oauthlib import OAuth2Session  # type:ignore
 
 logger = setup_logger()
 
@@ -123,13 +123,21 @@ def start_playwright() -> Tuple[Playwright, BrowserContext]:
 
     context = browser.new_context()
 
-    if WEB_CONNECTOR_OAUTH_CLIENT_ID and WEB_CONNECTOR_OAUTH_CLIENT_SECRET and WEB_CONNECTOR_OAUTH_TOKEN_URL:
+    if (
+        WEB_CONNECTOR_OAUTH_CLIENT_ID
+        and WEB_CONNECTOR_OAUTH_CLIENT_SECRET
+        and WEB_CONNECTOR_OAUTH_TOKEN_URL
+    ):
         client = BackendApplicationClient(client_id=WEB_CONNECTOR_OAUTH_CLIENT_ID)
         oauth = OAuth2Session(client=client)
-        token = oauth.fetch_token(token_url=WEB_CONNECTOR_OAUTH_TOKEN_URL,
-                                  client_id=WEB_CONNECTOR_OAUTH_CLIENT_ID,
-                                  client_secret=WEB_CONNECTOR_OAUTH_CLIENT_SECRET)
-        context.set_extra_http_headers({"Authorization": "Bearer {}".format(token["access_token"])})
+        token = oauth.fetch_token(
+            token_url=WEB_CONNECTOR_OAUTH_TOKEN_URL,
+            client_id=WEB_CONNECTOR_OAUTH_CLIENT_ID,
+            client_secret=WEB_CONNECTOR_OAUTH_CLIENT_SECRET,
+        )
+        context.set_extra_http_headers(
+            {"Authorization": "Bearer {}".format(token["access_token"])}
+        )
 
     return playwright, context
 
diff --git a/backend/requirements/default.txt b/backend/requirements/default.txt
index 5e9add21e84d..80cf7f52caef 100644
--- a/backend/requirements/default.txt
+++ b/backend/requirements/default.txt
@@ -21,6 +21,7 @@ Mako==1.2.4
 nltk==3.8.1
 docx2txt==0.8
 openai==0.27.6
+oauthlib==3.2.2
 playwright==1.32.1
 psycopg2==2.9.6
 psycopg2-binary==2.9.6
@@ -31,6 +32,7 @@ pytest-playwright==0.3.2
 python-multipart==0.0.6
 qdrant-client==1.2.0
 requests==2.31.0
+requests-oauthlib==1.3.1
 retry==0.9.2
 rfc3986==1.5.0
 sentence-transformers==2.2.2
diff --git a/backend/requirements/dev.txt b/backend/requirements/dev.txt
index e1e67f1ebc2b..f2a87ef85b31 100644
--- a/backend/requirements/dev.txt
+++ b/backend/requirements/dev.txt
@@ -5,6 +5,7 @@ pre-commit==3.2.2
 reorder-python-imports==3.9.0
 types-beautifulsoup4==4.12.0.3
 types-html5lib==1.1.11.13
+types-oauthlib==3.2.0.9
 types-psycopg2==2.9.21.10
 types-python-dateutil==2.8.19.13
 types-regex==2023.3.23.1