Add TODOs and minor style changes to web connector (#254)

This commit is contained in:
Yuhong Sun
2023-07-29 12:35:38 -07:00
committed by GitHub
parent 0d7d54fddb
commit 17e2008027
4 changed files with 36 additions and 22 deletions

View File

@@ -90,13 +90,16 @@ GOOGLE_DRIVE_INCLUDE_SHARED = False
FILE_CONNECTOR_TMP_STORAGE_PATH = os.environ.get(
"FILE_CONNECTOR_TMP_STORAGE_PATH", "/home/file_connector_storage"
)
WEB_CONNECTOR_IGNORED_CLASSES = os.environ.get("WEB_CONNECTOR_IGNORED_CLASSES",
"sidebar,header,footer").split(",")
WEB_CONNECTOR_IGNORED_ELEMENTS = os.environ.get("WEB_CONNECTOR_IGNORED_ELEMENTS",
"nav,header,footer,meta,script,style,symbol,aside").split(",")
WEB_CONNECTOR_OAUTH_CLIENT_ID = os.environ.get("WEB_CONNECTOR_OAUTH_CLIENT_ID", False)
WEB_CONNECTOR_OAUTH_CLIENT_SECRET = os.environ.get("WEB_CONNECTOR_OAUTH_CLIENT_SECRET", False)
WEB_CONNECTOR_OAUTH_TOKEN_URL = os.environ.get("WEB_CONNECTOR_OAUTH_TOKEN_URL", False)
# TODO: these should be configurable from the frontend, via an expandable "advanced options" section
WEB_CONNECTOR_IGNORED_CLASSES = os.environ.get(
"WEB_CONNECTOR_IGNORED_CLASSES", "sidebar,header,footer"
).split(",")
WEB_CONNECTOR_IGNORED_ELEMENTS = os.environ.get(
"WEB_CONNECTOR_IGNORED_ELEMENTS", "nav,header,footer,meta,script,style,symbol,aside"
).split(",")
WEB_CONNECTOR_OAUTH_CLIENT_ID = os.environ.get("WEB_CONNECTOR_OAUTH_CLIENT_ID")
WEB_CONNECTOR_OAUTH_CLIENT_SECRET = os.environ.get("WEB_CONNECTOR_OAUTH_CLIENT_SECRET")
WEB_CONNECTOR_OAUTH_TOKEN_URL = os.environ.get("WEB_CONNECTOR_OAUTH_TOKEN_URL")
#####
# Query Configs

View File

@@ -10,24 +10,24 @@ from urllib.parse import urlparse
import bs4
import requests
from bs4 import BeautifulSoup
from playwright.sync_api import BrowserContext
from playwright.sync_api import Playwright
from playwright.sync_api import sync_playwright
from PyPDF2 import PdfReader
from oauthlib.oauth2 import BackendApplicationClient
from requests_oauthlib import OAuth2Session
from danswer.configs.app_configs import INDEX_BATCH_SIZE, WEB_CONNECTOR_OAUTH_CLIENT_ID, \
WEB_CONNECTOR_OAUTH_CLIENT_SECRET, WEB_CONNECTOR_OAUTH_TOKEN_URL
from danswer.configs.app_configs import INDEX_BATCH_SIZE
from danswer.configs.app_configs import WEB_CONNECTOR_IGNORED_CLASSES
from danswer.configs.app_configs import WEB_CONNECTOR_IGNORED_ELEMENTS
from danswer.configs.app_configs import WEB_CONNECTOR_OAUTH_CLIENT_ID
from danswer.configs.app_configs import WEB_CONNECTOR_OAUTH_CLIENT_SECRET
from danswer.configs.app_configs import WEB_CONNECTOR_OAUTH_TOKEN_URL
from danswer.configs.constants import DocumentSource
from danswer.connectors.interfaces import GenerateDocumentsOutput
from danswer.connectors.interfaces import LoadConnector
from danswer.connectors.models import Document
from danswer.connectors.models import Section
from danswer.utils.logger import setup_logger
from oauthlib.oauth2 import BackendApplicationClient
from playwright.sync_api import BrowserContext
from playwright.sync_api import Playwright
from playwright.sync_api import sync_playwright
from PyPDF2 import PdfReader
from requests_oauthlib import OAuth2Session # type:ignore
logger = setup_logger()
@@ -123,13 +123,21 @@ def start_playwright() -> Tuple[Playwright, BrowserContext]:
context = browser.new_context()
if WEB_CONNECTOR_OAUTH_CLIENT_ID and WEB_CONNECTOR_OAUTH_CLIENT_SECRET and WEB_CONNECTOR_OAUTH_TOKEN_URL:
if (
WEB_CONNECTOR_OAUTH_CLIENT_ID
and WEB_CONNECTOR_OAUTH_CLIENT_SECRET
and WEB_CONNECTOR_OAUTH_TOKEN_URL
):
client = BackendApplicationClient(client_id=WEB_CONNECTOR_OAUTH_CLIENT_ID)
oauth = OAuth2Session(client=client)
token = oauth.fetch_token(token_url=WEB_CONNECTOR_OAUTH_TOKEN_URL,
client_id=WEB_CONNECTOR_OAUTH_CLIENT_ID,
client_secret=WEB_CONNECTOR_OAUTH_CLIENT_SECRET)
context.set_extra_http_headers({"Authorization": "Bearer {}".format(token["access_token"])})
token = oauth.fetch_token(
token_url=WEB_CONNECTOR_OAUTH_TOKEN_URL,
client_id=WEB_CONNECTOR_OAUTH_CLIENT_ID,
client_secret=WEB_CONNECTOR_OAUTH_CLIENT_SECRET,
)
context.set_extra_http_headers(
{"Authorization": "Bearer {}".format(token["access_token"])}
)
return playwright, context

View File

@@ -21,6 +21,7 @@ Mako==1.2.4
nltk==3.8.1
docx2txt==0.8
openai==0.27.6
oauthlib==3.2.2
playwright==1.32.1
psycopg2==2.9.6
psycopg2-binary==2.9.6
@@ -31,6 +32,7 @@ pytest-playwright==0.3.2
python-multipart==0.0.6
qdrant-client==1.2.0
requests==2.31.0
requests-oauthlib==1.3.1
retry==0.9.2
rfc3986==1.5.0
sentence-transformers==2.2.2

View File

@@ -5,6 +5,7 @@ pre-commit==3.2.2
reorder-python-imports==3.9.0
types-beautifulsoup4==4.12.0.3
types-html5lib==1.1.11.13
types-oauthlib==3.2.0.9
types-psycopg2==2.9.21.10
types-python-dateutil==2.8.19.13
types-regex==2023.3.23.1