mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-09-19 12:03:54 +02:00
Add TODOs and minor style changes to web connector (#254)
This commit is contained in:
@@ -90,13 +90,16 @@ GOOGLE_DRIVE_INCLUDE_SHARED = False
|
||||
FILE_CONNECTOR_TMP_STORAGE_PATH = os.environ.get(
|
||||
"FILE_CONNECTOR_TMP_STORAGE_PATH", "/home/file_connector_storage"
|
||||
)
|
||||
WEB_CONNECTOR_IGNORED_CLASSES = os.environ.get("WEB_CONNECTOR_IGNORED_CLASSES",
|
||||
"sidebar,header,footer").split(",")
|
||||
WEB_CONNECTOR_IGNORED_ELEMENTS = os.environ.get("WEB_CONNECTOR_IGNORED_ELEMENTS",
|
||||
"nav,header,footer,meta,script,style,symbol,aside").split(",")
|
||||
WEB_CONNECTOR_OAUTH_CLIENT_ID = os.environ.get("WEB_CONNECTOR_OAUTH_CLIENT_ID", False)
|
||||
WEB_CONNECTOR_OAUTH_CLIENT_SECRET = os.environ.get("WEB_CONNECTOR_OAUTH_CLIENT_SECRET", False)
|
||||
WEB_CONNECTOR_OAUTH_TOKEN_URL = os.environ.get("WEB_CONNECTOR_OAUTH_TOKEN_URL", False)
|
||||
# TODO: these should be configurable from the frontend, via an expandable "advanced options" section
|
||||
WEB_CONNECTOR_IGNORED_CLASSES = os.environ.get(
|
||||
"WEB_CONNECTOR_IGNORED_CLASSES", "sidebar,header,footer"
|
||||
).split(",")
|
||||
WEB_CONNECTOR_IGNORED_ELEMENTS = os.environ.get(
|
||||
"WEB_CONNECTOR_IGNORED_ELEMENTS", "nav,header,footer,meta,script,style,symbol,aside"
|
||||
).split(",")
|
||||
WEB_CONNECTOR_OAUTH_CLIENT_ID = os.environ.get("WEB_CONNECTOR_OAUTH_CLIENT_ID")
|
||||
WEB_CONNECTOR_OAUTH_CLIENT_SECRET = os.environ.get("WEB_CONNECTOR_OAUTH_CLIENT_SECRET")
|
||||
WEB_CONNECTOR_OAUTH_TOKEN_URL = os.environ.get("WEB_CONNECTOR_OAUTH_TOKEN_URL")
|
||||
|
||||
#####
|
||||
# Query Configs
|
||||
|
@@ -10,24 +10,24 @@ from urllib.parse import urlparse
|
||||
import bs4
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
from playwright.sync_api import BrowserContext
|
||||
from playwright.sync_api import Playwright
|
||||
from playwright.sync_api import sync_playwright
|
||||
from PyPDF2 import PdfReader
|
||||
|
||||
from oauthlib.oauth2 import BackendApplicationClient
|
||||
from requests_oauthlib import OAuth2Session
|
||||
|
||||
from danswer.configs.app_configs import INDEX_BATCH_SIZE, WEB_CONNECTOR_OAUTH_CLIENT_ID, \
|
||||
WEB_CONNECTOR_OAUTH_CLIENT_SECRET, WEB_CONNECTOR_OAUTH_TOKEN_URL
|
||||
from danswer.configs.app_configs import INDEX_BATCH_SIZE
|
||||
from danswer.configs.app_configs import WEB_CONNECTOR_IGNORED_CLASSES
|
||||
from danswer.configs.app_configs import WEB_CONNECTOR_IGNORED_ELEMENTS
|
||||
from danswer.configs.app_configs import WEB_CONNECTOR_OAUTH_CLIENT_ID
|
||||
from danswer.configs.app_configs import WEB_CONNECTOR_OAUTH_CLIENT_SECRET
|
||||
from danswer.configs.app_configs import WEB_CONNECTOR_OAUTH_TOKEN_URL
|
||||
from danswer.configs.constants import DocumentSource
|
||||
from danswer.connectors.interfaces import GenerateDocumentsOutput
|
||||
from danswer.connectors.interfaces import LoadConnector
|
||||
from danswer.connectors.models import Document
|
||||
from danswer.connectors.models import Section
|
||||
from danswer.utils.logger import setup_logger
|
||||
from oauthlib.oauth2 import BackendApplicationClient
|
||||
from playwright.sync_api import BrowserContext
|
||||
from playwright.sync_api import Playwright
|
||||
from playwright.sync_api import sync_playwright
|
||||
from PyPDF2 import PdfReader
|
||||
from requests_oauthlib import OAuth2Session # type:ignore
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
@@ -123,13 +123,21 @@ def start_playwright() -> Tuple[Playwright, BrowserContext]:
|
||||
|
||||
context = browser.new_context()
|
||||
|
||||
if WEB_CONNECTOR_OAUTH_CLIENT_ID and WEB_CONNECTOR_OAUTH_CLIENT_SECRET and WEB_CONNECTOR_OAUTH_TOKEN_URL:
|
||||
if (
|
||||
WEB_CONNECTOR_OAUTH_CLIENT_ID
|
||||
and WEB_CONNECTOR_OAUTH_CLIENT_SECRET
|
||||
and WEB_CONNECTOR_OAUTH_TOKEN_URL
|
||||
):
|
||||
client = BackendApplicationClient(client_id=WEB_CONNECTOR_OAUTH_CLIENT_ID)
|
||||
oauth = OAuth2Session(client=client)
|
||||
token = oauth.fetch_token(token_url=WEB_CONNECTOR_OAUTH_TOKEN_URL,
|
||||
client_id=WEB_CONNECTOR_OAUTH_CLIENT_ID,
|
||||
client_secret=WEB_CONNECTOR_OAUTH_CLIENT_SECRET)
|
||||
context.set_extra_http_headers({"Authorization": "Bearer {}".format(token["access_token"])})
|
||||
token = oauth.fetch_token(
|
||||
token_url=WEB_CONNECTOR_OAUTH_TOKEN_URL,
|
||||
client_id=WEB_CONNECTOR_OAUTH_CLIENT_ID,
|
||||
client_secret=WEB_CONNECTOR_OAUTH_CLIENT_SECRET,
|
||||
)
|
||||
context.set_extra_http_headers(
|
||||
{"Authorization": "Bearer {}".format(token["access_token"])}
|
||||
)
|
||||
|
||||
return playwright, context
|
||||
|
||||
|
@@ -21,6 +21,7 @@ Mako==1.2.4
|
||||
nltk==3.8.1
|
||||
docx2txt==0.8
|
||||
openai==0.27.6
|
||||
oauthlib==3.2.2
|
||||
playwright==1.32.1
|
||||
psycopg2==2.9.6
|
||||
psycopg2-binary==2.9.6
|
||||
@@ -31,6 +32,7 @@ pytest-playwright==0.3.2
|
||||
python-multipart==0.0.6
|
||||
qdrant-client==1.2.0
|
||||
requests==2.31.0
|
||||
requests-oauthlib==1.3.1
|
||||
retry==0.9.2
|
||||
rfc3986==1.5.0
|
||||
sentence-transformers==2.2.2
|
||||
|
@@ -5,6 +5,7 @@ pre-commit==3.2.2
|
||||
reorder-python-imports==3.9.0
|
||||
types-beautifulsoup4==4.12.0.3
|
||||
types-html5lib==1.1.11.13
|
||||
types-oauthlib==3.2.0.9
|
||||
types-psycopg2==2.9.21.10
|
||||
types-python-dateutil==2.8.19.13
|
||||
types-regex==2023.3.23.1
|
||||
|
Reference in New Issue
Block a user