Improve Web Connector Output, Add Config Options, and Add OAuth Backend Flow (#199)

Author: jabdoa2 (committed by GitHub)
Date: 2023-07-29 21:21:23 +02:00
Parent: b6b549357f
Commit: 0d7d54fddb
2 changed files with 104 additions and 20 deletions

View File

@@ -90,6 +90,13 @@ GOOGLE_DRIVE_INCLUDE_SHARED = False
 FILE_CONNECTOR_TMP_STORAGE_PATH = os.environ.get(
     "FILE_CONNECTOR_TMP_STORAGE_PATH", "/home/file_connector_storage"
 )
+WEB_CONNECTOR_IGNORED_CLASSES = os.environ.get(
+    "WEB_CONNECTOR_IGNORED_CLASSES", "sidebar,header,footer"
+).split(",")
+WEB_CONNECTOR_IGNORED_ELEMENTS = os.environ.get(
+    "WEB_CONNECTOR_IGNORED_ELEMENTS", "nav,header,footer,meta,script,style,symbol,aside"
+).split(",")
+WEB_CONNECTOR_OAUTH_CLIENT_ID = os.environ.get("WEB_CONNECTOR_OAUTH_CLIENT_ID", False)
+WEB_CONNECTOR_OAUTH_CLIENT_SECRET = os.environ.get("WEB_CONNECTOR_OAUTH_CLIENT_SECRET", False)
+WEB_CONNECTOR_OAUTH_TOKEN_URL = os.environ.get("WEB_CONNECTOR_OAUTH_TOKEN_URL", False)

 #####
 # Query Configs
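Both list-valued options are parsed with a plain comma split, so overrides are supplied as comma-separated strings. A minimal sketch of the parsing behavior (the override value is illustrative, not part of this commit):

import os

# e.g. exported in the deployment environment (hypothetical value):
#   WEB_CONNECTOR_IGNORED_CLASSES=sidebar,header,footer,cookie-banner
ignored_classes = os.environ.get(
    "WEB_CONNECTOR_IGNORED_CLASSES", "sidebar,header,footer"
).split(",")
print(ignored_classes)
# with the override: ['sidebar', 'header', 'footer', 'cookie-banner']
# without it:        ['sidebar', 'header', 'footer']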

View File

@@ -1,22 +1,33 @@
 import io
+import re
 from datetime import datetime
 from typing import Any
 from typing import cast
+from typing import Tuple
 from urllib.parse import urljoin
 from urllib.parse import urlparse

+import bs4
 import requests
 from bs4 import BeautifulSoup
+from oauthlib.oauth2 import BackendApplicationClient
+from playwright.sync_api import BrowserContext
+from playwright.sync_api import Playwright
 from playwright.sync_api import sync_playwright
 from PyPDF2 import PdfReader
+from requests_oauthlib import OAuth2Session

 from danswer.configs.app_configs import INDEX_BATCH_SIZE
+from danswer.configs.app_configs import WEB_CONNECTOR_IGNORED_CLASSES
+from danswer.configs.app_configs import WEB_CONNECTOR_IGNORED_ELEMENTS
+from danswer.configs.app_configs import WEB_CONNECTOR_OAUTH_CLIENT_ID
+from danswer.configs.app_configs import WEB_CONNECTOR_OAUTH_CLIENT_SECRET
+from danswer.configs.app_configs import WEB_CONNECTOR_OAUTH_TOKEN_URL
 from danswer.configs.constants import DocumentSource
 from danswer.configs.constants import HTML_SEPARATOR
 from danswer.connectors.interfaces import GenerateDocumentsOutput
 from danswer.connectors.interfaces import LoadConnector
 from danswer.connectors.models import Document
 from danswer.connectors.models import Section
 from danswer.utils.logger import setup_logger

 logger = setup_logger()
@@ -50,6 +61,79 @@ def get_internal_links(
     return internal_links


+def strip_excessive_newlines_and_spaces(document: str) -> str:
+    # collapse repeated spaces into one
+    document = re.sub(r" +", " ", document)
+    # remove trailing spaces before a line break
+    document = re.sub(r" +[\n\r]", "\n", document)
+    # collapse repeated newlines into one
+    document = re.sub(r"[\n\r]+", "\n", document)
+    return document.strip()
+
+
+def strip_newlines(document: str) -> str:
+    # HTML might contain newlines which are just whitespace to a browser
+    return re.sub(r"[\n\r]+", " ", document)
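As a quick illustration of the two helpers (the input and outputs here are mine, assuming the functions above are in scope):

raw = "Hello\nworld,  this is\n\n\none   document.\n"
strip_newlines(raw)
# -> 'Hello world,  this is one   document. '
strip_excessive_newlines_and_spaces(raw)
# -> 'Hello\nworld, this is\none document.'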
+def format_document(document: BeautifulSoup) -> str:
+    """Format HTML into a flat text document.
+
+    Goals:
+    - Newlines from within the HTML are removed (as a browser would ignore them).
+    - Repeated newlines/spaces are removed (as a browser would collapse them).
+    - Newlines appear only before and after headlines and paragraphs, or when explicit (br or pre tag).
+    - Table columns/rows are separated by newlines.
+    - List elements are separated by newlines and start with a hyphen.
+    """
+    text = ""
+    list_element_start = False
+    verbatim_output = 0
+    for e in document.descendants:
+        verbatim_output -= 1
+        if isinstance(e, bs4.element.NavigableString):
+            if isinstance(e, (bs4.element.Comment, bs4.element.Doctype)):
+                continue
+            element_text = e.text
+            if element_text:
+                if verbatim_output > 0:
+                    text += element_text
+                else:
+                    text += strip_newlines(element_text)
+                list_element_start = False
+        elif isinstance(e, bs4.element.Tag):
+            if e.name in ["p", "div"]:
+                if not list_element_start:
+                    text += "\n"
+            elif e.name in ["br", "h1", "h2", "h3", "h4", "tr", "th", "td"]:
+                text += "\n"
+                list_element_start = False
+            elif e.name == "li":
+                text += "\n- "
+                list_element_start = True
+            elif e.name == "pre":
+                if verbatim_output <= 0:
+                    verbatim_output = len(list(e.childGenerator()))
+    return strip_excessive_newlines_and_spaces(text)
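To illustrate the flattening rules, a small sketch (the HTML snippet is made up; assumes BeautifulSoup and format_document from above):

html = "<h1>Title</h1><p>Some  text\nacross two lines</p><ul><li>First</li><li>Second</li></ul>"
soup = BeautifulSoup(html, "html.parser")
print(format_document(soup))
# Title
# Some text across two lines
# - First
# - Second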
+def start_playwright() -> Tuple[Playwright, BrowserContext]:
+    playwright = sync_playwright().start()
+    browser = playwright.chromium.launch(headless=True)
+    context = browser.new_context()
+
+    if (
+        WEB_CONNECTOR_OAUTH_CLIENT_ID
+        and WEB_CONNECTOR_OAUTH_CLIENT_SECRET
+        and WEB_CONNECTOR_OAUTH_TOKEN_URL
+    ):
+        client = BackendApplicationClient(client_id=WEB_CONNECTOR_OAUTH_CLIENT_ID)
+        oauth = OAuth2Session(client=client)
+        token = oauth.fetch_token(
+            token_url=WEB_CONNECTOR_OAUTH_TOKEN_URL,
+            client_id=WEB_CONNECTOR_OAUTH_CLIENT_ID,
+            client_secret=WEB_CONNECTOR_OAUTH_CLIENT_SECRET,
+        )
+        context.set_extra_http_headers(
+            {"Authorization": f"Bearer {token['access_token']}"}
+        )
+
+    return playwright, context
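The OAuth branch is the standard client-credentials ("backend application") flow: fetch a token once, then attach it as a Bearer header on every request the browser context makes. Roughly the equivalent raw exchange (token URL and credentials are placeholders, not values from this commit):

import requests

resp = requests.post(
    "https://auth.example.com/oauth/token",  # placeholder for WEB_CONNECTOR_OAUTH_TOKEN_URL
    data={"grant_type": "client_credentials"},
    auth=("client-id", "client-secret"),  # placeholder credentials
)
access_token = resp.json()["access_token"]
# start_playwright() then sets: {"Authorization": f"Bearer {access_token}"}

Note that the token is fetched once per browser context and never refreshed, so a long crawl could outlive a short-lived token.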
 class WebConnector(LoadConnector):
     def __init__(
         self,
@@ -73,7 +157,8 @@ class WebConnector(LoadConnector):
         to_visit: list[str] = [self.base_url]
         doc_batch: list[Document] = []
-        restart_playwright = True
+        playwright, context = start_playwright()
+        restart_playwright = False

         while to_visit:
             current_url = to_visit.pop()
             if current_url in visited_links:
@@ -86,9 +171,7 @@ class WebConnector(LoadConnector):
             current_visit_time = datetime.now().strftime("%B %d, %Y, %H:%M:%S")

             if restart_playwright:
-                playwright = sync_playwright().start()
-                browser = playwright.chromium.launch(headless=True)
-                context = browser.new_context()
+                playwright, context = start_playwright()
                 restart_playwright = False

             if current_url.split(".")[-1] == "pdf":
@@ -133,27 +216,21 @@ class WebConnector(LoadConnector):
             title = None
             if title_tag and title_tag.text:
                 title = title_tag.text
                 title_tag.extract()

-            # Heuristics based cleaning
-            for undesired_div in ["sidebar", "header", "footer"]:
+            # Heuristics-based cleaning of elements by their CSS classes
+            for undesired_element in WEB_CONNECTOR_IGNORED_CLASSES:
                 [
                     tag.extract()
                     for tag in soup.find_all(
-                        "div", class_=lambda x: x and undesired_div in x.split()
+                        class_=lambda x: x and undesired_element in x.split()
                     )
                 ]

-            for undesired_tag in [
-                "nav",
-                "header",
-                "footer",
-                "meta",
-                "script",
-                "style",
-            ]:
+            for undesired_tag in WEB_CONNECTOR_IGNORED_ELEMENTS:
                 [tag.extract() for tag in soup.find_all(undesired_tag)]
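The class_ callable matches only when a whole class token equals a configured name, so "sidebar" does not remove class="sidebar-like". A small sketch (the HTML is illustrative):

from bs4 import BeautifulSoup

doc = BeautifulSoup(
    '<div class="main sidebar">gone</div><div class="sidebar-like">kept</div>',
    "html.parser",
)
[tag.extract() for tag in doc.find_all(class_=lambda x: x and "sidebar" in x.split())]
print(doc)  # <div class="sidebar-like">kept</div>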
-            page_text = soup.get_text(HTML_SEPARATOR)
+            page_text = format_document(soup)

             doc_batch.append(
                 Document(