mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-09-29 05:15:12 +02:00
Improve Web Connector Output, Add Config Options and add OAuth Backend Flow (#199)
This commit is contained in:
@@ -90,6 +90,13 @@ GOOGLE_DRIVE_INCLUDE_SHARED = False
|
|||||||
FILE_CONNECTOR_TMP_STORAGE_PATH = os.environ.get(
|
FILE_CONNECTOR_TMP_STORAGE_PATH = os.environ.get(
|
||||||
"FILE_CONNECTOR_TMP_STORAGE_PATH", "/home/file_connector_storage"
|
"FILE_CONNECTOR_TMP_STORAGE_PATH", "/home/file_connector_storage"
|
||||||
)
|
)
|
||||||
|
WEB_CONNECTOR_IGNORED_CLASSES = os.environ.get("WEB_CONNECTOR_IGNORED_CLASSES",
|
||||||
|
"sidebar,header,footer").split(",")
|
||||||
|
WEB_CONNECTOR_IGNORED_ELEMENTS = os.environ.get("WEB_CONNECTOR_IGNORED_ELEMENTS",
|
||||||
|
"nav,header,footer,meta,script,style,symbol,aside").split(",")
|
||||||
|
WEB_CONNECTOR_OAUTH_CLIENT_ID = os.environ.get("WEB_CONNECTOR_OAUTH_CLIENT_ID", False)
|
||||||
|
WEB_CONNECTOR_OAUTH_CLIENT_SECRET = os.environ.get("WEB_CONNECTOR_OAUTH_CLIENT_SECRET", False)
|
||||||
|
WEB_CONNECTOR_OAUTH_TOKEN_URL = os.environ.get("WEB_CONNECTOR_OAUTH_TOKEN_URL", False)
|
||||||
|
|
||||||
#####
|
#####
|
||||||
# Query Configs
|
# Query Configs
|
||||||
|
@@ -1,22 +1,33 @@
|
|||||||
import io
|
import io
|
||||||
|
import re
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from typing import Any
|
from typing import Any
|
||||||
from typing import cast
|
from typing import cast
|
||||||
|
from typing import Tuple
|
||||||
from urllib.parse import urljoin
|
from urllib.parse import urljoin
|
||||||
from urllib.parse import urlparse
|
from urllib.parse import urlparse
|
||||||
|
|
||||||
|
import bs4
|
||||||
import requests
|
import requests
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
from danswer.configs.app_configs import INDEX_BATCH_SIZE
|
from playwright.sync_api import BrowserContext
|
||||||
|
from playwright.sync_api import Playwright
|
||||||
|
from playwright.sync_api import sync_playwright
|
||||||
|
from PyPDF2 import PdfReader
|
||||||
|
|
||||||
|
from oauthlib.oauth2 import BackendApplicationClient
|
||||||
|
from requests_oauthlib import OAuth2Session
|
||||||
|
|
||||||
|
from danswer.configs.app_configs import INDEX_BATCH_SIZE, WEB_CONNECTOR_OAUTH_CLIENT_ID, \
|
||||||
|
WEB_CONNECTOR_OAUTH_CLIENT_SECRET, WEB_CONNECTOR_OAUTH_TOKEN_URL
|
||||||
|
from danswer.configs.app_configs import WEB_CONNECTOR_IGNORED_CLASSES
|
||||||
|
from danswer.configs.app_configs import WEB_CONNECTOR_IGNORED_ELEMENTS
|
||||||
from danswer.configs.constants import DocumentSource
|
from danswer.configs.constants import DocumentSource
|
||||||
from danswer.configs.constants import HTML_SEPARATOR
|
|
||||||
from danswer.connectors.interfaces import GenerateDocumentsOutput
|
from danswer.connectors.interfaces import GenerateDocumentsOutput
|
||||||
from danswer.connectors.interfaces import LoadConnector
|
from danswer.connectors.interfaces import LoadConnector
|
||||||
from danswer.connectors.models import Document
|
from danswer.connectors.models import Document
|
||||||
from danswer.connectors.models import Section
|
from danswer.connectors.models import Section
|
||||||
from danswer.utils.logger import setup_logger
|
from danswer.utils.logger import setup_logger
|
||||||
from playwright.sync_api import sync_playwright
|
|
||||||
from PyPDF2 import PdfReader
|
|
||||||
|
|
||||||
logger = setup_logger()
|
logger = setup_logger()
|
||||||
|
|
||||||
@@ -50,6 +61,79 @@ def get_internal_links(
|
|||||||
return internal_links
|
return internal_links
|
||||||
|
|
||||||
|
|
||||||
|
def strip_excessive_newlines_and_spaces(document: str) -> str:
    """Normalize whitespace in a flat text document.

    Collapses runs of spaces into a single space, drops spaces sitting
    immediately before a line break, and squashes consecutive
    newline/carriage-return characters into one newline. Leading and
    trailing whitespace is stripped from the result.
    """
    cleanup_steps = (
        (r" +", " "),         # collapse repeated spaces into one
        (r" +[\n\r]", "\n"),  # remove spaces trailing before a line break
        (r"[\n\r]+", "\n"),   # squash repeated newlines into one
    )
    cleaned = document
    for pattern, replacement in cleanup_steps:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()
|
||||||
|
|
||||||
|
|
||||||
|
def strip_newlines(document: str) -> str:
    """Replace every run of newline/carriage-return characters with one space.

    HTML source may contain line breaks that a browser would render as plain
    whitespace, so each run of them is treated as a single word separator.
    """
    return " ".join(re.split(r"[\n\r]+", document))
|
||||||
|
|
||||||
|
|
||||||
|
def format_document(document: BeautifulSoup) -> str:
    """Format html to a flat text document.

    The following goals:
    - Newlines from within the HTML are removed (as browser would ignore them as well).
    - Repeated newlines/spaces are removed (as browsers would ignore them).
    - Newlines only before and after headlines and paragraphs or when explicit (br or pre tag)
    - Table columns/rows are separated by newline
    - List elements are separated by newline and start with a hyphen
    """
    text = ""
    # True right after emitting the "\n- " list-item prefix, so an immediately
    # following <p>/<div> does not add an extra newline inside the list item.
    list_element_start = False
    # Countdown of remaining nodes whose text should be copied verbatim
    # (set when a <pre> tag is seen); while > 0, newlines are preserved.
    verbatim_output = 0
    for e in document.descendants:
        verbatim_output -= 1
        if isinstance(e, bs4.element.NavigableString):
            if isinstance(e, (bs4.element.Comment, bs4.element.Doctype)):
                # Comments and doctype declarations carry no visible text.
                continue
            element_text = e.text
            if element_text:
                if verbatim_output > 0:
                    # Inside a <pre> block: keep whitespace/newlines untouched.
                    text += element_text
                else:
                    text += strip_newlines(element_text)
                list_element_start = False
        elif isinstance(e, bs4.element.Tag):
            if e.name in ["p", "div"]:
                # Paragraph-level break, unless we just opened a list item.
                if not list_element_start:
                    text += "\n"
            elif e.name in ["br", "h1", "h2", "h3", "h4", "tr", "th", "td"]:
                # Explicit breaks, headlines, and table rows/cells each get
                # their own line.
                text += "\n"
                list_element_start = False
            elif e.name == "li":
                text += "\n- "
                list_element_start = True
            elif e.name == "pre":
                if verbatim_output <= 0:
                    # NOTE(review): this counts only direct children of <pre>,
                    # while the loop walks ALL descendants — deeply nested
                    # content inside <pre> may outlast the countdown; confirm
                    # intended behavior for nested markup inside <pre>.
                    verbatim_output = len(list(e.childGenerator()))
    return strip_excessive_newlines_and_spaces(text)
|
||||||
|
|
||||||
|
|
||||||
|
def start_playwright() -> Tuple[Playwright, BrowserContext]:
    """Start Playwright with a headless Chromium browser and return
    the (playwright, context) pair.

    If all three WEB_CONNECTOR_OAUTH_* settings are configured, an OAuth2
    client-credentials token is fetched up front and attached as an
    Authorization header on every request made through the context.
    """
    pw = sync_playwright().start()
    context = pw.chromium.launch(headless=True).new_context()

    oauth_configured = all(
        (
            WEB_CONNECTOR_OAUTH_CLIENT_ID,
            WEB_CONNECTOR_OAUTH_CLIENT_SECRET,
            WEB_CONNECTOR_OAUTH_TOKEN_URL,
        )
    )
    if oauth_configured:
        session = OAuth2Session(
            client=BackendApplicationClient(client_id=WEB_CONNECTOR_OAUTH_CLIENT_ID)
        )
        token = session.fetch_token(
            token_url=WEB_CONNECTOR_OAUTH_TOKEN_URL,
            client_id=WEB_CONNECTOR_OAUTH_CLIENT_ID,
            client_secret=WEB_CONNECTOR_OAUTH_CLIENT_SECRET,
        )
        auth_header = {"Authorization": "Bearer {}".format(token["access_token"])}
        context.set_extra_http_headers(auth_header)

    return pw, context
|
||||||
|
|
||||||
|
|
||||||
class WebConnector(LoadConnector):
|
class WebConnector(LoadConnector):
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
@@ -73,7 +157,8 @@ class WebConnector(LoadConnector):
|
|||||||
to_visit: list[str] = [self.base_url]
|
to_visit: list[str] = [self.base_url]
|
||||||
doc_batch: list[Document] = []
|
doc_batch: list[Document] = []
|
||||||
|
|
||||||
restart_playwright = True
|
playwright, context = start_playwright()
|
||||||
|
restart_playwright = False
|
||||||
while to_visit:
|
while to_visit:
|
||||||
current_url = to_visit.pop()
|
current_url = to_visit.pop()
|
||||||
if current_url in visited_links:
|
if current_url in visited_links:
|
||||||
@@ -86,9 +171,7 @@ class WebConnector(LoadConnector):
|
|||||||
current_visit_time = datetime.now().strftime("%B %d, %Y, %H:%M:%S")
|
current_visit_time = datetime.now().strftime("%B %d, %Y, %H:%M:%S")
|
||||||
|
|
||||||
if restart_playwright:
|
if restart_playwright:
|
||||||
playwright = sync_playwright().start()
|
playwright, context = start_playwright()
|
||||||
browser = playwright.chromium.launch(headless=True)
|
|
||||||
context = browser.new_context()
|
|
||||||
restart_playwright = False
|
restart_playwright = False
|
||||||
|
|
||||||
if current_url.split(".")[-1] == "pdf":
|
if current_url.split(".")[-1] == "pdf":
|
||||||
@@ -133,27 +216,21 @@ class WebConnector(LoadConnector):
|
|||||||
title = None
|
title = None
|
||||||
if title_tag and title_tag.text:
|
if title_tag and title_tag.text:
|
||||||
title = title_tag.text
|
title = title_tag.text
|
||||||
|
title_tag.extract()
|
||||||
|
|
||||||
# Heuristics based cleaning
|
# Heuristics based cleaning of elements based on css classes
|
||||||
for undesired_div in ["sidebar", "header", "footer"]:
|
for undesired_element in WEB_CONNECTOR_IGNORED_CLASSES:
|
||||||
[
|
[
|
||||||
tag.extract()
|
tag.extract()
|
||||||
for tag in soup.find_all(
|
for tag in soup.find_all(
|
||||||
"div", class_=lambda x: x and undesired_div in x.split()
|
class_=lambda x: x and undesired_element in x.split()
|
||||||
)
|
)
|
||||||
]
|
]
|
||||||
|
|
||||||
for undesired_tag in [
|
for undesired_tag in WEB_CONNECTOR_IGNORED_ELEMENTS:
|
||||||
"nav",
|
|
||||||
"header",
|
|
||||||
"footer",
|
|
||||||
"meta",
|
|
||||||
"script",
|
|
||||||
"style",
|
|
||||||
]:
|
|
||||||
[tag.extract() for tag in soup.find_all(undesired_tag)]
|
[tag.extract() for tag in soup.find_all(undesired_tag)]
|
||||||
|
|
||||||
page_text = soup.get_text(HTML_SEPARATOR)
|
page_text = format_document(soup)
|
||||||
|
|
||||||
doc_batch.append(
|
doc_batch.append(
|
||||||
Document(
|
Document(
|
||||||
|
Reference in New Issue
Block a user