mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-09-29 05:15:12 +02:00
Improve Web Connector Output, Add Config Options and add OAuth Backend Flow (#199)
This commit is contained in:
@@ -90,6 +90,13 @@ GOOGLE_DRIVE_INCLUDE_SHARED = False
|
|||||||
FILE_CONNECTOR_TMP_STORAGE_PATH = os.environ.get(
|
FILE_CONNECTOR_TMP_STORAGE_PATH = os.environ.get(
|
||||||
"FILE_CONNECTOR_TMP_STORAGE_PATH", "/home/file_connector_storage"
|
"FILE_CONNECTOR_TMP_STORAGE_PATH", "/home/file_connector_storage"
|
||||||
)
|
)
|
||||||
|
WEB_CONNECTOR_IGNORED_CLASSES = os.environ.get("WEB_CONNECTOR_IGNORED_CLASSES",
|
||||||
|
"sidebar,header,footer").split(",")
|
||||||
|
WEB_CONNECTOR_IGNORED_ELEMENTS = os.environ.get("WEB_CONNECTOR_IGNORED_ELEMENTS",
|
||||||
|
"nav,header,footer,meta,script,style,symbol,aside").split(",")
|
||||||
|
WEB_CONNECTOR_OAUTH_CLIENT_ID = os.environ.get("WEB_CONNECTOR_OAUTH_CLIENT_ID", False)
|
||||||
|
WEB_CONNECTOR_OAUTH_CLIENT_SECRET = os.environ.get("WEB_CONNECTOR_OAUTH_CLIENT_SECRET", False)
|
||||||
|
WEB_CONNECTOR_OAUTH_TOKEN_URL = os.environ.get("WEB_CONNECTOR_OAUTH_TOKEN_URL", False)
|
||||||
|
|
||||||
#####
|
#####
|
||||||
# Query Configs
|
# Query Configs
|
||||||
|
@@ -1,22 +1,33 @@
|
|||||||
import io
|
import io
|
||||||
|
import re
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from typing import Any
|
from typing import Any
|
||||||
from typing import cast
|
from typing import cast
|
||||||
|
from typing import Tuple
|
||||||
from urllib.parse import urljoin
|
from urllib.parse import urljoin
|
||||||
from urllib.parse import urlparse
|
from urllib.parse import urlparse
|
||||||
|
|
||||||
|
import bs4
|
||||||
import requests
|
import requests
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
from danswer.configs.app_configs import INDEX_BATCH_SIZE
|
from playwright.sync_api import BrowserContext
|
||||||
|
from playwright.sync_api import Playwright
|
||||||
|
from playwright.sync_api import sync_playwright
|
||||||
|
from PyPDF2 import PdfReader
|
||||||
|
|
||||||
|
from oauthlib.oauth2 import BackendApplicationClient
|
||||||
|
from requests_oauthlib import OAuth2Session
|
||||||
|
|
||||||
|
from danswer.configs.app_configs import INDEX_BATCH_SIZE, WEB_CONNECTOR_OAUTH_CLIENT_ID, \
|
||||||
|
WEB_CONNECTOR_OAUTH_CLIENT_SECRET, WEB_CONNECTOR_OAUTH_TOKEN_URL
|
||||||
|
from danswer.configs.app_configs import WEB_CONNECTOR_IGNORED_CLASSES
|
||||||
|
from danswer.configs.app_configs import WEB_CONNECTOR_IGNORED_ELEMENTS
|
||||||
from danswer.configs.constants import DocumentSource
|
from danswer.configs.constants import DocumentSource
|
||||||
from danswer.configs.constants import HTML_SEPARATOR
|
|
||||||
from danswer.connectors.interfaces import GenerateDocumentsOutput
|
from danswer.connectors.interfaces import GenerateDocumentsOutput
|
||||||
from danswer.connectors.interfaces import LoadConnector
|
from danswer.connectors.interfaces import LoadConnector
|
||||||
from danswer.connectors.models import Document
|
from danswer.connectors.models import Document
|
||||||
from danswer.connectors.models import Section
|
from danswer.connectors.models import Section
|
||||||
from danswer.utils.logger import setup_logger
|
from danswer.utils.logger import setup_logger
|
||||||
from playwright.sync_api import sync_playwright
|
|
||||||
from PyPDF2 import PdfReader
|
|
||||||
|
|
||||||
logger = setup_logger()
|
logger = setup_logger()
|
||||||
|
|
||||||
@@ -50,6 +61,79 @@ def get_internal_links(
|
|||||||
return internal_links
|
return internal_links
|
||||||
|
|
||||||
|
|
||||||
|
def strip_excessive_newlines_and_spaces(document: str) -> str:
    """Normalize whitespace in a flat text document.

    Collapses runs of spaces into a single space, drops spaces sitting
    immediately before a line break, and squashes consecutive
    newline/carriage-return characters into one newline. Leading and
    trailing whitespace is stripped from the result.
    """
    cleanup_steps = (
        (r" +", " "),         # collapse repeated spaces into one
        (r" +[\n\r]", "\n"),  # remove spaces trailing before a line break
        (r"[\n\r]+", "\n"),   # squash repeated newlines into one
    )
    cleaned = document
    for pattern, replacement in cleanup_steps:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()
|
||||||
|
|
||||||
|
|
||||||
|
def strip_newlines(document: str) -> str:
    """Replace every run of newline/carriage-return characters with one space.

    HTML source may contain line breaks that a browser would render as plain
    whitespace, so each run of them is treated as a single word separator.
    """
    return " ".join(re.split(r"[\n\r]+", document))
|
||||||
|
|
||||||
|
|
||||||
|
def format_document(document: BeautifulSoup) -> str:
    """Format html to a flat text document.

    The following goals:
    - Newlines from within the HTML are removed (as browser would ignore them as well).
    - Repeated newlines/spaces are removed (as browsers would ignore them).
    - Newlines only before and after headlines and paragraphs or when explicit (br or pre tag)
    - Table columns/rows are separated by newline
    - List elements are separated by newline and start with a hyphen
    """
    text = ""
    # True right after emitting the "\n- " list-item prefix, so an immediately
    # following <p>/<div> does not add an extra newline inside the list item.
    list_element_start = False
    # Countdown of remaining nodes whose text should be copied verbatim
    # (set when a <pre> tag is seen); while > 0, newlines are preserved.
    verbatim_output = 0
    for e in document.descendants:
        verbatim_output -= 1
        if isinstance(e, bs4.element.NavigableString):
            if isinstance(e, (bs4.element.Comment, bs4.element.Doctype)):
                # Comments and doctype declarations carry no visible text.
                continue
            element_text = e.text
            if element_text:
                if verbatim_output > 0:
                    # Inside a <pre> block: keep whitespace/newlines untouched.
                    text += element_text
                else:
                    text += strip_newlines(element_text)
                list_element_start = False
        elif isinstance(e, bs4.element.Tag):
            if e.name in ["p", "div"]:
                # Paragraph-level break, unless we just opened a list item.
                if not list_element_start:
                    text += "\n"
            elif e.name in ["br", "h1", "h2", "h3", "h4", "tr", "th", "td"]:
                # Explicit breaks, headlines, and table rows/cells each get
                # their own line.
                text += "\n"
                list_element_start = False
            elif e.name == "li":
                text += "\n- "
                list_element_start = True
            elif e.name == "pre":
                if verbatim_output <= 0:
                    # NOTE(review): this counts only direct children of <pre>,
                    # while the loop walks ALL descendants — deeply nested
                    # content inside <pre> may outlast the countdown; confirm
                    # intended behavior for nested markup inside <pre>.
                    verbatim_output = len(list(e.childGenerator()))
    return strip_excessive_newlines_and_spaces(text)
|
||||||
|
|
||||||
|
|
||||||
|
def start_playwright() -> Tuple[Playwright, BrowserContext]:
    """Start Playwright with a headless Chromium browser and return
    the (playwright, context) pair.

    If all three WEB_CONNECTOR_OAUTH_* settings are configured, an OAuth2
    client-credentials token is fetched up front and attached as an
    Authorization header on every request made through the context.
    """
    pw = sync_playwright().start()
    context = pw.chromium.launch(headless=True).new_context()

    oauth_configured = all(
        (
            WEB_CONNECTOR_OAUTH_CLIENT_ID,
            WEB_CONNECTOR_OAUTH_CLIENT_SECRET,
            WEB_CONNECTOR_OAUTH_TOKEN_URL,
        )
    )
    if oauth_configured:
        session = OAuth2Session(
            client=BackendApplicationClient(client_id=WEB_CONNECTOR_OAUTH_CLIENT_ID)
        )
        token = session.fetch_token(
            token_url=WEB_CONNECTOR_OAUTH_TOKEN_URL,
            client_id=WEB_CONNECTOR_OAUTH_CLIENT_ID,
            client_secret=WEB_CONNECTOR_OAUTH_CLIENT_SECRET,
        )
        auth_header = {"Authorization": "Bearer {}".format(token["access_token"])}
        context.set_extra_http_headers(auth_header)

    return pw, context
|
||||||
|
|
||||||
|
|
||||||
class WebConnector(LoadConnector):
|
class WebConnector(LoadConnector):
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
@@ -73,7 +157,8 @@ class WebConnector(LoadConnector):
|
|||||||
to_visit: list[str] = [self.base_url]
|
to_visit: list[str] = [self.base_url]
|
||||||
doc_batch: list[Document] = []
|
doc_batch: list[Document] = []
|
||||||
|
|
||||||
restart_playwright = True
|
playwright, context = start_playwright()
|
||||||
|
restart_playwright = False
|
||||||
while to_visit:
|
while to_visit:
|
||||||
current_url = to_visit.pop()
|
current_url = to_visit.pop()
|
||||||
if current_url in visited_links:
|
if current_url in visited_links:
|
||||||
@@ -86,9 +171,7 @@ class WebConnector(LoadConnector):
|
|||||||
current_visit_time = datetime.now().strftime("%B %d, %Y, %H:%M:%S")
|
current_visit_time = datetime.now().strftime("%B %d, %Y, %H:%M:%S")
|
||||||
|
|
||||||
if restart_playwright:
|
if restart_playwright:
|
||||||
playwright = sync_playwright().start()
|
playwright, context = start_playwright()
|
||||||
browser = playwright.chromium.launch(headless=True)
|
|
||||||
context = browser.new_context()
|
|
||||||
restart_playwright = False
|
restart_playwright = False
|
||||||
|
|
||||||
if current_url.split(".")[-1] == "pdf":
|
if current_url.split(".")[-1] == "pdf":
|
||||||
@@ -133,27 +216,21 @@ class WebConnector(LoadConnector):
|
|||||||
title = None
|
title = None
|
||||||
if title_tag and title_tag.text:
|
if title_tag and title_tag.text:
|
||||||
title = title_tag.text
|
title = title_tag.text
|
||||||
|
title_tag.extract()
|
||||||
|
|
||||||
# Heuristics based cleaning
|
# Heuristics based cleaning of elements based on css classes
|
||||||
for undesired_div in ["sidebar", "header", "footer"]:
|
for undesired_element in WEB_CONNECTOR_IGNORED_CLASSES:
|
||||||
[
|
[
|
||||||
tag.extract()
|
tag.extract()
|
||||||
for tag in soup.find_all(
|
for tag in soup.find_all(
|
||||||
"div", class_=lambda x: x and undesired_div in x.split()
|
class_=lambda x: x and undesired_element in x.split()
|
||||||
)
|
)
|
||||||
]
|
]
|
||||||
|
|
||||||
for undesired_tag in [
|
for undesired_tag in WEB_CONNECTOR_IGNORED_ELEMENTS:
|
||||||
"nav",
|
|
||||||
"header",
|
|
||||||
"footer",
|
|
||||||
"meta",
|
|
||||||
"script",
|
|
||||||
"style",
|
|
||||||
]:
|
|
||||||
[tag.extract() for tag in soup.find_all(undesired_tag)]
|
[tag.extract() for tag in soup.find_all(undesired_tag)]
|
||||||
|
|
||||||
page_text = soup.get_text(HTML_SEPARATOR)
|
page_text = format_document(soup)
|
||||||
|
|
||||||
doc_batch.append(
|
doc_batch.append(
|
||||||
Document(
|
Document(
|
||||||
|
Reference in New Issue
Block a user