From 0d7d54fddb6ce94c0c6084b19105ca53a71701a0 Mon Sep 17 00:00:00 2001
From: jabdoa2 <jabdoa2@users.noreply.github.com>
Date: Sat, 29 Jul 2023 21:21:23 +0200
Subject: [PATCH] Improve Web Connector Output, Add Config Options and add
 OAuth Backend Flow (#199)

---
 backend/danswer/configs/app_configs.py      |   7 ++
 backend/danswer/connectors/web/connector.py | 117 ++++++++++++++++----
 2 files changed, 104 insertions(+), 20 deletions(-)

diff --git a/backend/danswer/configs/app_configs.py b/backend/danswer/configs/app_configs.py
index 8ee836b120b8..8a1c6dd127e4 100644
--- a/backend/danswer/configs/app_configs.py
+++ b/backend/danswer/configs/app_configs.py
@@ -90,6 +90,13 @@ GOOGLE_DRIVE_INCLUDE_SHARED = False
 FILE_CONNECTOR_TMP_STORAGE_PATH = os.environ.get(
     "FILE_CONNECTOR_TMP_STORAGE_PATH", "/home/file_connector_storage"
 )
+WEB_CONNECTOR_IGNORED_CLASSES = os.environ.get(  # comma list; spaces and blank entries are ignored
+    "WEB_CONNECTOR_IGNORED_CLASSES", "sidebar,header,footer").replace(",", " ").split()
+WEB_CONNECTOR_IGNORED_ELEMENTS = os.environ.get(  # comma list of tag names, parsed the same way
+    "WEB_CONNECTOR_IGNORED_ELEMENTS", "nav,header,footer,meta,script,style,symbol,aside").replace(",", " ").split()
+WEB_CONNECTOR_OAUTH_CLIENT_ID = os.environ.get("WEB_CONNECTOR_OAUTH_CLIENT_ID", False)
+WEB_CONNECTOR_OAUTH_CLIENT_SECRET = os.environ.get("WEB_CONNECTOR_OAUTH_CLIENT_SECRET", False)
+WEB_CONNECTOR_OAUTH_TOKEN_URL = os.environ.get("WEB_CONNECTOR_OAUTH_TOKEN_URL", False)
 
 #####
 # Query Configs
diff --git a/backend/danswer/connectors/web/connector.py b/backend/danswer/connectors/web/connector.py
index 38941fb1a919..37a8c80d4a15 100644
--- a/backend/danswer/connectors/web/connector.py
+++ b/backend/danswer/connectors/web/connector.py
@@ -1,22 +1,33 @@
 import io
+import re
 from datetime import datetime
 from typing import Any
 from typing import cast
+from typing import Tuple
 from urllib.parse import urljoin
 from urllib.parse import urlparse
 
+import bs4
 import requests
 from bs4 import BeautifulSoup
-from danswer.configs.app_configs import INDEX_BATCH_SIZE
+from playwright.sync_api import BrowserContext
+from playwright.sync_api import Playwright
+from playwright.sync_api import sync_playwright
+from PyPDF2 import PdfReader
+
+from oauthlib.oauth2 import BackendApplicationClient
+from requests_oauthlib import OAuth2Session
+
+from danswer.configs.app_configs import INDEX_BATCH_SIZE, WEB_CONNECTOR_OAUTH_CLIENT_ID, \
+    WEB_CONNECTOR_OAUTH_CLIENT_SECRET, WEB_CONNECTOR_OAUTH_TOKEN_URL
+from danswer.configs.app_configs import WEB_CONNECTOR_IGNORED_CLASSES
+from danswer.configs.app_configs import WEB_CONNECTOR_IGNORED_ELEMENTS
 from danswer.configs.constants import DocumentSource
-from danswer.configs.constants import HTML_SEPARATOR
 from danswer.connectors.interfaces import GenerateDocumentsOutput
 from danswer.connectors.interfaces import LoadConnector
 from danswer.connectors.models import Document
 from danswer.connectors.models import Section
 from danswer.utils.logger import setup_logger
-from playwright.sync_api import sync_playwright
-from PyPDF2 import PdfReader
 
 logger = setup_logger()
 
@@ -50,6 +61,79 @@ def get_internal_links(
     return internal_links
 
 
+def strip_excessive_newlines_and_spaces(document: str) -> str:
+    """Collapse runs of spaces and runs of line breaks, then trim both ends."""
+    # squeeze space runs first, then drop any space left dangling before a line break
+    single_spaced = re.sub(r" +", " ", document)
+    no_trailing = re.sub(r" +[\n\r]", "\n", single_spaced)
+    # fold consecutive line breaks (\n or \r) into a single \n
+    compact = re.sub(r"[\n\r]+", "\n", no_trailing)
+    return compact.strip()
+
+
+def strip_newlines(document: str) -> str:
+    """Turn every run of CR/LF into one space; in HTML source such breaks are plain whitespace."""
+    return re.compile(r"[\n\r]+").sub(" ", document)
+
+
+def format_document(document: BeautifulSoup) -> str:
+    """Format html to a flat text document.
+
+    The following goals:
+    - Newlines from within the HTML are removed (as browser would ignore them as well).
+    - Repeated newlines/spaces are removed (as browsers would ignore them).
+    - Newlines only before and after headlines and paragraphs or when explicit (br or pre tag)
+    - Table columns/rows are separated by newline
+    - List elements are separated by newline and start with a hyphen
+    """
+    parts: list[str] = []
+    list_element_start = False
+    # Number of upcoming nodes (in document order) that still belong to an open <pre>.
+    verbatim_output = 0
+    for e in document.descendants:
+        verbatim = verbatim_output > 0
+        verbatim_output -= 1
+        if isinstance(e, bs4.element.NavigableString):
+            if isinstance(e, (bs4.element.Comment, bs4.element.Doctype)):
+                continue
+            element_text = e.text
+            if element_text:
+                parts.append(element_text if verbatim else strip_newlines(element_text))
+                list_element_start = False
+        elif isinstance(e, bs4.element.Tag):
+            if e.name in ["p", "div"]:
+                # a block element directly after "- " must not break the bullet line
+                if not list_element_start:
+                    parts.append("\n")
+            elif e.name in ["br", "h1", "h2", "h3", "h4", "tr", "th", "td"]:
+                parts.append("\n")
+                list_element_start = False
+            elif e.name == "li":
+                parts.append("\n- ")
+                list_element_start = True
+            elif e.name == "pre" and not verbatim:
+                # count all nested nodes, not only direct children, so every <pre> text node is kept
+                verbatim_output = len(list(e.descendants))
+    return strip_excessive_newlines_and_spaces("".join(parts))
+
+
+def start_playwright() -> Tuple[Playwright, BrowserContext]:
+    """Launch headless Chromium and return the Playwright handle with a new browser context.
+
+    If all WEB_CONNECTOR_OAUTH_* settings are set, a backend-application token is fetched
+    and installed on the context as a Bearer Authorization header.
+    """
+    playwright = sync_playwright().start()
+    context = playwright.chromium.launch(headless=True).new_context()
+    if all((WEB_CONNECTOR_OAUTH_CLIENT_ID, WEB_CONNECTOR_OAUTH_CLIENT_SECRET, WEB_CONNECTOR_OAUTH_TOKEN_URL)):
+        session = OAuth2Session(client=BackendApplicationClient(client_id=WEB_CONNECTOR_OAUTH_CLIENT_ID))
+        token = session.fetch_token(token_url=WEB_CONNECTOR_OAUTH_TOKEN_URL,
+                                    client_id=WEB_CONNECTOR_OAUTH_CLIENT_ID,
+                                    client_secret=WEB_CONNECTOR_OAUTH_CLIENT_SECRET)
+        context.set_extra_http_headers({"Authorization": "Bearer {}".format(token["access_token"])})
+    return playwright, context
+
+
 class WebConnector(LoadConnector):
     def __init__(
         self,
@@ -73,7 +157,8 @@ class WebConnector(LoadConnector):
         to_visit: list[str] = [self.base_url]
         doc_batch: list[Document] = []
 
-        restart_playwright = True
+        playwright, context = start_playwright()
+        restart_playwright = False
         while to_visit:
             current_url = to_visit.pop()
             if current_url in visited_links:
@@ -86,9 +171,7 @@ class WebConnector(LoadConnector):
                 current_visit_time = datetime.now().strftime("%B %d, %Y, %H:%M:%S")
 
                 if restart_playwright:
-                    playwright = sync_playwright().start()
-                    browser = playwright.chromium.launch(headless=True)
-                    context = browser.new_context()
+                    playwright, context = start_playwright()
                     restart_playwright = False
 
                 if current_url.split(".")[-1] == "pdf":
@@ -133,27 +216,21 @@ class WebConnector(LoadConnector):
                 title = None
                 if title_tag and title_tag.text:
                     title = title_tag.text
+                    title_tag.extract()
 
-                # Heuristics based cleaning
-                for undesired_div in ["sidebar", "header", "footer"]:
+                # Heuristics based cleaning of elements based on css classes
+                for undesired_element in WEB_CONNECTOR_IGNORED_CLASSES:
                     [
                         tag.extract()
                         for tag in soup.find_all(
-                            "div", class_=lambda x: x and undesired_div in x.split()
+                            class_=lambda x: x and undesired_element in x.split()
                         )
                     ]
 
-                for undesired_tag in [
-                    "nav",
-                    "header",
-                    "footer",
-                    "meta",
-                    "script",
-                    "style",
-                ]:
+                for undesired_tag in WEB_CONNECTOR_IGNORED_ELEMENTS:
                     [tag.extract() for tag in soup.find_all(undesired_tag)]
 
-                page_text = soup.get_text(HTML_SEPARATOR)
+                page_text = format_document(soup)
 
                 doc_batch.append(
                     Document(