Mirror of https://github.com/danswer-ai/danswer.git (synced 2025-05-18 15:50:13 +02:00)
Confluence handle pages without body.storage.value (#347)
Workaround for: https://jira.atlassian.com/browse/CONFCLOUD-76433
parent b2a51283d1, commit 548f0a41cb
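Note: per CONFCLOUD-76433, the Confluence Cloud API can omit body.storage from a page
payload even when expand="body.storage.value,version" was requested. A minimal sketch of
the fallback idea this commit applies (the helper and sample payloads below are
illustrative, not code from the connector):

    from typing import Any

    def extract_page_html(page: dict[str, Any]) -> str:
        # Prefer the raw storage-format body; fall back to the rendered view,
        # which captures most info but is less complete.
        body = page.get("body", {})
        storage_html = body.get("storage", {}).get("value")
        if storage_html:
            return storage_html
        return body["view"]["value"]

    # One healthy payload, one affected by the bug (no "storage" key).
    healthy = {"body": {"storage": {"value": "<p>hi</p>"}, "view": {"value": "<p>hi</p>"}}}
    buggy = {"body": {"view": {"value": "<p>hi</p>"}}}
    assert extract_page_html(healthy) == "<p>hi</p>"
    assert extract_page_html(buggy) == "<p>hi</p>"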
backend/danswer/configs/constants.py
@@ -14,7 +14,6 @@ ALLOWED_USERS = "allowed_users"
 ALLOWED_GROUPS = "allowed_groups"
 METADATA = "metadata"
 GEN_AI_API_KEY_STORAGE_KEY = "genai_api_key"
-HTML_SEPARATOR = "\n"
 PUBLIC_DOC_PAT = "PUBLIC"
 QUOTE = "quote"
 BOOST = "boost"
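Note: HTML_SEPARATOR is dropped because its consumer, soup.get_text(HTML_SEPARATOR) in
parse_html_page_basic (last file below), is replaced by format_document_soup in this same
commit.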
backend/danswer/connectors/confluence/connector.py
@@ -1,3 +1,4 @@
+import os
 from collections.abc import Callable
 from collections.abc import Collection
 from datetime import datetime
@@ -101,12 +102,43 @@ class ConfluenceConnector(LoadConnector, PollConnector):
         start_ind: int,
     ) -> Collection[dict[str, Any]]:
         def _fetch(start_ind: int, batch_size: int) -> Collection[dict[str, Any]]:
-            return confluence_client.get_all_pages_from_space(
-                self.space,
-                start=start_ind,
-                limit=batch_size,
-                expand="body.storage.value,version",
-            )
+            try:
+                return confluence_client.get_all_pages_from_space(
+                    self.space,
+                    start=start_ind,
+                    limit=batch_size,
+                    expand="body.storage.value,version",
+                )
+            except:
+                logger.warning(
+                    f"Batch failed with space {self.space} at offset {start_ind}"
+                )
+
+                view_pages: list[dict[str, Any]] = []
+                for i in range(self.batch_size):
+                    try:
+                        # Could be that one of the pages here failed due to this bug:
+                        # https://jira.atlassian.com/browse/CONFCLOUD-76433
+                        view_pages.extend(
+                            confluence_client.get_all_pages_from_space(
+                                self.space,
+                                start=start_ind + i,
+                                limit=1,
+                                expand="body.storage.value,version",
+                            )
+                        )
+                    except:
+                        # Use view instead, which captures most info but is less complete
+                        view_pages.extend(
+                            confluence_client.get_all_pages_from_space(
+                                self.space,
+                                start=start_ind + i,
+                                limit=1,
+                                expand="body.view.value,version",
+                            )
+                        )
+
+                return view_pages

         try:
             return _fetch(start_ind, self.batch_size)
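Note: the fallback above retries a failed batch one page at a time, so only the specific
pages that trip CONFCLOUD-76433 are downgraded to the body.view expansion; healthy pages
at the other offsets still return their full body.storage content.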
@@ -162,7 +194,10 @@ class ConfluenceConnector(LoadConnector, PollConnector):
         last_modified = datetime.fromisoformat(last_modified_str)

         if time_filter is None or time_filter(last_modified):
-            page_html = page["body"]["storage"]["value"]
+            page_html = (
+                page["body"].get("storage", {}).get("value")
+                or page["body"]["view"]["value"]
+            )
             page_text = (
                 page.get("title", "") + "\n" + parse_html_page_basic(page_html)
             )
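Note: because the fallback uses `or`, a page whose storage value is present but empty
also falls through to body.view; a KeyError surfaces only when the view body is missing
too.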
@@ -219,3 +254,15 @@ class ConfluenceConnector(LoadConnector, PollConnector):

         if num_pages < self.batch_size:
             break
+
+
+if __name__ == "__main__":
+    connector = ConfluenceConnector(os.environ["CONFLUENCE_TEST_SPACE_URL"])
+    connector.load_credentials(
+        {
+            "confluence_username": os.environ["CONFLUENCE_USER_NAME"],
+            "confluence_access_token": os.environ["CONFLUENCE_ACCESS_TOKEN"],
+        }
+    )
+    document_batches = connector.load_from_state()
+    print(next(document_batches))
backend/danswer/connectors/web/connector.py
@@ -1,5 +1,4 @@
 import io
-import re
 from datetime import datetime
 from typing import Any
 from typing import cast
@@ -7,7 +6,6 @@ from typing import Tuple
 from urllib.parse import urljoin
 from urllib.parse import urlparse

-import bs4
 import requests
 from bs4 import BeautifulSoup
 from oauthlib.oauth2 import BackendApplicationClient
@@ -29,6 +27,7 @@ from danswer.connectors.interfaces import LoadConnector
 from danswer.connectors.models import Document
 from danswer.connectors.models import Section
 from danswer.utils.logger import setup_logger
+from danswer.utils.text_processing import format_document_soup

 logger = setup_logger()

@@ -62,62 +61,6 @@ def get_internal_links(
     return internal_links


-def strip_excessive_newlines_and_spaces(document: str) -> str:
-    # collapse repeated spaces into one
-    document = re.sub(r" +", " ", document)
-    # remove trailing spaces
-    document = re.sub(r" +[\n\r]", "\n", document)
-    # remove repeated newlines
-    document = re.sub(r"[\n\r]+", "\n", document)
-    return document.strip()
-
-
-def strip_newlines(document: str) -> str:
-    # HTML might contain newlines which are just whitespaces to a browser
-    return re.sub(r"[\n\r]+", " ", document)
-
-
-def format_document(document: BeautifulSoup) -> str:
-    """Format html to a flat text document.
-
-    The following goals:
-    - Newlines from within the HTML are removed (as browser would ignore them as well).
-    - Repeated newlines/spaces are removed (as browsers would ignore them).
-    - Newlines only before and after headlines and paragraphs or when explicit (br or pre tag)
-    - Table columns/rows are separated by newline
-    - List elements are separated by newline and start with a hyphen
-    """
-    text = ""
-    list_element_start = False
-    verbatim_output = 0
-    for e in document.descendants:
-        verbatim_output -= 1
-        if isinstance(e, bs4.element.NavigableString):
-            if isinstance(e, (bs4.element.Comment, bs4.element.Doctype)):
-                continue
-            element_text = e.text
-            if element_text:
-                if verbatim_output > 0:
-                    text += element_text
-                else:
-                    text += strip_newlines(element_text)
-                list_element_start = False
-        elif isinstance(e, bs4.element.Tag):
-            if e.name in ["p", "div"]:
-                if not list_element_start:
-                    text += "\n"
-            elif e.name in ["br", "h1", "h2", "h3", "h4", "tr", "th", "td"]:
-                text += "\n"
-                list_element_start = False
-            elif e.name == "li":
-                text += "\n- "
-                list_element_start = True
-            elif e.name == "pre":
-                if verbatim_output <= 0:
-                    verbatim_output = len(list(e.childGenerator()))
-    return strip_excessive_newlines_and_spaces(text)
-
-
 def start_playwright() -> Tuple[Playwright, BrowserContext]:
     playwright = sync_playwright().start()
     browser = playwright.chromium.launch(headless=True)
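Note: the three helpers removed here are not deleted outright; they reappear verbatim in
danswer.utils.text_processing (last file below), with format_document renamed to
format_document_soup, so the web and Confluence connectors can share one HTML-to-text
path.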
@@ -239,7 +182,7 @@ class WebConnector(LoadConnector):
            for undesired_tag in WEB_CONNECTOR_IGNORED_ELEMENTS:
                [tag.extract() for tag in soup.find_all(undesired_tag)]

-            page_text = format_document(soup)
+            page_text = format_document_soup(soup)

            doc_batch.append(
                Document(
@@ -267,3 +210,9 @@ class WebConnector(LoadConnector):
            if doc_batch:
                playwright.stop()
                yield doc_batch
+
+
+if __name__ == "__main__":
+    connector = WebConnector("https://docs.danswer.dev/")
+    document_batches = connector.load_from_state()
+    print(next(document_batches))
backend/danswer/utils/text_processing.py
@@ -1,9 +1,8 @@
 import re

+import bs4
 from bs4 import BeautifulSoup

-from danswer.configs.constants import HTML_SEPARATOR
-

 def clean_model_quote(quote: str, trim_length: int) -> str:
     quote_clean = quote.strip()
@@ -32,6 +31,62 @@ def shared_precompare_cleanup(text: str) -> str:
     return text


+def strip_excessive_newlines_and_spaces(document: str) -> str:
+    # collapse repeated spaces into one
+    document = re.sub(r" +", " ", document)
+    # remove trailing spaces
+    document = re.sub(r" +[\n\r]", "\n", document)
+    # remove repeated newlines
+    document = re.sub(r"[\n\r]+", "\n", document)
+    return document.strip()
+
+
+def strip_newlines(document: str) -> str:
+    # HTML might contain newlines which are just whitespaces to a browser
+    return re.sub(r"[\n\r]+", " ", document)
+
+
+def format_document_soup(document: BeautifulSoup) -> str:
+    """Format html to a flat text document.
+
+    The following goals:
+    - Newlines from within the HTML are removed (as browser would ignore them as well).
+    - Repeated newlines/spaces are removed (as browsers would ignore them).
+    - Newlines only before and after headlines and paragraphs or when explicit (br or pre tag)
+    - Table columns/rows are separated by newline
+    - List elements are separated by newline and start with a hyphen
+    """
+    text = ""
+    list_element_start = False
+    verbatim_output = 0
+    for e in document.descendants:
+        verbatim_output -= 1
+        if isinstance(e, bs4.element.NavigableString):
+            if isinstance(e, (bs4.element.Comment, bs4.element.Doctype)):
+                continue
+            element_text = e.text
+            if element_text:
+                if verbatim_output > 0:
+                    text += element_text
+                else:
+                    text += strip_newlines(element_text)
+                list_element_start = False
+        elif isinstance(e, bs4.element.Tag):
+            if e.name in ["p", "div"]:
+                if not list_element_start:
+                    text += "\n"
+            elif e.name in ["br", "h1", "h2", "h3", "h4", "tr", "th", "td"]:
+                text += "\n"
+                list_element_start = False
+            elif e.name == "li":
+                text += "\n- "
+                list_element_start = True
+            elif e.name == "pre":
+                if verbatim_output <= 0:
+                    verbatim_output = len(list(e.childGenerator()))
+    return strip_excessive_newlines_and_spaces(text)
+
+
 def parse_html_page_basic(text: str) -> str:
     soup = BeautifulSoup(text, "html.parser")
-    return soup.get_text(HTML_SEPARATOR)
+    return format_document_soup(soup)
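A quick usage sketch of the relocated helper (assuming bs4 is installed and the danswer
package is importable; the commented output is what the docstring's rules produce for
this input):

    from bs4 import BeautifulSoup
    from danswer.utils.text_processing import format_document_soup

    html = "<h1>Title</h1><ul><li>one</li><li>two</li></ul><p>done</p>"
    soup = BeautifulSoup(html, "html.parser")
    print(format_document_soup(soup))
    # Title
    # - one
    # - two
    # done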