mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-03-26 01:31:51 +01:00
Added support for page within a page in Confluence (#3125)
This commit is contained in:
parent
d703e694ce
commit
60471b6a73
@ -100,6 +100,39 @@ def extract_text_from_confluence_html(
|
||||
continue
|
||||
# Include @ sign for tagging, more clear for LLM
|
||||
user.replaceWith("@" + _get_user(confluence_client, user_id))
|
||||
|
||||
for html_page_reference in soup.findAll("ri:page"):
|
||||
# Wrap this in a try-except because there are some pages that might not exist
|
||||
try:
|
||||
page_title = html_page_reference.attrs["ri:content-title"]
|
||||
if not page_title:
|
||||
continue
|
||||
|
||||
page_query = f"type=page and title='{page_title}'"
|
||||
|
||||
page_contents: dict[str, Any] | None = None
|
||||
# Confluence enforces title uniqueness, so we should only get one result here
|
||||
for page_batch in confluence_client.paginated_cql_page_retrieval(
|
||||
cql=page_query,
|
||||
expand="body.storage.value",
|
||||
limit=1,
|
||||
):
|
||||
page_contents = page_batch[0]
|
||||
break
|
||||
except Exception:
|
||||
logger.warning(
|
||||
f"Error getting page contents for object {confluence_object}"
|
||||
)
|
||||
continue
|
||||
|
||||
if not page_contents:
|
||||
continue
|
||||
text_from_page = extract_text_from_confluence_html(
|
||||
confluence_client, page_contents
|
||||
)
|
||||
|
||||
html_page_reference.replaceWith(text_from_page)
|
||||
|
||||
return format_document_soup(soup)
|
||||
|
||||
|
||||
|
@ -39,24 +39,39 @@ def test_confluence_connector_basic(
|
||||
with pytest.raises(StopIteration):
|
||||
next(doc_batch_generator)
|
||||
|
||||
assert len(doc_batch) == 2
|
||||
assert len(doc_batch) == 3
|
||||
|
||||
for doc in doc_batch:
|
||||
if doc.semantic_identifier == "DailyConnectorTestSpace Home":
|
||||
page_doc = doc
|
||||
elif ".txt" in doc.semantic_identifier:
|
||||
txt_doc = doc
|
||||
elif doc.semantic_identifier == "Page Within A Page":
|
||||
page_within_a_page_doc = doc
|
||||
|
||||
assert page_within_a_page_doc.semantic_identifier == "Page Within A Page"
|
||||
assert page_within_a_page_doc.primary_owners
|
||||
assert page_within_a_page_doc.primary_owners[0].email == "hagen@danswer.ai"
|
||||
assert len(page_within_a_page_doc.sections) == 1
|
||||
|
||||
page_within_a_page_section = page_within_a_page_doc.sections[0]
|
||||
page_within_a_page_text = "@Chris Weaver loves cherry pie"
|
||||
assert page_within_a_page_section.text == page_within_a_page_text
|
||||
assert (
|
||||
page_within_a_page_section.link
|
||||
== "https://danswerai.atlassian.net/wiki/spaces/DailyConne/pages/200769540/Page+Within+A+Page"
|
||||
)
|
||||
|
||||
assert page_doc.semantic_identifier == "DailyConnectorTestSpace Home"
|
||||
assert page_doc.metadata["labels"] == ["testlabel"]
|
||||
assert page_doc.primary_owners
|
||||
assert page_doc.primary_owners[0].email == "chris@danswer.ai"
|
||||
assert page_doc.primary_owners[0].email == "hagen@danswer.ai"
|
||||
assert len(page_doc.sections) == 1
|
||||
|
||||
section = page_doc.sections[0]
|
||||
assert section.text == "test123"
|
||||
page_section = page_doc.sections[0]
|
||||
assert page_section.text == "test123 " + page_within_a_page_text
|
||||
assert (
|
||||
section.link
|
||||
page_section.link
|
||||
== "https://danswerai.atlassian.net/wiki/spaces/DailyConne/overview"
|
||||
)
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user