Mirror of https://github.com/danswer-ai/danswer.git (synced 2025-07-23 11:26:03 +02:00)
fix(confluence): ignore empty pages (#349)
This commit is contained in:
@@ -111,7 +111,7 @@ class ConfluenceConnector(LoadConnector, PollConnector):
|
|||||||
)
|
)
|
||||||
except:
|
except:
|
||||||
logger.warning(
|
logger.warning(
|
||||||
f"Batch failed with space {self.space} at offset {start_ind}"
|
f"Batch failed with space {self.space} at offset {start_ind} with size {batch_size}, processing pages individually..."
|
||||||
)
|
)
|
||||||
|
|
||||||
view_pages: list[dict[str, Any]] = []
|
view_pages: list[dict[str, Any]] = []
|
||||||
@@ -127,7 +127,10 @@ class ConfluenceConnector(LoadConnector, PollConnector):
|
|||||||
expand="body.storage.value,version",
|
expand="body.storage.value,version",
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
except:
|
except HTTPError as e:
|
||||||
|
logger.warning(
|
||||||
|
f"Page failed with space {self.space} at offset {start_ind + i}, trying alternative expand option: {e}"
|
||||||
|
)
|
||||||
# Use view instead, which captures most info but is less complete
|
# Use view instead, which captures most info but is less complete
|
||||||
view_pages.extend(
|
view_pages.extend(
|
||||||
confluence_client.get_all_pages_from_space(
|
confluence_client.get_all_pages_from_space(
|
||||||
@@ -195,16 +198,20 @@ class ConfluenceConnector(LoadConnector, PollConnector):
|
|||||||
|
|
||||||
if time_filter is None or time_filter(last_modified):
|
if time_filter is None or time_filter(last_modified):
|
||||||
page_html = (
|
page_html = (
|
||||||
page["body"].get("storage", {}).get("value")
|
page["body"]
|
||||||
or page["body"]["view"]["value"]
|
.get("storage", page["body"].get("view", {}))
|
||||||
|
.get("value")
|
||||||
)
|
)
|
||||||
|
page_url = self.wiki_base + page["_links"]["webui"]
|
||||||
|
if not page_html:
|
||||||
|
logger.debug("Page is empty, skipping: %s", page_url)
|
||||||
|
continue
|
||||||
page_text = (
|
page_text = (
|
||||||
page.get("title", "") + "\n" + parse_html_page_basic(page_html)
|
page.get("title", "") + "\n" + parse_html_page_basic(page_html)
|
||||||
)
|
)
|
||||||
comments_text = self._fetch_comments(self.confluence_client, page["id"])
|
comments_text = self._fetch_comments(self.confluence_client, page["id"])
|
||||||
page_text += comments_text
|
page_text += comments_text
|
||||||
|
|
||||||
page_url = self.wiki_base + page["_links"]["webui"]
|
|
||||||
|
|
||||||
doc_batch.append(
|
doc_batch.append(
|
||||||
Document(
|
Document(
|
||||||
|
Reference in New Issue
Block a user