fix(confluence): ignore empty pages (#349)

This commit is contained in:
Patrick Decat
2023-08-29 18:45:35 +02:00
committed by GitHub
parent 548f0a41cb
commit 681a8a423f

View File

@@ -111,7 +111,7 @@ class ConfluenceConnector(LoadConnector, PollConnector):
) )
except: except:
logger.warning( logger.warning(
f"Batch failed with space {self.space} at offset {start_ind}" f"Batch failed with space {self.space} at offset {start_ind} with size {batch_size}, processing pages individually..."
) )
view_pages: list[dict[str, Any]] = [] view_pages: list[dict[str, Any]] = []
@@ -127,7 +127,10 @@ class ConfluenceConnector(LoadConnector, PollConnector):
expand="body.storage.value,version", expand="body.storage.value,version",
) )
) )
except: except HTTPError as e:
logger.warning(
f"Page failed with space {self.space} at offset {start_ind + i}, trying alternative expand option: {e}"
)
# Use view instead, which captures most info but is less complete # Use view instead, which captures most info but is less complete
view_pages.extend( view_pages.extend(
confluence_client.get_all_pages_from_space( confluence_client.get_all_pages_from_space(
@@ -195,16 +198,20 @@ class ConfluenceConnector(LoadConnector, PollConnector):
if time_filter is None or time_filter(last_modified): if time_filter is None or time_filter(last_modified):
page_html = ( page_html = (
page["body"].get("storage", {}).get("value") page["body"]
or page["body"]["view"]["value"] .get("storage", page["body"].get("view", {}))
.get("value")
) )
page_url = self.wiki_base + page["_links"]["webui"]
if not page_html:
logger.debug("Page is empty, skipping: %s", page_url)
continue
page_text = ( page_text = (
page.get("title", "") + "\n" + parse_html_page_basic(page_html) page.get("title", "") + "\n" + parse_html_page_basic(page_html)
) )
comments_text = self._fetch_comments(self.confluence_client, page["id"]) comments_text = self._fetch_comments(self.confluence_client, page["id"])
page_text += comments_text page_text += comments_text
page_url = self.wiki_base + page["_links"]["webui"]
doc_batch.append( doc_batch.append(
Document( Document(