From 0896d3b7dabee45004188a74696cc45a86121a50 Mon Sep 17 00:00:00 2001 From: Art Matsak Date: Sun, 8 Sep 2024 20:27:14 +0200 Subject: [PATCH] Fix content extraction from JIRA with API v2 vs. v3 (#1678) --- .../connectors/danswer_jira/connector.py | 30 +++++++++---------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/backend/danswer/connectors/danswer_jira/connector.py b/backend/danswer/connectors/danswer_jira/connector.py index c1d04977e..ec5c35054 100644 --- a/backend/danswer/connectors/danswer_jira/connector.py +++ b/backend/danswer/connectors/danswer_jira/connector.py @@ -45,10 +45,15 @@ def extract_jira_project(url: str) -> tuple[str, str]: return jira_base, jira_project -def extract_text_from_content(content: dict) -> str: +def extract_text_from_adf(adf: dict | None) -> str: + """Extracts plain text from Atlassian Document Format: + https://developer.atlassian.com/cloud/jira/platform/apis/document/structure/ + + WARNING: This function is incomplete and will e.g. skip lists! + """ texts = [] - if "content" in content: - for block in content["content"]: + if adf is not None and "content" in adf: + for block in adf["content"]: if "content" in block: for item in block["content"]: if item["type"] == "text": @@ -72,15 +77,8 @@ def _get_comment_strs( comment_strs = [] for comment in jira.fields.comment.comments: try: - if hasattr(comment, "body"): - body_text = extract_text_from_content(comment.raw["body"]) - elif hasattr(comment, "raw"): - body = comment.raw.get("body", "No body content available") - body_text = ( - extract_text_from_content(body) if isinstance(body, dict) else body - ) - else: - body_text = "No body attribute found" + body_text = (comment.body if JIRA_API_VERSION == "2" + else extract_text_from_adf(comment.raw["body"])) if ( hasattr(comment, "author") @@ -127,11 +125,11 @@ def fetch_jira_issues_batch( ) continue + description = (jira.fields.description if JIRA_API_VERSION == "2" + else extract_text_from_adf(jira.raw["fields"]["description"])) comments = _get_comment_strs(jira, comment_email_blacklist) - semantic_rep = ( - f"{jira.fields.description}\n" - if jira.fields.description - else "" + "\n".join([f"Comment: {comment}" for comment in comments]) + semantic_rep = f"{description}\n" + "\n".join( + [f"Comment: {comment}" for comment in comments if comment] ) page_url = f"{jira_client.client_info()}/browse/{jira.key}"