Fix content extraction from JIRA with API v2 vs. v3 (#1678)

2025-09-21 06:10:19 +02:00 · 2024-09-08 20:27:14 +02:00
parent 87b27046bd
commit 0896d3b7da
1 changed files with 14 additions and 16 deletions
--- a/backend/danswer/connectors/danswer_jira/connector.py
+++ b/backend/danswer/connectors/danswer_jira/connector.py
@@ -45,10 +45,15 @@ def extract_jira_project(url: str) -> tuple[str, str]:
    return jira_base, jira_project


-def extract_text_from_content(content: dict) -> str:
+def extract_text_from_adf(adf: dict | None) -> str:
+    """Extracts plain text from Atlassian Document Format:
+    https://developer.atlassian.com/cloud/jira/platform/apis/document/structure/
+
+    WARNING: This function is incomplete and will e.g. skip lists!
+    """
    texts = []
-    if "content" in content:
-        for block in content["content"]:
+    if adf is not None and "content" in adf:
+        for block in adf["content"]:
            if "content" in block:
                for item in block["content"]:
                    if item["type"] == "text":
@@ -72,15 +77,8 @@ def _get_comment_strs(
    comment_strs = []
    for comment in jira.fields.comment.comments:
        try:
-            if hasattr(comment, "body"):
-                body_text = extract_text_from_content(comment.raw["body"])
-            elif hasattr(comment, "raw"):
-                body = comment.raw.get("body", "No body content available")
-                body_text = (
-                    extract_text_from_content(body) if isinstance(body, dict) else body
-                )
-            else:
-                body_text = "No body attribute found"
+            body_text = (comment.body if JIRA_API_VERSION == "2"
+                         else extract_text_from_adf(comment.raw["body"]))

            if (
                hasattr(comment, "author")
@@ -127,11 +125,11 @@ def fetch_jira_issues_batch(
            )
            continue

+        description = (jira.fields.description if JIRA_API_VERSION == "2"
+                       else extract_text_from_adf(jira.raw["fields"]["description"]))
        comments = _get_comment_strs(jira, comment_email_blacklist)
-        semantic_rep = (
-            f"{jira.fields.description}\n"
-            if jira.fields.description
-            else "" + "\n".join([f"Comment: {comment}" for comment in comments])
+        semantic_rep = f"{description}\n" + "\n".join(
+            [f"Comment: {comment}" for comment in comments if comment]
        )

        page_url = f"{jira_client.client_info()}/browse/{jira.key}"