From b8af38bb95a75a709f75c88aee3bc0c196aecf56 Mon Sep 17 00:00:00 2001 From: Shravan Vishwanathan <48054120+shravanv90@users.noreply.github.com> Date: Sat, 11 May 2024 23:11:27 +0530 Subject: [PATCH] Refactor comment extraction in JIRA connector to handle nested content (#1329) - Implement `extract_text_from_content` to parse nested text elements from comment bodies. - Modify `_get_comment_strs` to use the new text extraction method, improving handling of various content structures. --- .../connectors/danswer_jira/connector.py | 39 ++++++++++++------- 1 file changed, 26 insertions(+), 13 deletions(-) diff --git a/backend/danswer/connectors/danswer_jira/connector.py b/backend/danswer/connectors/danswer_jira/connector.py index dfed7ebd1..64923d1a6 100644 --- a/backend/danswer/connectors/danswer_jira/connector.py +++ b/backend/danswer/connectors/danswer_jira/connector.py @@ -44,24 +44,37 @@ def extract_jira_project(url: str) -> tuple[str, str]: return jira_base, jira_project +def extract_text_from_content(content): + texts = [] + if 'content' in content: + for block in content['content']: + if 'content' in block: + for item in block['content']: + if item['type'] == 'text': + texts.append(item['text']) + return " ".join(texts) -def _get_comment_strs( - jira: Issue, - comment_email_blacklist: tuple[str, ...] = (), -) -> list[str]: +def _get_comment_strs(jira: Issue, comment_email_blacklist: tuple[str, ...] = ()) -> list[str]: comment_strs = [] for comment in jira.fields.comment.comments: - # Can't test Jira server so can't be sure this works for everyone, wrapping in a try just - # in case try: - comment_strs.append(comment.body) - # If this fails, we just assume it's ok to keep the comment - if comment.author.emailAddress in comment_email_blacklist: - comment_strs.pop() - except Exception: - pass - return comment_strs + if hasattr(comment, 'body'): + body_text = extract_text_from_content(comment.raw['body']) + elif hasattr(comment, 'raw'): + body = comment.raw.get('body', 'No body content available') + body_text = extract_text_from_content(body) if isinstance(body, dict) else body + else: + body_text = "No body attribute found" + + if hasattr(comment, 'author') and comment.author.emailAddress in comment_email_blacklist: + continue # Skip adding comment if author's email is in blacklist + + comment_strs.append(body_text) + except Exception as e: + logger.error(f"Failed to process comment due to an error: {e}") + continue + return comment_strs def fetch_jira_issues_batch( jql: str,