welcome to onyx

pablodanswer
2024-12-13 09:48:43 -08:00
parent 54dcbfa288
commit 21ec5ed795
813 changed files with 7021 additions and 6824 deletions

backend/onyx/connectors/onyx_jira/connector.py

@@ -0,0 +1,286 @@
import os
from collections.abc import Iterable
from datetime import datetime
from datetime import timezone
from typing import Any
from jira import JIRA
from jira.resources import Issue
from onyx.configs.app_configs import INDEX_BATCH_SIZE
from onyx.configs.app_configs import JIRA_CONNECTOR_LABELS_TO_SKIP
from onyx.configs.app_configs import JIRA_CONNECTOR_MAX_TICKET_SIZE
from onyx.configs.constants import DocumentSource
from onyx.connectors.cross_connector_utils.miscellaneous_utils import time_str_to_utc
from onyx.connectors.interfaces import GenerateDocumentsOutput
from onyx.connectors.interfaces import GenerateSlimDocumentOutput
from onyx.connectors.interfaces import LoadConnector
from onyx.connectors.interfaces import PollConnector
from onyx.connectors.interfaces import SecondsSinceUnixEpoch
from onyx.connectors.interfaces import SlimConnector
from onyx.connectors.models import ConnectorMissingCredentialError
from onyx.connectors.models import Document
from onyx.connectors.models import Section
from onyx.connectors.models import SlimDocument
from onyx.connectors.onyx_jira.utils import best_effort_basic_expert_info
from onyx.connectors.onyx_jira.utils import best_effort_get_field_from_issue
from onyx.connectors.onyx_jira.utils import build_jira_client
from onyx.connectors.onyx_jira.utils import build_jira_url
from onyx.connectors.onyx_jira.utils import extract_jira_project
from onyx.connectors.onyx_jira.utils import extract_text_from_adf
from onyx.connectors.onyx_jira.utils import get_comment_strs
from onyx.utils.logger import setup_logger

logger = setup_logger()

JIRA_API_VERSION = os.environ.get("JIRA_API_VERSION") or "2"

_JIRA_SLIM_PAGE_SIZE = 500
_JIRA_FULL_PAGE_SIZE = 50


def _paginate_jql_search(
jira_client: JIRA,
jql: str,
max_results: int,
fields: str | None = None,
) -> Iterable[Issue]:
start = 0
while True:
logger.debug(
f"Fetching Jira issues with JQL: {jql}, "
f"starting at {start}, max results: {max_results}"
)
issues = jira_client.search_issues(
jql_str=jql,
startAt=start,
maxResults=max_results,
fields=fields,
)
for issue in issues:
if isinstance(issue, Issue):
yield issue
else:
raise Exception(f"Found Jira object not of type Issue: {issue}")
if len(issues) < max_results:
break
start += max_results
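

# A minimal usage sketch for the paginator above (project key hypothetical):
#     for issue in _paginate_jql_search(jira_client, 'project = "ONYX"', max_results=50):
#         print(issue.key)
# search_issues() is called repeatedly with startAt advanced by max_results;
# a short (or empty) page marks the last batch and ends the generator.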


def fetch_jira_issues_batch(
jira_client: JIRA,
jql: str,
batch_size: int,
comment_email_blacklist: tuple[str, ...] = (),
labels_to_skip: set[str] | None = None,
) -> Iterable[Document]:
for issue in _paginate_jql_search(
jira_client=jira_client,
jql=jql,
max_results=batch_size,
):
if labels_to_skip:
if any(label in issue.fields.labels for label in labels_to_skip):
logger.info(
f"Skipping {issue.key} because it has a label to skip. Found "
f"labels: {issue.fields.labels}. Labels to skip: {labels_to_skip}."
)
continue
description = (
issue.fields.description
if JIRA_API_VERSION == "2"
else extract_text_from_adf(issue.raw["fields"]["description"])
)
comments = get_comment_strs(
issue=issue,
comment_email_blacklist=comment_email_blacklist,
)
ticket_content = f"{description}\n" + "\n".join(
[f"Comment: {comment}" for comment in comments if comment]
)
# Check ticket size
if len(ticket_content.encode("utf-8")) > JIRA_CONNECTOR_MAX_TICKET_SIZE:
logger.info(
f"Skipping {issue.key} because it exceeds the maximum size of "
f"{JIRA_CONNECTOR_MAX_TICKET_SIZE} bytes."
)
continue
        page_url = build_jira_url(jira_client, issue.key)
people = set()
try:
creator = best_effort_get_field_from_issue(issue, "creator")
if basic_expert_info := best_effort_basic_expert_info(creator):
people.add(basic_expert_info)
except Exception:
# Author should exist but if not, doesn't matter
pass
try:
assignee = best_effort_get_field_from_issue(issue, "assignee")
if basic_expert_info := best_effort_basic_expert_info(assignee):
people.add(basic_expert_info)
except Exception:
            # Assignee may not be set; if so, it doesn't matter
pass
metadata_dict = {}
if priority := best_effort_get_field_from_issue(issue, "priority"):
metadata_dict["priority"] = priority.name
if status := best_effort_get_field_from_issue(issue, "status"):
metadata_dict["status"] = status.name
if resolution := best_effort_get_field_from_issue(issue, "resolution"):
metadata_dict["resolution"] = resolution.name
if labels := best_effort_get_field_from_issue(issue, "labels"):
metadata_dict["label"] = labels
yield Document(
id=page_url,
sections=[Section(link=page_url, text=ticket_content)],
source=DocumentSource.JIRA,
semantic_identifier=issue.fields.summary,
doc_updated_at=time_str_to_utc(issue.fields.updated),
primary_owners=list(people) or None,
# TODO add secondary_owners (commenters) if needed
metadata=metadata_dict,
)


class JiraConnector(LoadConnector, PollConnector, SlimConnector):
def __init__(
self,
jira_project_url: str,
comment_email_blacklist: list[str] | None = None,
batch_size: int = INDEX_BATCH_SIZE,
# if a ticket has one of the labels specified in this list, we will just
# skip it. This is generally used to avoid indexing extra sensitive
# tickets.
labels_to_skip: list[str] = JIRA_CONNECTOR_LABELS_TO_SKIP,
) -> None:
self.batch_size = batch_size
self.jira_base, self._jira_project = extract_jira_project(jira_project_url)
self._jira_client: JIRA | None = None
self._comment_email_blacklist = comment_email_blacklist or []
self.labels_to_skip = set(labels_to_skip)

    @property
    def comment_email_blacklist(self) -> tuple[str, ...]:
        return tuple(email.strip() for email in self._comment_email_blacklist)

    @property
def jira_client(self) -> JIRA:
if self._jira_client is None:
raise ConnectorMissingCredentialError("Jira")
return self._jira_client

    @property
def quoted_jira_project(self) -> str:
# Quote the project name to handle reserved words
return f'"{self._jira_project}"'

    def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
self._jira_client = build_jira_client(
credentials=credentials,
jira_base=self.jira_base,
)
return None

    def load_from_state(self) -> GenerateDocumentsOutput:
jql = f"project = {self.quoted_jira_project}"
document_batch = []
for doc in fetch_jira_issues_batch(
jira_client=self.jira_client,
jql=jql,
batch_size=_JIRA_FULL_PAGE_SIZE,
comment_email_blacklist=self.comment_email_blacklist,
labels_to_skip=self.labels_to_skip,
):
document_batch.append(doc)
if len(document_batch) >= self.batch_size:
yield document_batch
document_batch = []
yield document_batch

    def poll_source(
self, start: SecondsSinceUnixEpoch, end: SecondsSinceUnixEpoch
) -> GenerateDocumentsOutput:
start_date_str = datetime.fromtimestamp(start, tz=timezone.utc).strftime(
"%Y-%m-%d %H:%M"
)
end_date_str = datetime.fromtimestamp(end, tz=timezone.utc).strftime(
"%Y-%m-%d %H:%M"
)
jql = (
f"project = {self.quoted_jira_project} AND "
f"updated >= '{start_date_str}' AND "
f"updated <= '{end_date_str}'"
)
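        # With hypothetical bounds, the JQL above renders as:
        #   project = "ONYX" AND updated >= '2024-12-01 00:00' AND updated <= '2024-12-13 17:00'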
document_batch = []
for doc in fetch_jira_issues_batch(
jira_client=self.jira_client,
jql=jql,
batch_size=_JIRA_FULL_PAGE_SIZE,
comment_email_blacklist=self.comment_email_blacklist,
labels_to_skip=self.labels_to_skip,
):
document_batch.append(doc)
if len(document_batch) >= self.batch_size:
yield document_batch
document_batch = []
yield document_batch

    def retrieve_all_slim_documents(
self,
start: SecondsSinceUnixEpoch | None = None,
end: SecondsSinceUnixEpoch | None = None,
) -> GenerateSlimDocumentOutput:
jql = f"project = {self.quoted_jira_project}"
slim_doc_batch = []
for issue in _paginate_jql_search(
jira_client=self.jira_client,
jql=jql,
max_results=_JIRA_SLIM_PAGE_SIZE,
fields="key",
):
            # issue.key is always present on the Issue resource itself; it is
            # not part of `fields`, so the best-effort field lookup would miss it
            issue_url = build_jira_url(self.jira_client, issue.key)
            slim_doc_batch.append(
                SlimDocument(
                    id=issue_url,
                    perm_sync_data=None,
                )
            )
if len(slim_doc_batch) >= _JIRA_SLIM_PAGE_SIZE:
yield slim_doc_batch
slim_doc_batch = []
yield slim_doc_batch


if __name__ == "__main__":
connector = JiraConnector(
os.environ["JIRA_PROJECT_URL"], comment_email_blacklist=[]
)
connector.load_credentials(
{
"jira_user_email": os.environ["JIRA_USER_EMAIL"],
"jira_api_token": os.environ["JIRA_API_TOKEN"],
}
)
document_batches = connector.load_from_state()
print(next(document_batches))
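
# Smoke test (env var names from the code above; file path assumed):
#   JIRA_PROJECT_URL=... JIRA_USER_EMAIL=... JIRA_API_TOKEN=... \
#       python connector.py
# prints the first batch of Documents yielded by load_from_state().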

backend/onyx/connectors/onyx_jira/utils.py

@@ -0,0 +1,211 @@
"""Module with custom fields processing functions"""
import os
from typing import Any
from urllib.parse import urlparse
from jira import JIRA
from jira.resources import CustomFieldOption
from jira.resources import Issue
from jira.resources import User
from onyx.connectors.models import BasicExpertInfo
from onyx.utils.logger import setup_logger

logger = setup_logger()

PROJECT_URL_PAT = "projects"
JIRA_API_VERSION = os.environ.get("JIRA_API_VERSION") or "2"


def best_effort_basic_expert_info(obj: Any) -> BasicExpertInfo | None:
display_name = None
email = None
if hasattr(obj, "display_name"):
display_name = obj.display_name
else:
display_name = obj.get("displayName")
if hasattr(obj, "emailAddress"):
email = obj.emailAddress
else:
email = obj.get("emailAddress")
if not email and not display_name:
return None
return BasicExpertInfo(display_name=display_name, email=email)
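

# Accepts both jira resource objects (attribute access) and raw API dicts
# (.get()). A hypothetical {"displayName": "Ada", "emailAddress": "ada@example.com"}
# yields BasicExpertInfo(display_name="Ada", email="ada@example.com").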


def best_effort_get_field_from_issue(jira_issue: Issue, field: str) -> Any:
if hasattr(jira_issue.fields, field):
return getattr(jira_issue.fields, field)
try:
return jira_issue.raw["fields"][field]
except Exception:
return None


def extract_text_from_adf(adf: dict | None) -> str:
"""Extracts plain text from Atlassian Document Format:
https://developer.atlassian.com/cloud/jira/platform/apis/document/structure/
WARNING: This function is incomplete and will e.g. skip lists!
"""
texts = []
if adf is not None and "content" in adf:
for block in adf["content"]:
if "content" in block:
for item in block["content"]:
if item["type"] == "text":
texts.append(item["text"])
return " ".join(texts)


def build_jira_url(jira_client: JIRA, issue_key: str) -> str:
return f"{jira_client.client_info()}/browse/{issue_key}"


def build_jira_client(credentials: dict[str, Any], jira_base: str) -> JIRA:
    api_token = credentials["jira_api_token"]
    # if the user provides an email, we assume it's a Jira Cloud instance
    if "jira_user_email" in credentials:
        email = credentials["jira_user_email"]
return JIRA(
basic_auth=(email, api_token),
server=jira_base,
options={"rest_api_version": JIRA_API_VERSION},
)
else:
return JIRA(
token_auth=api_token,
server=jira_base,
options={"rest_api_version": JIRA_API_VERSION},
)
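

# Credential shapes this function expects (values hypothetical):
#   Cloud:              {"jira_user_email": "me@example.com", "jira_api_token": "<api-token>"}
#   Server/Data Center: {"jira_api_token": "<personal-access-token>"}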


def extract_jira_project(url: str) -> tuple[str, str]:
parsed_url = urlparse(url)
jira_base = parsed_url.scheme + "://" + parsed_url.netloc
# Split the path by '/' and find the position of 'projects' to get the project name
split_path = parsed_url.path.split("/")
if PROJECT_URL_PAT in split_path:
project_pos = split_path.index(PROJECT_URL_PAT)
if len(split_path) > project_pos + 1:
jira_project = split_path[project_pos + 1]
else:
raise ValueError("No project name found in the URL")
else:
raise ValueError("'projects' not found in the URL")
return jira_base, jira_project
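

# e.g. (hypothetical URL):
#   extract_jira_project("https://example.atlassian.net/jira/projects/ONYX/boards/1")
#   -> ("https://example.atlassian.net", "ONYX")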


def get_comment_strs(
issue: Issue, comment_email_blacklist: tuple[str, ...] = ()
) -> list[str]:
comment_strs = []
for comment in issue.fields.comment.comments:
try:
body_text = (
comment.body
if JIRA_API_VERSION == "2"
else extract_text_from_adf(comment.raw["body"])
)
if (
hasattr(comment, "author")
and hasattr(comment.author, "emailAddress")
and comment.author.emailAddress in comment_email_blacklist
):
continue # Skip adding comment if author's email is in blacklist
comment_strs.append(body_text)
except Exception as e:
logger.error(f"Failed to process comment due to an error: {e}")
continue
return comment_strs


class CustomFieldExtractor:
@staticmethod
def _process_custom_field_value(value: Any) -> str:
"""
Process a custom field value to a string
"""
try:
if isinstance(value, str):
return value
elif isinstance(value, CustomFieldOption):
return value.value
elif isinstance(value, User):
return value.displayName
            elif isinstance(value, list):
return " ".join(
[CustomFieldExtractor._process_custom_field_value(v) for v in value]
)
else:
return str(value)
except Exception as e:
logger.error(f"Error processing custom field value {value}: {e}")
return ""

    @staticmethod
def get_issue_custom_fields(
jira: Issue, custom_fields: dict, max_value_length: int = 250
) -> dict:
"""
Process all custom fields of an issue to a dictionary of strings
:param jira: jira_issue, bug or similar
:param custom_fields: custom fields dictionary
:param max_value_length: maximum length of the value to be processed, if exceeded, it will be truncated
"""
issue_custom_fields = {
custom_fields[key]: value
for key, value in jira.fields.__dict__.items()
if value and key in custom_fields.keys()
}
processed_fields = {}
if issue_custom_fields:
for key, value in issue_custom_fields.items():
processed = CustomFieldExtractor._process_custom_field_value(value)
                # Some plugins emit very long, purely technical values, so the
                # max length parameter lets us skip them
if len(processed) < max_value_length:
processed_fields[key] = processed
return processed_fields

    @staticmethod
def get_all_custom_fields(jira_client: JIRA) -> dict:
"""Get all custom fields from Jira"""
fields = jira_client.fields()
fields_dct = {
field["id"]: field["name"] for field in fields if field["custom"] is True
}
return fields_dct
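
    # Typical flow (sketch): resolve field ids to names once per client,
    # then reuse the mapping for every issue:
    #   custom_fields = CustomFieldExtractor.get_all_custom_fields(jira_client)
    #   extras = CustomFieldExtractor.get_issue_custom_fields(issue, custom_fields)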


class CommonFieldExtractor:
@staticmethod
def get_issue_common_fields(jira: Issue) -> dict:
return {
"Priority": jira.fields.priority.name if jira.fields.priority else None,
"Reporter": jira.fields.reporter.displayName
if jira.fields.reporter
else None,
"Assignee": jira.fields.assignee.displayName
if jira.fields.assignee
else None,
"Status": jira.fields.status.name if jira.fields.status else None,
"Resolution": jira.fields.resolution.name
if jira.fields.resolution
else None,
}
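

# Combined usage sketch (variable names hypothetical):
#   fields = {
#       **CommonFieldExtractor.get_issue_common_fields(issue),
#       **CustomFieldExtractor.get_issue_custom_fields(issue, custom_fields),
#   }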