danswer/backend/onyx/connectors/onyx_jira/connector.py

import os
from collections.abc import Iterable
from datetime import datetime
from datetime import timezone
from typing import Any

from jira import JIRA
from jira.resources import Issue

from onyx.configs.app_configs import INDEX_BATCH_SIZE
from onyx.configs.app_configs import JIRA_CONNECTOR_LABELS_TO_SKIP
from onyx.configs.app_configs import JIRA_CONNECTOR_MAX_TICKET_SIZE
from onyx.configs.constants import DocumentSource
from onyx.connectors.cross_connector_utils.miscellaneous_utils import time_str_to_utc
from onyx.connectors.interfaces import GenerateDocumentsOutput
from onyx.connectors.interfaces import GenerateSlimDocumentOutput
from onyx.connectors.interfaces import LoadConnector
from onyx.connectors.interfaces import PollConnector
from onyx.connectors.interfaces import SecondsSinceUnixEpoch
from onyx.connectors.interfaces import SlimConnector
from onyx.connectors.models import ConnectorMissingCredentialError
from onyx.connectors.models import Document
from onyx.connectors.models import Section
from onyx.connectors.models import SlimDocument
from onyx.connectors.onyx_jira.utils import best_effort_basic_expert_info
from onyx.connectors.onyx_jira.utils import best_effort_get_field_from_issue
from onyx.connectors.onyx_jira.utils import build_jira_client
from onyx.connectors.onyx_jira.utils import build_jira_url
from onyx.connectors.onyx_jira.utils import extract_jira_project
from onyx.connectors.onyx_jira.utils import extract_text_from_adf
from onyx.connectors.onyx_jira.utils import get_comment_strs
from onyx.utils.logger import setup_logger


logger = setup_logger()

JIRA_API_VERSION = os.environ.get("JIRA_API_VERSION") or "2"
_JIRA_SLIM_PAGE_SIZE = 500
_JIRA_FULL_PAGE_SIZE = 50


def _paginate_jql_search(
    jira_client: JIRA,
    jql: str,
    max_results: int,
    fields: str | None = None,
) -> Iterable[Issue]:
    start = 0
    while True:
        logger.debug(
            f"Fetching Jira issues with JQL: {jql}, "
            f"starting at {start}, max results: {max_results}"
        )
        issues = jira_client.search_issues(
            jql_str=jql,
            startAt=start,
            maxResults=max_results,
            fields=fields,
        )

        for issue in issues:
            if isinstance(issue, Issue):
                yield issue
            else:
                raise Exception(f"Found Jira object not of type Issue: {issue}")

        if len(issues) < max_results:
            break

        start += max_results


def fetch_jira_issues_batch(
    jira_client: JIRA,
    jql: str,
    batch_size: int,
    comment_email_blacklist: tuple[str, ...] = (),
    labels_to_skip: set[str] | None = None,
) -> Iterable[Document]:
    for issue in _paginate_jql_search(
        jira_client=jira_client,
        jql=jql,
        max_results=batch_size,
    ):
        if labels_to_skip:
            if any(label in issue.fields.labels for label in labels_to_skip):
                logger.info(
                    f"Skipping {issue.key} because it has a label to skip. Found "
                    f"labels: {issue.fields.labels}. Labels to skip: {labels_to_skip}."
                )
                continue

        description = (
            issue.fields.description
            if JIRA_API_VERSION == "2"
            else extract_text_from_adf(issue.raw["fields"]["description"])
        )
        comments = get_comment_strs(
            issue=issue,
            comment_email_blacklist=comment_email_blacklist,
        )
        ticket_content = f"{description}\n" + "\n".join(
            [f"Comment: {comment}" for comment in comments if comment]
        )

        # Check ticket size
        if len(ticket_content.encode("utf-8")) > JIRA_CONNECTOR_MAX_TICKET_SIZE:
            logger.info(
                f"Skipping {issue.key} because it exceeds the maximum size of "
                f"{JIRA_CONNECTOR_MAX_TICKET_SIZE} bytes."
            )
            continue

        page_url = f"{jira_client.client_info()}/browse/{issue.key}"

        people = set()
        try:
            creator = best_effort_get_field_from_issue(issue, "creator")
            if basic_expert_info := best_effort_basic_expert_info(creator):
                people.add(basic_expert_info)
        except Exception:
            # Author should exist but if not, doesn't matter
            pass

        try:
            assignee = best_effort_get_field_from_issue(issue, "assignee")
            if basic_expert_info := best_effort_basic_expert_info(assignee):
                people.add(basic_expert_info)
        except Exception:
            # Author should exist but if not, doesn't matter
            pass

        metadata_dict = {}
        if priority := best_effort_get_field_from_issue(issue, "priority"):
            metadata_dict["priority"] = priority.name
        if status := best_effort_get_field_from_issue(issue, "status"):
            metadata_dict["status"] = status.name
        if resolution := best_effort_get_field_from_issue(issue, "resolution"):
            metadata_dict["resolution"] = resolution.name
        if labels := best_effort_get_field_from_issue(issue, "labels"):
            metadata_dict["label"] = labels

        yield Document(
            id=page_url,
            sections=[Section(link=page_url, text=ticket_content)],
            source=DocumentSource.JIRA,
            semantic_identifier=issue.fields.summary,
            doc_updated_at=time_str_to_utc(issue.fields.updated),
            primary_owners=list(people) or None,
            # TODO add secondary_owners (commenters) if needed
            metadata=metadata_dict,
        )


class JiraConnector(LoadConnector, PollConnector, SlimConnector):
    def __init__(
        self,
        jira_project_url: str,
        comment_email_blacklist: list[str] | None = None,
        batch_size: int = INDEX_BATCH_SIZE,
        # if a ticket has one of the labels specified in this list, we will just
        # skip it. This is generally used to avoid indexing extra sensitive
        # tickets.
        labels_to_skip: list[str] = JIRA_CONNECTOR_LABELS_TO_SKIP,
    ) -> None:
        self.batch_size = batch_size
        self.jira_base, self._jira_project = extract_jira_project(jira_project_url)
        self._jira_client: JIRA | None = None
        self._comment_email_blacklist = comment_email_blacklist or []

        self.labels_to_skip = set(labels_to_skip)

    @property
    def comment_email_blacklist(self) -> tuple:
        return tuple(email.strip() for email in self._comment_email_blacklist)

    @property
    def jira_client(self) -> JIRA:
        if self._jira_client is None:
            raise ConnectorMissingCredentialError("Jira")
        return self._jira_client

    @property
    def quoted_jira_project(self) -> str:
        # Quote the project name to handle reserved words
        return f'"{self._jira_project}"'

    def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
        self._jira_client = build_jira_client(
            credentials=credentials,
            jira_base=self.jira_base,
        )
        return None

    def load_from_state(self) -> GenerateDocumentsOutput:
        jql = f"project = {self.quoted_jira_project}"

        document_batch = []
        for doc in fetch_jira_issues_batch(
            jira_client=self.jira_client,
            jql=jql,
            batch_size=_JIRA_FULL_PAGE_SIZE,
            comment_email_blacklist=self.comment_email_blacklist,
            labels_to_skip=self.labels_to_skip,
        ):
            document_batch.append(doc)
            if len(document_batch) >= self.batch_size:
                yield document_batch
                document_batch = []

        yield document_batch

    def poll_source(
        self, start: SecondsSinceUnixEpoch, end: SecondsSinceUnixEpoch
    ) -> GenerateDocumentsOutput:
        start_date_str = datetime.fromtimestamp(start, tz=timezone.utc).strftime(
            "%Y-%m-%d %H:%M"
        )
        end_date_str = datetime.fromtimestamp(end, tz=timezone.utc).strftime(
            "%Y-%m-%d %H:%M"
        )

        jql = (
            f"project = {self.quoted_jira_project} AND "
            f"updated >= '{start_date_str}' AND "
            f"updated <= '{end_date_str}'"
        )

        document_batch = []
        for doc in fetch_jira_issues_batch(
            jira_client=self.jira_client,
            jql=jql,
            batch_size=_JIRA_FULL_PAGE_SIZE,
            comment_email_blacklist=self.comment_email_blacklist,
            labels_to_skip=self.labels_to_skip,
        ):
            document_batch.append(doc)
            if len(document_batch) >= self.batch_size:
                yield document_batch
                document_batch = []

        yield document_batch

    def retrieve_all_slim_documents(
        self,
        start: SecondsSinceUnixEpoch | None = None,
        end: SecondsSinceUnixEpoch | None = None,
    ) -> GenerateSlimDocumentOutput:
        jql = f"project = {self.quoted_jira_project}"

        slim_doc_batch = []
        for issue in _paginate_jql_search(
            jira_client=self.jira_client,
            jql=jql,
            max_results=_JIRA_SLIM_PAGE_SIZE,
            fields="key",
        ):
            issue_key = best_effort_get_field_from_issue(issue, "key")
            id = build_jira_url(self.jira_client, issue_key)
            slim_doc_batch.append(
                SlimDocument(
                    id=id,
                    perm_sync_data=None,
                )
            )
            if len(slim_doc_batch) >= _JIRA_SLIM_PAGE_SIZE:
                yield slim_doc_batch
                slim_doc_batch = []

        yield slim_doc_batch


if __name__ == "__main__":
    import os

    connector = JiraConnector(
        os.environ["JIRA_PROJECT_URL"], comment_email_blacklist=[]
    )
    connector.load_credentials(
        {
            "jira_user_email": os.environ["JIRA_USER_EMAIL"],
            "jira_api_token": os.environ["JIRA_API_TOKEN"],
        }
    )
    document_batches = connector.load_from_state()
    print(next(document_batches))