Added Slim connector for Jira (#3181)

* Added Slim connector for Jira

* fixed testing

* more cleanup of Jira connector

* cleanup
This commit is contained in:
hagen-danswer 2024-11-21 09:00:20 -08:00 committed by GitHub
parent 70207b4b39
commit 100b4a0d16
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 307 additions and 231 deletions

View File

@ -1,8 +1,8 @@
import os
from collections.abc import Iterable
from datetime import datetime
from datetime import timezone
from typing import Any
from urllib.parse import urlparse
from jira import JIRA
from jira.resources import Issue
@ -12,129 +12,93 @@ from danswer.configs.app_configs import JIRA_CONNECTOR_LABELS_TO_SKIP
from danswer.configs.app_configs import JIRA_CONNECTOR_MAX_TICKET_SIZE
from danswer.configs.constants import DocumentSource
from danswer.connectors.cross_connector_utils.miscellaneous_utils import time_str_to_utc
from danswer.connectors.danswer_jira.utils import best_effort_basic_expert_info
from danswer.connectors.danswer_jira.utils import best_effort_get_field_from_issue
from danswer.connectors.danswer_jira.utils import build_jira_client
from danswer.connectors.danswer_jira.utils import build_jira_url
from danswer.connectors.danswer_jira.utils import extract_jira_project
from danswer.connectors.danswer_jira.utils import extract_text_from_adf
from danswer.connectors.danswer_jira.utils import get_comment_strs
from danswer.connectors.interfaces import GenerateDocumentsOutput
from danswer.connectors.interfaces import GenerateSlimDocumentOutput
from danswer.connectors.interfaces import LoadConnector
from danswer.connectors.interfaces import PollConnector
from danswer.connectors.interfaces import SecondsSinceUnixEpoch
from danswer.connectors.models import BasicExpertInfo
from danswer.connectors.interfaces import SlimConnector
from danswer.connectors.models import ConnectorMissingCredentialError
from danswer.connectors.models import Document
from danswer.connectors.models import Section
from danswer.connectors.models import SlimDocument
from danswer.utils.logger import setup_logger
logger = setup_logger()
PROJECT_URL_PAT = "projects"
JIRA_API_VERSION = os.environ.get("JIRA_API_VERSION") or "2"
_JIRA_SLIM_PAGE_SIZE = 500
_JIRA_FULL_PAGE_SIZE = 50
def extract_jira_project(url: str) -> tuple[str, str]:
parsed_url = urlparse(url)
jira_base = parsed_url.scheme + "://" + parsed_url.netloc
def _paginate_jql_search(
jira_client: JIRA,
jql: str,
max_results: int,
fields: str | None = None,
) -> Iterable[Issue]:
start = 0
while True:
logger.debug(
f"Fetching Jira issues with JQL: {jql}, "
f"starting at {start}, max results: {max_results}"
)
issues = jira_client.search_issues(
jql_str=jql,
startAt=start,
maxResults=max_results,
fields=fields,
)
# Split the path by '/' and find the position of 'projects' to get the project name
split_path = parsed_url.path.split("/")
if PROJECT_URL_PAT in split_path:
project_pos = split_path.index(PROJECT_URL_PAT)
if len(split_path) > project_pos + 1:
jira_project = split_path[project_pos + 1]
else:
raise ValueError("No project name found in the URL")
else:
raise ValueError("'projects' not found in the URL")
for issue in issues:
if isinstance(issue, Issue):
yield issue
else:
raise Exception(f"Found Jira object not of type Issue: {issue}")
return jira_base, jira_project
if len(issues) < max_results:
break
def extract_text_from_adf(adf: dict | None) -> str:
"""Extracts plain text from Atlassian Document Format:
https://developer.atlassian.com/cloud/jira/platform/apis/document/structure/
WARNING: This function is incomplete and will e.g. skip lists!
"""
texts = []
if adf is not None and "content" in adf:
for block in adf["content"]:
if "content" in block:
for item in block["content"]:
if item["type"] == "text":
texts.append(item["text"])
return " ".join(texts)
def best_effort_get_field_from_issue(jira_issue: Issue, field: str) -> Any:
if hasattr(jira_issue.fields, field):
return getattr(jira_issue.fields, field)
try:
return jira_issue.raw["fields"][field]
except Exception:
return None
def _get_comment_strs(
jira: Issue, comment_email_blacklist: tuple[str, ...] = ()
) -> list[str]:
comment_strs = []
for comment in jira.fields.comment.comments:
try:
body_text = (
comment.body
if JIRA_API_VERSION == "2"
else extract_text_from_adf(comment.raw["body"])
)
if (
hasattr(comment, "author")
and hasattr(comment.author, "emailAddress")
and comment.author.emailAddress in comment_email_blacklist
):
continue # Skip adding comment if author's email is in blacklist
comment_strs.append(body_text)
except Exception as e:
logger.error(f"Failed to process comment due to an error: {e}")
continue
return comment_strs
start += max_results
def fetch_jira_issues_batch(
jql: str,
start_index: int,
jira_client: JIRA,
batch_size: int = INDEX_BATCH_SIZE,
jql: str,
batch_size: int,
comment_email_blacklist: tuple[str, ...] = (),
labels_to_skip: set[str] | None = None,
) -> tuple[list[Document], int]:
doc_batch = []
batch = jira_client.search_issues(
jql,
startAt=start_index,
maxResults=batch_size,
)
for jira in batch:
if type(jira) != Issue:
logger.warning(f"Found Jira object not of type Issue {jira}")
continue
if labels_to_skip and any(
label in jira.fields.labels for label in labels_to_skip
):
logger.info(
f"Skipping {jira.key} because it has a label to skip. Found "
f"labels: {jira.fields.labels}. Labels to skip: {labels_to_skip}."
)
continue
) -> Iterable[Document]:
for issue in _paginate_jql_search(
jira_client=jira_client,
jql=jql,
max_results=batch_size,
):
if labels_to_skip:
if any(label in issue.fields.labels for label in labels_to_skip):
logger.info(
f"Skipping {issue.key} because it has a label to skip. Found "
f"labels: {issue.fields.labels}. Labels to skip: {labels_to_skip}."
)
continue
description = (
jira.fields.description
issue.fields.description
if JIRA_API_VERSION == "2"
else extract_text_from_adf(jira.raw["fields"]["description"])
else extract_text_from_adf(issue.raw["fields"]["description"])
)
comments = get_comment_strs(
issue=issue,
comment_email_blacklist=comment_email_blacklist,
)
comments = _get_comment_strs(jira, comment_email_blacklist)
ticket_content = f"{description}\n" + "\n".join(
[f"Comment: {comment}" for comment in comments if comment]
)
@ -142,66 +106,53 @@ def fetch_jira_issues_batch(
# Check ticket size
if len(ticket_content.encode("utf-8")) > JIRA_CONNECTOR_MAX_TICKET_SIZE:
logger.info(
f"Skipping {jira.key} because it exceeds the maximum size of "
f"Skipping {issue.key} because it exceeds the maximum size of "
f"{JIRA_CONNECTOR_MAX_TICKET_SIZE} bytes."
)
continue
page_url = f"{jira_client.client_info()}/browse/{jira.key}"
page_url = f"{jira_client.client_info()}/browse/{issue.key}"
people = set()
try:
people.add(
BasicExpertInfo(
display_name=jira.fields.creator.displayName,
email=jira.fields.creator.emailAddress,
)
)
creator = best_effort_get_field_from_issue(issue, "creator")
if basic_expert_info := best_effort_basic_expert_info(creator):
people.add(basic_expert_info)
except Exception:
# Author should exist but if not, doesn't matter
pass
try:
people.add(
BasicExpertInfo(
display_name=jira.fields.assignee.displayName, # type: ignore
email=jira.fields.assignee.emailAddress, # type: ignore
)
)
assignee = best_effort_get_field_from_issue(issue, "assignee")
if basic_expert_info := best_effort_basic_expert_info(assignee):
people.add(basic_expert_info)
except Exception:
# Author should exist but if not, doesn't matter
pass
metadata_dict = {}
priority = best_effort_get_field_from_issue(jira, "priority")
if priority:
if priority := best_effort_get_field_from_issue(issue, "priority"):
metadata_dict["priority"] = priority.name
status = best_effort_get_field_from_issue(jira, "status")
if status:
if status := best_effort_get_field_from_issue(issue, "status"):
metadata_dict["status"] = status.name
resolution = best_effort_get_field_from_issue(jira, "resolution")
if resolution:
if resolution := best_effort_get_field_from_issue(issue, "resolution"):
metadata_dict["resolution"] = resolution.name
labels = best_effort_get_field_from_issue(jira, "labels")
if labels:
if labels := best_effort_get_field_from_issue(issue, "labels"):
metadata_dict["label"] = labels
doc_batch.append(
Document(
id=page_url,
sections=[Section(link=page_url, text=ticket_content)],
source=DocumentSource.JIRA,
semantic_identifier=jira.fields.summary,
doc_updated_at=time_str_to_utc(jira.fields.updated),
primary_owners=list(people) or None,
# TODO add secondary_owners (commenters) if needed
metadata=metadata_dict,
)
yield Document(
id=page_url,
sections=[Section(link=page_url, text=ticket_content)],
source=DocumentSource.JIRA,
semantic_identifier=issue.fields.summary,
doc_updated_at=time_str_to_utc(issue.fields.updated),
primary_owners=list(people) or None,
# TODO add secondary_owners (commenters) if needed
metadata=metadata_dict,
)
return doc_batch, len(batch)
class JiraConnector(LoadConnector, PollConnector):
class JiraConnector(LoadConnector, PollConnector, SlimConnector):
def __init__(
self,
jira_project_url: str,
@ -213,8 +164,8 @@ class JiraConnector(LoadConnector, PollConnector):
labels_to_skip: list[str] = JIRA_CONNECTOR_LABELS_TO_SKIP,
) -> None:
self.batch_size = batch_size
self.jira_base, self.jira_project = extract_jira_project(jira_project_url)
self.jira_client: JIRA | None = None
self.jira_base, self._jira_project = extract_jira_project(jira_project_url)
self._jira_client: JIRA | None = None
self._comment_email_blacklist = comment_email_blacklist or []
self.labels_to_skip = set(labels_to_skip)
@ -223,54 +174,45 @@ class JiraConnector(LoadConnector, PollConnector):
def comment_email_blacklist(self) -> tuple:
return tuple(email.strip() for email in self._comment_email_blacklist)
@property
def jira_client(self) -> JIRA:
    """Return the Jira client stored on this connector.

    Raises:
        ConnectorMissingCredentialError: if ``_jira_client`` is still None,
            i.e. no client has been stored on the connector yet.
    """
    client = self._jira_client
    if client is not None:
        return client
    raise ConnectorMissingCredentialError("Jira")
@property
def quoted_jira_project(self) -> str:
    """Return the project name wrapped in double quotes for use in JQL.

    Quoting lets project names that collide with reserved words be used
    safely inside a query.
    """
    return '"{}"'.format(self._jira_project)
def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
api_token = credentials["jira_api_token"]
# if user provide an email we assume it's cloud
if "jira_user_email" in credentials:
email = credentials["jira_user_email"]
self.jira_client = JIRA(
basic_auth=(email, api_token),
server=self.jira_base,
options={"rest_api_version": JIRA_API_VERSION},
)
else:
self.jira_client = JIRA(
token_auth=api_token,
server=self.jira_base,
options={"rest_api_version": JIRA_API_VERSION},
)
self._jira_client = build_jira_client(
credentials=credentials,
jira_base=self.jira_base,
)
return None
def load_from_state(self) -> GenerateDocumentsOutput:
if self.jira_client is None:
raise ConnectorMissingCredentialError("Jira")
jql = f"project = {self.quoted_jira_project}"
# Quote the project name to handle reserved words
quoted_project = f'"{self.jira_project}"'
start_ind = 0
while True:
doc_batch, fetched_batch_size = fetch_jira_issues_batch(
jql=f"project = {quoted_project}",
start_index=start_ind,
jira_client=self.jira_client,
batch_size=self.batch_size,
comment_email_blacklist=self.comment_email_blacklist,
labels_to_skip=self.labels_to_skip,
)
document_batch = []
for doc in fetch_jira_issues_batch(
jira_client=self.jira_client,
jql=jql,
batch_size=_JIRA_FULL_PAGE_SIZE,
comment_email_blacklist=self.comment_email_blacklist,
labels_to_skip=self.labels_to_skip,
):
document_batch.append(doc)
if len(document_batch) >= self.batch_size:
yield document_batch
document_batch = []
if doc_batch:
yield doc_batch
start_ind += fetched_batch_size
if fetched_batch_size < self.batch_size:
break
yield document_batch
def poll_source(
self, start: SecondsSinceUnixEpoch, end: SecondsSinceUnixEpoch
) -> GenerateDocumentsOutput:
if self.jira_client is None:
raise ConnectorMissingCredentialError("Jira")
start_date_str = datetime.fromtimestamp(start, tz=timezone.utc).strftime(
"%Y-%m-%d %H:%M"
)
@ -278,31 +220,54 @@ class JiraConnector(LoadConnector, PollConnector):
"%Y-%m-%d %H:%M"
)
# Quote the project name to handle reserved words
quoted_project = f'"{self.jira_project}"'
jql = (
f"project = {quoted_project} AND "
f"project = {self.quoted_jira_project} AND "
f"updated >= '{start_date_str}' AND "
f"updated <= '{end_date_str}'"
)
start_ind = 0
while True:
doc_batch, fetched_batch_size = fetch_jira_issues_batch(
jql=jql,
start_index=start_ind,
jira_client=self.jira_client,
batch_size=self.batch_size,
comment_email_blacklist=self.comment_email_blacklist,
labels_to_skip=self.labels_to_skip,
document_batch = []
for doc in fetch_jira_issues_batch(
jira_client=self.jira_client,
jql=jql,
batch_size=_JIRA_FULL_PAGE_SIZE,
comment_email_blacklist=self.comment_email_blacklist,
labels_to_skip=self.labels_to_skip,
):
document_batch.append(doc)
if len(document_batch) >= self.batch_size:
yield document_batch
document_batch = []
yield document_batch
def retrieve_all_slim_documents(
self,
start: SecondsSinceUnixEpoch | None = None,
end: SecondsSinceUnixEpoch | None = None,
) -> GenerateSlimDocumentOutput:
jql = f"project = {self.quoted_jira_project}"
slim_doc_batch = []
for issue in _paginate_jql_search(
jira_client=self.jira_client,
jql=jql,
max_results=_JIRA_SLIM_PAGE_SIZE,
fields="key",
):
issue_key = best_effort_get_field_from_issue(issue, "key")
id = build_jira_url(self.jira_client, issue_key)
slim_doc_batch.append(
SlimDocument(
id=id,
perm_sync_data=None,
)
)
if len(slim_doc_batch) >= _JIRA_SLIM_PAGE_SIZE:
yield slim_doc_batch
slim_doc_batch = []
if doc_batch:
yield doc_batch
start_ind += fetched_batch_size
if fetched_batch_size < self.batch_size:
break
yield slim_doc_batch
if __name__ == "__main__":

View File

@ -1,17 +1,136 @@
"""Module with custom fields processing functions"""
import os
from typing import Any
from typing import List
from urllib.parse import urlparse
from jira import JIRA
from jira.resources import CustomFieldOption
from jira.resources import Issue
from jira.resources import User
from danswer.connectors.models import BasicExpertInfo
from danswer.utils.logger import setup_logger
# Module-level logger shared by the helpers in this file.
logger = setup_logger()
# Path segment that precedes the project name in a Jira project URL.
PROJECT_URL_PAT = "projects"
# REST API version passed to the Jira client; taken from the environment and
# falling back to "2" when the variable is unset or empty. Helpers below treat
# "2" as plain-text bodies and anything else as Atlassian Document Format.
JIRA_API_VERSION = os.environ.get("JIRA_API_VERSION") or "2"
def best_effort_basic_expert_info(obj: Any) -> BasicExpertInfo | None:
    """Build a BasicExpertInfo from a Jira user-like object, if possible.

    ``obj`` may expose the user data as attributes (``displayName`` /
    ``emailAddress``) or through a mapping-style ``get`` method using the
    same keys. The legacy ``display_name`` attribute is still honoured so
    existing callers keep working.

    Args:
        obj: The user-like object (or None) to read from.

    Returns:
        A BasicExpertInfo when at least one of display name or email could
        be found, otherwise None. Never raises for a missing field.
    """
    if obj is None:
        return None

    def _lookup(*names: str) -> Any:
        # Prefer attributes; fall back to a mapping-style ``get`` only when
        # the object actually provides one, so a missing field never raises.
        for name in names:
            if hasattr(obj, name):
                return getattr(obj, name)
        getter = getattr(obj, "get", None)
        if callable(getter):
            return getter(names[0])
        return None

    # Jira exposes the camelCase ``displayName``; ``display_name`` is kept
    # as a fallback for backward compatibility.
    display_name = _lookup("displayName", "display_name")
    email = _lookup("emailAddress")

    if not email and not display_name:
        return None

    return BasicExpertInfo(display_name=display_name, email=email)
def best_effort_get_field_from_issue(jira_issue: Issue, field: str) -> Any:
    """Return ``field`` from an issue, or None when it cannot be found.

    The attribute on ``jira_issue.fields`` is tried first; if it is absent
    the raw payload (``jira_issue.raw["fields"]``) is consulted instead.
    Any failure while reading the raw payload results in None.
    """
    not_found = object()
    value = getattr(jira_issue.fields, field, not_found)
    if value is not not_found:
        return value

    try:
        raw_fields = jira_issue.raw["fields"]
        return raw_fields[field]
    except Exception:
        return None
def extract_text_from_adf(adf: dict | None) -> str:
"""Extracts plain text from Atlassian Document Format:
https://developer.atlassian.com/cloud/jira/platform/apis/document/structure/
WARNING: This function is incomplete and will e.g. skip lists!
"""
texts = []
if adf is not None and "content" in adf:
for block in adf["content"]:
if "content" in block:
for item in block["content"]:
if item["type"] == "text":
texts.append(item["text"])
return " ".join(texts)
def build_jira_url(jira_client: JIRA, issue_key: str) -> str:
    """Return the browse URL for ``issue_key``.

    The prefix is whatever ``jira_client.client_info()`` returns, followed
    by ``/browse/<issue_key>``.
    """
    base_url = jira_client.client_info()
    return f"{base_url}/browse/{issue_key}"
def build_jira_client(credentials: dict[str, Any], jira_base: str) -> JIRA:
    """Create a JIRA client for ``jira_base`` from a credentials dict.

    Args:
        credentials: Must contain ``jira_api_token``; may also contain
            ``jira_user_email``.
        jira_base: Server URL passed to the client.

    Returns:
        A JIRA client using basic auth (email + token) when an email is
        present in ``credentials``, otherwise token auth.

    Raises:
        KeyError: if ``jira_api_token`` is missing from ``credentials``.
    """
    api_token = credentials["jira_api_token"]
    client_kwargs: dict[str, Any] = {
        "server": jira_base,
        "options": {"rest_api_version": JIRA_API_VERSION},
    }

    # if user provide an email we assume it's cloud
    if "jira_user_email" in credentials:
        client_kwargs["basic_auth"] = (credentials["jira_user_email"], api_token)
    else:
        client_kwargs["token_auth"] = api_token

    return JIRA(**client_kwargs)
def extract_jira_project(url: str) -> tuple[str, str]:
    """Split a Jira project URL into its base URL and project name.

    The project name is the path segment immediately following the
    ``projects`` segment.

    Args:
        url: A Jira project URL, e.g. ``https://host/projects/KEY/...``.

    Returns:
        ``(jira_base, jira_project)`` where ``jira_base`` is
        ``<scheme>://<netloc>``.

    Raises:
        ValueError: if the path has no ``projects`` segment, or if nothing
            (or only an empty segment, as with a trailing slash) follows it.
    """
    parsed_url = urlparse(url)
    jira_base = parsed_url.scheme + "://" + parsed_url.netloc

    # Split the path by '/' and find the position of 'projects' to get the project name
    split_path = parsed_url.path.split("/")
    if PROJECT_URL_PAT not in split_path:
        raise ValueError("'projects' not found in the URL")

    name_pos = split_path.index(PROJECT_URL_PAT) + 1
    # A trailing slash ("/projects/") leaves an empty segment; treat it the
    # same as a missing name rather than returning "" as the project.
    if name_pos >= len(split_path) or not split_path[name_pos]:
        raise ValueError("No project name found in the URL")

    return jira_base, split_path[name_pos]
def get_comment_strs(
    issue: Issue, comment_email_blacklist: tuple[str, ...] = ()
) -> list[str]:
    """Collect the comment bodies of ``issue`` as strings.

    With API version "2" the comment's ``body`` is used directly; otherwise
    the raw body is run through ``extract_text_from_adf``. Comments whose
    author email appears in ``comment_email_blacklist`` are left out. A
    comment that fails to process is logged and skipped.
    """
    collected = []
    for comment in issue.fields.comment.comments:
        try:
            if JIRA_API_VERSION == "2":
                text = comment.body
            else:
                text = extract_text_from_adf(comment.raw["body"])

            is_blacklisted = (
                hasattr(comment, "author")
                and hasattr(comment.author, "emailAddress")
                and comment.author.emailAddress in comment_email_blacklist
            )
            # Only keep comments whose author is not on the blacklist
            if not is_blacklisted:
                collected.append(text)
        except Exception as e:
            logger.error(f"Failed to process comment due to an error: {e}")
            continue

    return collected
class CustomFieldExtractor:
@staticmethod
def _process_custom_field_value(value: Any) -> str:

View File

@ -1,4 +1,3 @@
from collections.abc import Callable
from collections.abc import Generator
from typing import Any
from unittest.mock import MagicMock
@ -18,49 +17,48 @@ def mock_jira_client() -> MagicMock:
@pytest.fixture
def mock_issue_small() -> MagicMock:
issue = MagicMock()
issue.key = "SMALL-1"
issue.fields.description = "Small description"
issue.fields.comment.comments = [
issue = MagicMock(spec=Issue)
fields = MagicMock()
fields.description = "Small description"
fields.comment = MagicMock()
fields.comment.comments = [
MagicMock(body="Small comment 1"),
MagicMock(body="Small comment 2"),
]
issue.fields.creator.displayName = "John Doe"
issue.fields.creator.emailAddress = "john@example.com"
issue.fields.summary = "Small Issue"
issue.fields.updated = "2023-01-01T00:00:00+0000"
issue.fields.labels = []
fields.creator = MagicMock()
fields.creator.displayName = "John Doe"
fields.creator.emailAddress = "john@example.com"
fields.summary = "Small Issue"
fields.updated = "2023-01-01T00:00:00+0000"
fields.labels = []
issue.fields = fields
issue.key = "SMALL-1"
return issue
@pytest.fixture
def mock_issue_large() -> MagicMock:
# This will be larger than 100KB
issue = MagicMock()
issue.key = "LARGE-1"
issue.fields.description = "a" * 99_000
issue.fields.comment.comments = [
issue = MagicMock(spec=Issue)
fields = MagicMock()
fields.description = "a" * 99_000
fields.comment = MagicMock()
fields.comment.comments = [
MagicMock(body="Large comment " * 1000),
MagicMock(body="Another large comment " * 1000),
]
issue.fields.creator.displayName = "Jane Doe"
issue.fields.creator.emailAddress = "jane@example.com"
issue.fields.summary = "Large Issue"
issue.fields.updated = "2023-01-02T00:00:00+0000"
issue.fields.labels = []
fields.creator = MagicMock()
fields.creator.displayName = "Jane Doe"
fields.creator.emailAddress = "jane@example.com"
fields.summary = "Large Issue"
fields.updated = "2023-01-02T00:00:00+0000"
fields.labels = []
issue.fields = fields
issue.key = "LARGE-1"
return issue
@pytest.fixture
def patched_type() -> Callable[[Any], type]:
def _patched_type(obj: Any) -> type:
if isinstance(obj, MagicMock):
return Issue
return type(obj)
return _patched_type
@pytest.fixture
def mock_jira_api_version() -> Generator[Any, Any, Any]:
with patch("danswer.connectors.danswer_jira.connector.JIRA_API_VERSION", "2"):
@ -69,11 +67,9 @@ def mock_jira_api_version() -> Generator[Any, Any, Any]:
@pytest.fixture
def patched_environment(
patched_type: type,
mock_jira_api_version: MockFixture,
) -> Generator[Any, Any, Any]:
with patch("danswer.connectors.danswer_jira.connector.type", patched_type):
yield
yield
def test_fetch_jira_issues_batch_small_ticket(
@ -83,9 +79,8 @@ def test_fetch_jira_issues_batch_small_ticket(
) -> None:
mock_jira_client.search_issues.return_value = [mock_issue_small]
docs, count = fetch_jira_issues_batch("project = TEST", 0, mock_jira_client)
docs = list(fetch_jira_issues_batch(mock_jira_client, "project = TEST", 50))
assert count == 1
assert len(docs) == 1
assert docs[0].id.endswith("/SMALL-1")
assert "Small description" in docs[0].sections[0].text
@ -100,9 +95,8 @@ def test_fetch_jira_issues_batch_large_ticket(
) -> None:
mock_jira_client.search_issues.return_value = [mock_issue_large]
docs, count = fetch_jira_issues_batch("project = TEST", 0, mock_jira_client)
docs = list(fetch_jira_issues_batch(mock_jira_client, "project = TEST", 50))
assert count == 1
assert len(docs) == 0 # The large ticket should be skipped
@ -114,9 +108,8 @@ def test_fetch_jira_issues_batch_mixed_tickets(
) -> None:
mock_jira_client.search_issues.return_value = [mock_issue_small, mock_issue_large]
docs, count = fetch_jira_issues_batch("project = TEST", 0, mock_jira_client)
docs = list(fetch_jira_issues_batch(mock_jira_client, "project = TEST", 50))
assert count == 2
assert len(docs) == 1 # Only the small ticket should be included
assert docs[0].id.endswith("/SMALL-1")
@ -130,7 +123,6 @@ def test_fetch_jira_issues_batch_custom_size_limit(
) -> None:
mock_jira_client.search_issues.return_value = [mock_issue_small, mock_issue_large]
docs, count = fetch_jira_issues_batch("project = TEST", 0, mock_jira_client)
docs = list(fetch_jira_issues_batch(mock_jira_client, "project = TEST", 50))
assert count == 2
assert len(docs) == 0 # Both tickets should be skipped due to the low size limit