Pull in more fields for Jira (#4547)

* Pull in more fields for Jira

* Fix tests

* Fix

* more fix

* Fix

* Fix S3 test

* fix
This commit is contained in:
Chris Weaver 2025-04-16 18:52:50 -07:00 committed by GitHub
parent fe94bdf936
commit 6df1c6c72f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 108 additions and 36 deletions

View File

@ -44,6 +44,18 @@ JIRA_API_VERSION = os.environ.get("JIRA_API_VERSION") or "2"
_JIRA_SLIM_PAGE_SIZE = 500
_JIRA_FULL_PAGE_SIZE = 50
# Constants for Jira field names.
# In the code visible here, each name is passed to
# `best_effort_get_field_from_issue` to look the field up on an issue, and
# every one except `_FIELD_KEY` is also reused as the key under which the
# resulting value is stored in the document's `metadata_dict`.
_FIELD_REPORTER = "reporter"
_FIELD_ASSIGNEE = "assignee"
_FIELD_PRIORITY = "priority"
_FIELD_STATUS = "status"
_FIELD_RESOLUTION = "resolution"
_FIELD_LABELS = "labels"
# Looked up when building slim documents; the value feeds `build_jira_url`.
_FIELD_KEY = "key"
_FIELD_CREATED = "created"
_FIELD_DUEDATE = "duedate"
_FIELD_ISSUETYPE = "issuetype"
def _perform_jql_search(
jira_client: JIRA,
@ -107,32 +119,40 @@ def process_jira_issue(
page_url = build_jira_url(jira_client, issue.key)
metadata_dict: dict[str, str | list[str]] = {}
people = set()
try:
creator = best_effort_get_field_from_issue(issue, "creator")
creator = best_effort_get_field_from_issue(issue, _FIELD_REPORTER)
if basic_expert_info := best_effort_basic_expert_info(creator):
people.add(basic_expert_info)
metadata_dict[_FIELD_REPORTER] = basic_expert_info.get_semantic_name()
except Exception:
# Author should exist but if not, doesn't matter
pass
try:
assignee = best_effort_get_field_from_issue(issue, "assignee")
assignee = best_effort_get_field_from_issue(issue, _FIELD_ASSIGNEE)
if basic_expert_info := best_effort_basic_expert_info(assignee):
people.add(basic_expert_info)
metadata_dict[_FIELD_ASSIGNEE] = basic_expert_info.get_semantic_name()
except Exception:
# Assignee may be missing; if so, it doesn't matter
pass
metadata_dict = {}
if priority := best_effort_get_field_from_issue(issue, "priority"):
metadata_dict["priority"] = priority.name
if status := best_effort_get_field_from_issue(issue, "status"):
metadata_dict["status"] = status.name
if resolution := best_effort_get_field_from_issue(issue, "resolution"):
metadata_dict["resolution"] = resolution.name
if labels := best_effort_get_field_from_issue(issue, "labels"):
metadata_dict["labels"] = labels
if priority := best_effort_get_field_from_issue(issue, _FIELD_PRIORITY):
metadata_dict[_FIELD_PRIORITY] = priority.name
if status := best_effort_get_field_from_issue(issue, _FIELD_STATUS):
metadata_dict[_FIELD_STATUS] = status.name
if resolution := best_effort_get_field_from_issue(issue, _FIELD_RESOLUTION):
metadata_dict[_FIELD_RESOLUTION] = resolution.name
if labels := best_effort_get_field_from_issue(issue, _FIELD_LABELS):
metadata_dict[_FIELD_LABELS] = labels
if created := best_effort_get_field_from_issue(issue, _FIELD_CREATED):
metadata_dict[_FIELD_CREATED] = created
if duedate := best_effort_get_field_from_issue(issue, _FIELD_DUEDATE):
metadata_dict[_FIELD_DUEDATE] = duedate
if issuetype := best_effort_get_field_from_issue(issue, _FIELD_ISSUETYPE):
metadata_dict[_FIELD_ISSUETYPE] = issuetype.name
return Document(
id=page_url,
@ -277,7 +297,7 @@ class JiraConnector(CheckpointedConnector[JiraConnectorCheckpoint], SlimConnecto
max_results=_JIRA_SLIM_PAGE_SIZE,
fields="key",
):
issue_key = best_effort_get_field_from_issue(issue, "key")
issue_key = best_effort_get_field_from_issue(issue, _FIELD_KEY)
id = build_jira_url(self.jira_client, issue_key)
slim_doc_batch.append(
SlimDocument(

View File

@ -23,8 +23,8 @@ JIRA_API_VERSION = os.environ.get("JIRA_API_VERSION") or "2"
def best_effort_basic_expert_info(obj: Any) -> BasicExpertInfo | None:
display_name = None
email = None
if hasattr(obj, "display_name"):
display_name = obj.display_name
if hasattr(obj, "displayName"):
display_name = obj.displayName
else:
display_name = obj.get("displayName")

View File

@ -9,7 +9,6 @@ from onyx.connectors.blob.connector import BlobStorageConnector
from onyx.connectors.models import Document
from onyx.connectors.models import TextSection
from onyx.file_processing.extract_file_text import ACCEPTED_DOCUMENT_FILE_EXTENSIONS
from onyx.file_processing.extract_file_text import ACCEPTED_IMAGE_FILE_EXTENSIONS
from onyx.file_processing.extract_file_text import ACCEPTED_PLAIN_TEXT_FILE_EXTENSIONS
from onyx.file_processing.extract_file_text import get_file_ext
@ -42,7 +41,8 @@ def test_blob_s3_connector(
"""
Plain and document file types should be fully indexed.
Multimedia and unknown file types will be indexed by title only with one empty section.
Multimedia and unknown file types will be skipped unless `set_allow_images`
is called with `True`.
This is intentional in order to allow searching by just the title even if we can't
index the file content.
@ -53,8 +53,7 @@ def test_blob_s3_connector(
for doc in doc_batch:
all_docs.append(doc)
#
assert len(all_docs) == 19
assert len(all_docs) == 15
for doc in all_docs:
section = doc.sections[0]
@ -69,9 +68,5 @@ def test_blob_s3_connector(
assert len(section.text) > 0
continue
if file_extension in ACCEPTED_IMAGE_FILE_EXTENSIONS:
assert len(section.text) == 0
continue
# unknown extension
assert len(section.text) == 0

View File

@ -34,7 +34,12 @@ def confluence_connector() -> ConfluenceConnector:
# This should never fail because even if the docs in the cloud change,
# the full doc ids retrieved should always be a subset of the slim doc ids
@patch(
"onyx.file_processing.extract_file_text.get_unstructured_api_key",
return_value=None,
)
def test_confluence_connector_permissions(
mock_get_api_key: MagicMock,
confluence_connector: ConfluenceConnector,
) -> None:
# Get all doc IDs from the full connector
@ -76,6 +81,8 @@ def test_confluence_connector_restriction_handling(
"confluence_username": os.environ["CONFLUENCE_USER_NAME"],
"confluence_access_token": os.environ["CONFLUENCE_ACCESS_TOKEN"],
}
# this prevents redis calls inside of OnyxConfluence
mock_provider_instance.is_dynamic.return_value = False
# Make the class return our configured instance when called
mock_db_provider_class.return_value = mock_provider_instance

View File

@ -1,9 +1,12 @@
import os
import time
from unittest.mock import MagicMock
from unittest.mock import patch
import pytest
from onyx.configs.constants import DocumentSource
from onyx.connectors.models import Document
from onyx.connectors.onyx_jira.connector import JiraConnector
from tests.daily.connectors.utils import load_all_docs_from_checkpoint_connector
@ -24,25 +27,72 @@ def jira_connector() -> JiraConnector:
return connector
def test_jira_connector_basic(jira_connector: JiraConnector) -> None:
@patch(
"onyx.file_processing.extract_file_text.get_unstructured_api_key",
return_value=None,
)
def test_jira_connector_basic(
mock_get_api_key: MagicMock, jira_connector: JiraConnector
) -> None:
docs = load_all_docs_from_checkpoint_connector(
connector=jira_connector,
start=0,
end=time.time(),
)
assert len(docs) == 1
doc = docs[0]
assert len(docs) == 2
assert doc.id == "https://danswerai.atlassian.net/browse/AS-2"
assert doc.semantic_identifier == "AS-2: test123small"
assert doc.source == DocumentSource.JIRA
assert doc.metadata == {"priority": "Medium", "status": "Backlog"}
assert doc.secondary_owners is None
assert doc.title == "AS-2 test123small"
assert doc.from_ingestion_api is False
assert doc.additional_info is None
# Find story and epic
story: Document | None = None
epic: Document | None = None
for doc in docs:
if doc.metadata["issuetype"] == "Story":
story = doc
elif doc.metadata["issuetype"] == "Epic":
epic = doc
assert len(doc.sections) == 1
section = doc.sections[0]
assert story is not None
assert epic is not None
# Check story
assert story.id == "https://danswerai.atlassian.net/browse/AS-3"
assert story.semantic_identifier == "AS-3: test123small"
assert story.source == DocumentSource.JIRA
assert story.metadata == {
"priority": "Medium",
"status": "Backlog",
"reporter": "Chris Weaver",
"assignee": "Chris Weaver",
"issuetype": "Story",
"created": "2025-04-16T16:44:06.716-0700",
}
assert story.secondary_owners is None
assert story.title == "AS-3 test123small"
assert story.from_ingestion_api is False
assert story.additional_info is None
assert len(story.sections) == 1
section = story.sections[0]
assert section.text == "example_text\n"
assert section.link == "https://danswerai.atlassian.net/browse/AS-2"
assert section.link == "https://danswerai.atlassian.net/browse/AS-3"
# Check epic
assert epic.id == "https://danswerai.atlassian.net/browse/AS-4"
assert epic.semantic_identifier == "AS-4: EPIC"
assert epic.source == DocumentSource.JIRA
assert epic.metadata == {
"priority": "Medium",
"status": "Backlog",
"reporter": "Founder Onyx",
"assignee": "Chris Weaver",
"issuetype": "Epic",
"created": "2025-04-16T16:55:53.068-0700",
}
assert epic.secondary_owners is None
assert epic.title == "AS-4 EPIC"
assert epic.from_ingestion_api is False
assert epic.additional_info is None
assert len(epic.sections) == 1
section = epic.sections[0]
assert section.text == "example_text\n"
assert section.link == "https://danswerai.atlassian.net/browse/AS-4"