Pull in more fields for Jira (#4547)

* Pull in more fields for Jira

* Fix tests

* Fix

* more fix

* Fix

* Fix S3 test

* fix
This commit is contained in:
Chris Weaver 2025-04-16 18:52:50 -07:00 committed by GitHub
parent fe94bdf936
commit 6df1c6c72f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 108 additions and 36 deletions

View File

@ -44,6 +44,18 @@ JIRA_API_VERSION = os.environ.get("JIRA_API_VERSION") or "2"
_JIRA_SLIM_PAGE_SIZE = 500 _JIRA_SLIM_PAGE_SIZE = 500
_JIRA_FULL_PAGE_SIZE = 50 _JIRA_FULL_PAGE_SIZE = 50
# Constants for Jira field names
_FIELD_REPORTER = "reporter"
_FIELD_ASSIGNEE = "assignee"
_FIELD_PRIORITY = "priority"
_FIELD_STATUS = "status"
_FIELD_RESOLUTION = "resolution"
_FIELD_LABELS = "labels"
_FIELD_KEY = "key"
_FIELD_CREATED = "created"
_FIELD_DUEDATE = "duedate"
_FIELD_ISSUETYPE = "issuetype"
def _perform_jql_search( def _perform_jql_search(
jira_client: JIRA, jira_client: JIRA,
@ -107,32 +119,40 @@ def process_jira_issue(
page_url = build_jira_url(jira_client, issue.key) page_url = build_jira_url(jira_client, issue.key)
metadata_dict: dict[str, str | list[str]] = {}
people = set() people = set()
try: try:
creator = best_effort_get_field_from_issue(issue, "creator") creator = best_effort_get_field_from_issue(issue, _FIELD_REPORTER)
if basic_expert_info := best_effort_basic_expert_info(creator): if basic_expert_info := best_effort_basic_expert_info(creator):
people.add(basic_expert_info) people.add(basic_expert_info)
metadata_dict[_FIELD_REPORTER] = basic_expert_info.get_semantic_name()
except Exception: except Exception:
# Author should exist but if not, doesn't matter # Author should exist but if not, doesn't matter
pass pass
try: try:
assignee = best_effort_get_field_from_issue(issue, "assignee") assignee = best_effort_get_field_from_issue(issue, _FIELD_ASSIGNEE)
if basic_expert_info := best_effort_basic_expert_info(assignee): if basic_expert_info := best_effort_basic_expert_info(assignee):
people.add(basic_expert_info) people.add(basic_expert_info)
metadata_dict[_FIELD_ASSIGNEE] = basic_expert_info.get_semantic_name()
except Exception: except Exception:
# Author should exist but if not, doesn't matter # Author should exist but if not, doesn't matter
pass pass
metadata_dict = {} if priority := best_effort_get_field_from_issue(issue, _FIELD_PRIORITY):
if priority := best_effort_get_field_from_issue(issue, "priority"): metadata_dict[_FIELD_PRIORITY] = priority.name
metadata_dict["priority"] = priority.name if status := best_effort_get_field_from_issue(issue, _FIELD_STATUS):
if status := best_effort_get_field_from_issue(issue, "status"): metadata_dict[_FIELD_STATUS] = status.name
metadata_dict["status"] = status.name if resolution := best_effort_get_field_from_issue(issue, _FIELD_RESOLUTION):
if resolution := best_effort_get_field_from_issue(issue, "resolution"): metadata_dict[_FIELD_RESOLUTION] = resolution.name
metadata_dict["resolution"] = resolution.name if labels := best_effort_get_field_from_issue(issue, _FIELD_LABELS):
if labels := best_effort_get_field_from_issue(issue, "labels"): metadata_dict[_FIELD_LABELS] = labels
metadata_dict["labels"] = labels if created := best_effort_get_field_from_issue(issue, _FIELD_CREATED):
metadata_dict[_FIELD_CREATED] = created
if duedate := best_effort_get_field_from_issue(issue, _FIELD_DUEDATE):
metadata_dict[_FIELD_DUEDATE] = duedate
if issuetype := best_effort_get_field_from_issue(issue, _FIELD_ISSUETYPE):
metadata_dict[_FIELD_ISSUETYPE] = issuetype.name
return Document( return Document(
id=page_url, id=page_url,
@ -277,7 +297,7 @@ class JiraConnector(CheckpointedConnector[JiraConnectorCheckpoint], SlimConnecto
max_results=_JIRA_SLIM_PAGE_SIZE, max_results=_JIRA_SLIM_PAGE_SIZE,
fields="key", fields="key",
): ):
issue_key = best_effort_get_field_from_issue(issue, "key") issue_key = best_effort_get_field_from_issue(issue, _FIELD_KEY)
id = build_jira_url(self.jira_client, issue_key) id = build_jira_url(self.jira_client, issue_key)
slim_doc_batch.append( slim_doc_batch.append(
SlimDocument( SlimDocument(

View File

@ -23,8 +23,8 @@ JIRA_API_VERSION = os.environ.get("JIRA_API_VERSION") or "2"
def best_effort_basic_expert_info(obj: Any) -> BasicExpertInfo | None: def best_effort_basic_expert_info(obj: Any) -> BasicExpertInfo | None:
display_name = None display_name = None
email = None email = None
if hasattr(obj, "display_name"): if hasattr(obj, "displayName"):
display_name = obj.display_name display_name = obj.displayName
else: else:
display_name = obj.get("displayName") display_name = obj.get("displayName")

View File

@ -9,7 +9,6 @@ from onyx.connectors.blob.connector import BlobStorageConnector
from onyx.connectors.models import Document from onyx.connectors.models import Document
from onyx.connectors.models import TextSection from onyx.connectors.models import TextSection
from onyx.file_processing.extract_file_text import ACCEPTED_DOCUMENT_FILE_EXTENSIONS from onyx.file_processing.extract_file_text import ACCEPTED_DOCUMENT_FILE_EXTENSIONS
from onyx.file_processing.extract_file_text import ACCEPTED_IMAGE_FILE_EXTENSIONS
from onyx.file_processing.extract_file_text import ACCEPTED_PLAIN_TEXT_FILE_EXTENSIONS from onyx.file_processing.extract_file_text import ACCEPTED_PLAIN_TEXT_FILE_EXTENSIONS
from onyx.file_processing.extract_file_text import get_file_ext from onyx.file_processing.extract_file_text import get_file_ext
@ -42,7 +41,8 @@ def test_blob_s3_connector(
""" """
Plain and document file types should be fully indexed. Plain and document file types should be fully indexed.
Multimedia and unknown file types will be indexed by title only with one empty section. Multimedia and unknown file types will be skipped unless `set_allow_images`
is called with `True`.
This is intentional in order to allow searching by just the title even if we can't This is intentional in order to allow searching by just the title even if we can't
index the file content. index the file content.
@ -53,8 +53,7 @@ def test_blob_s3_connector(
for doc in doc_batch: for doc in doc_batch:
all_docs.append(doc) all_docs.append(doc)
# assert len(all_docs) == 15
assert len(all_docs) == 19
for doc in all_docs: for doc in all_docs:
section = doc.sections[0] section = doc.sections[0]
@ -69,9 +68,5 @@ def test_blob_s3_connector(
assert len(section.text) > 0 assert len(section.text) > 0
continue continue
if file_extension in ACCEPTED_IMAGE_FILE_EXTENSIONS:
assert len(section.text) == 0
continue
# unknown extension # unknown extension
assert len(section.text) == 0 assert len(section.text) == 0

View File

@ -34,7 +34,12 @@ def confluence_connector() -> ConfluenceConnector:
# This should never fail because even if the docs in the cloud change, # This should never fail because even if the docs in the cloud change,
# the full doc ids retrieved should always be a subset of the slim doc ids # the full doc ids retrieved should always be a subset of the slim doc ids
@patch(
"onyx.file_processing.extract_file_text.get_unstructured_api_key",
return_value=None,
)
def test_confluence_connector_permissions( def test_confluence_connector_permissions(
mock_get_api_key: MagicMock,
confluence_connector: ConfluenceConnector, confluence_connector: ConfluenceConnector,
) -> None: ) -> None:
# Get all doc IDs from the full connector # Get all doc IDs from the full connector
@ -76,6 +81,8 @@ def test_confluence_connector_restriction_handling(
"confluence_username": os.environ["CONFLUENCE_USER_NAME"], "confluence_username": os.environ["CONFLUENCE_USER_NAME"],
"confluence_access_token": os.environ["CONFLUENCE_ACCESS_TOKEN"], "confluence_access_token": os.environ["CONFLUENCE_ACCESS_TOKEN"],
} }
# this prevents redis calls inside of OnyxConfluence
mock_provider_instance.is_dynamic.return_value = False
# Make the class return our configured instance when called # Make the class return our configured instance when called
mock_db_provider_class.return_value = mock_provider_instance mock_db_provider_class.return_value = mock_provider_instance

View File

@ -1,9 +1,12 @@
import os import os
import time import time
from unittest.mock import MagicMock
from unittest.mock import patch
import pytest import pytest
from onyx.configs.constants import DocumentSource from onyx.configs.constants import DocumentSource
from onyx.connectors.models import Document
from onyx.connectors.onyx_jira.connector import JiraConnector from onyx.connectors.onyx_jira.connector import JiraConnector
from tests.daily.connectors.utils import load_all_docs_from_checkpoint_connector from tests.daily.connectors.utils import load_all_docs_from_checkpoint_connector
@ -24,25 +27,72 @@ def jira_connector() -> JiraConnector:
return connector return connector
def test_jira_connector_basic(jira_connector: JiraConnector) -> None: @patch(
"onyx.file_processing.extract_file_text.get_unstructured_api_key",
return_value=None,
)
def test_jira_connector_basic(
mock_get_api_key: MagicMock, jira_connector: JiraConnector
) -> None:
docs = load_all_docs_from_checkpoint_connector( docs = load_all_docs_from_checkpoint_connector(
connector=jira_connector, connector=jira_connector,
start=0, start=0,
end=time.time(), end=time.time(),
) )
assert len(docs) == 1 assert len(docs) == 2
doc = docs[0]
assert doc.id == "https://danswerai.atlassian.net/browse/AS-2" # Find story and epic
assert doc.semantic_identifier == "AS-2: test123small" story: Document | None = None
assert doc.source == DocumentSource.JIRA epic: Document | None = None
assert doc.metadata == {"priority": "Medium", "status": "Backlog"} for doc in docs:
assert doc.secondary_owners is None if doc.metadata["issuetype"] == "Story":
assert doc.title == "AS-2 test123small" story = doc
assert doc.from_ingestion_api is False elif doc.metadata["issuetype"] == "Epic":
assert doc.additional_info is None epic = doc
assert len(doc.sections) == 1 assert story is not None
section = doc.sections[0] assert epic is not None
# Check story
assert story.id == "https://danswerai.atlassian.net/browse/AS-3"
assert story.semantic_identifier == "AS-3: test123small"
assert story.source == DocumentSource.JIRA
assert story.metadata == {
"priority": "Medium",
"status": "Backlog",
"reporter": "Chris Weaver",
"assignee": "Chris Weaver",
"issuetype": "Story",
"created": "2025-04-16T16:44:06.716-0700",
}
assert story.secondary_owners is None
assert story.title == "AS-3 test123small"
assert story.from_ingestion_api is False
assert story.additional_info is None
assert len(story.sections) == 1
section = story.sections[0]
assert section.text == "example_text\n" assert section.text == "example_text\n"
assert section.link == "https://danswerai.atlassian.net/browse/AS-2" assert section.link == "https://danswerai.atlassian.net/browse/AS-3"
# Check epic
assert epic.id == "https://danswerai.atlassian.net/browse/AS-4"
assert epic.semantic_identifier == "AS-4: EPIC"
assert epic.source == DocumentSource.JIRA
assert epic.metadata == {
"priority": "Medium",
"status": "Backlog",
"reporter": "Founder Onyx",
"assignee": "Chris Weaver",
"issuetype": "Epic",
"created": "2025-04-16T16:55:53.068-0700",
}
assert epic.secondary_owners is None
assert epic.title == "AS-4 EPIC"
assert epic.from_ingestion_api is False
assert epic.additional_info is None
assert len(epic.sections) == 1
section = epic.sections[0]
assert section.text == "example_text\n"
assert section.link == "https://danswerai.atlassian.net/browse/AS-4"