diff --git a/backend/onyx/agents/agent_search/deep_search/initial/generate_initial_answer/nodes/generate_initial_answer.py b/backend/onyx/agents/agent_search/deep_search/initial/generate_initial_answer/nodes/generate_initial_answer.py
index 9ed8e9d82..a5d5471bd 100644
--- a/backend/onyx/agents/agent_search/deep_search/initial/generate_initial_answer/nodes/generate_initial_answer.py
+++ b/backend/onyx/agents/agent_search/deep_search/initial/generate_initial_answer/nodes/generate_initial_answer.py
@@ -117,16 +117,16 @@ def generate_initial_answer(
     consolidated_context_docs = structured_subquestion_docs.cited_documents
 
     counter = 0
-    for original_doc_number, original_doc in enumerate(
-        orig_question_retrieval_documents
-    ):
-        if original_doc_number not in structured_subquestion_docs.cited_documents:
-            if (
-                counter <= AGENT_MIN_ORIG_QUESTION_DOCS
-                or len(consolidated_context_docs) < AGENT_MAX_ANSWER_CONTEXT_DOCS
-            ):
-                consolidated_context_docs.append(original_doc)
-                counter += 1
+    for original_doc in orig_question_retrieval_documents:
+        if original_doc in structured_subquestion_docs.cited_documents:
+            continue
+
+        if (
+            counter <= AGENT_MIN_ORIG_QUESTION_DOCS
+            or len(consolidated_context_docs) < AGENT_MAX_ANSWER_CONTEXT_DOCS
+        ):
+            consolidated_context_docs.append(original_doc)
+            counter += 1
 
     # sort docs by their scores - though the scores refer to different questions
     relevant_docs = dedup_inference_section_list(consolidated_context_docs)
diff --git a/backend/onyx/agents/agent_search/deep_search/main/nodes/generate_validate_refined_answer.py b/backend/onyx/agents/agent_search/deep_search/main/nodes/generate_validate_refined_answer.py
index 52f24a147..ef0312ebb 100644
--- a/backend/onyx/agents/agent_search/deep_search/main/nodes/generate_validate_refined_answer.py
+++ b/backend/onyx/agents/agent_search/deep_search/main/nodes/generate_validate_refined_answer.py
@@ -146,10 +146,8 @@ def generate_validate_refined_answer(
     consolidated_context_docs = structured_subquestion_docs.cited_documents
 
     counter = 0
-    for original_doc_number, original_doc in enumerate(
-        original_question_verified_documents
-    ):
-        if original_doc_number not in structured_subquestion_docs.cited_documents:
+    for original_doc in original_question_verified_documents:
+        if original_doc not in structured_subquestion_docs.cited_documents:
             if (
                 counter <= AGENT_MIN_ORIG_QUESTION_DOCS
                 or len(consolidated_context_docs)
diff --git a/backend/onyx/agents/agent_search/orchestration/nodes/choose_tool.py b/backend/onyx/agents/agent_search/orchestration/nodes/choose_tool.py
index 1562d5d72..ba23808d8 100644
--- a/backend/onyx/agents/agent_search/orchestration/nodes/choose_tool.py
+++ b/backend/onyx/agents/agent_search/orchestration/nodes/choose_tool.py
@@ -57,7 +57,7 @@ def choose_tool(
         not agent_config.behavior.use_agentic_search
         and agent_config.tooling.search_tool is not None
         and (
-            not force_use_tool.force_use or force_use_tool.tool_name == SearchTool.name
+            not force_use_tool.force_use or force_use_tool.tool_name == SearchTool._NAME
         )
     ):
         override_kwargs = SearchToolOverrideKwargs()
diff --git a/backend/onyx/background/celery/tasks/doc_permission_syncing/tasks.py b/backend/onyx/background/celery/tasks/doc_permission_syncing/tasks.py
index 5f3e68e6e..5240ae4f7 100644
--- a/backend/onyx/background/celery/tasks/doc_permission_syncing/tasks.py
+++ b/backend/onyx/background/celery/tasks/doc_permission_syncing/tasks.py
@@ -98,9 +98,6 @@ def _is_external_doc_permissions_sync_due(cc_pair: ConnectorCredentialPair) -> b
     if cc_pair.status != ConnectorCredentialPairStatus.ACTIVE:
        return False
 
-    if cc_pair.status == ConnectorCredentialPairStatus.DELETING:
-        return False
-
     # If the last sync is None, it has never been run so we run the sync
     last_perm_sync = cc_pair.last_time_perm_sync
     if last_perm_sync is None:
diff --git a/backend/onyx/connectors/google_drive/connector.py b/backend/onyx/connectors/google_drive/connector.py
index f9764535b..beb74130a 100644
--- a/backend/onyx/connectors/google_drive/connector.py
+++ b/backend/onyx/connectors/google_drive/connector.py
@@ -420,7 +420,7 @@ class GoogleDriveConnector(SlimConnector, CheckpointConnector[GoogleDriveCheckpo
         is_slim: bool,
         checkpoint: GoogleDriveCheckpoint,
         concurrent_drive_itr: Callable[[str], Iterator[str]],
-        filtered_folder_ids: set[str],
+        sorted_filtered_folder_ids: list[str],
         start: SecondsSinceUnixEpoch | None = None,
         end: SecondsSinceUnixEpoch | None = None,
     ) -> Iterator[RetrievedDriveFile]:
@@ -509,6 +509,7 @@ class GoogleDriveConnector(SlimConnector, CheckpointConnector[GoogleDriveCheckpo
                 yield from _yield_from_drive(drive_id, start)
             curr_stage.stage = DriveRetrievalStage.FOLDER_FILES
             resuming = False  # we are starting the next stage for the first time
+
         if curr_stage.stage == DriveRetrievalStage.FOLDER_FILES:
 
             def _yield_from_folder_crawl(
@@ -526,16 +527,28 @@ class GoogleDriveConnector(SlimConnector, CheckpointConnector[GoogleDriveCheckpo
                 )
 
             # resume from a checkpoint
+            last_processed_folder = None
             if resuming:
                 folder_id = curr_stage.completed_until_parent_id
                 assert folder_id is not None, "folder id not set in checkpoint"
                 resume_start = curr_stage.completed_until
                 yield from _yield_from_folder_crawl(folder_id, resume_start)
+                last_processed_folder = folder_id
 
-            remaining_folders = filtered_folder_ids - self._retrieved_ids
-            for folder_id in remaining_folders:
+            skipping_seen_folders = last_processed_folder is not None
+            for folder_id in sorted_filtered_folder_ids:
+                if skipping_seen_folders:
+                    skipping_seen_folders = folder_id != last_processed_folder
+                    continue
+
+                if folder_id in self._retrieved_ids:
+                    continue
+
+                curr_stage.completed_until = 0
+                curr_stage.completed_until_parent_id = folder_id
                 logger.info(f"Getting files in folder '{folder_id}' as '{user_email}'")
                 yield from _yield_from_folder_crawl(folder_id, start)
+
         curr_stage.stage = DriveRetrievalStage.DONE
 
     def _manage_service_account_retrieval(
@@ -584,11 +597,13 @@ class GoogleDriveConnector(SlimConnector, CheckpointConnector[GoogleDriveCheckpo
             drive_ids_to_retrieve, checkpoint
         )
 
+        sorted_filtered_folder_ids = sorted(folder_ids_to_retrieve)
+
         # only process emails that we haven't already completed retrieval for
         non_completed_org_emails = [
             user_email
-            for user_email, stage in checkpoint.completion_map.items()
-            if stage != DriveRetrievalStage.DONE
+            for user_email, stage_completion in checkpoint.completion_map.items()
+            if stage_completion.stage != DriveRetrievalStage.DONE
         ]
 
         # don't process too many emails before returning a checkpoint. This is
@@ -609,7 +624,7 @@ class GoogleDriveConnector(SlimConnector, CheckpointConnector[GoogleDriveCheckpo
                 is_slim,
                 checkpoint,
                 drive_id_iterator,
-                folder_ids_to_retrieve,
+                sorted_filtered_folder_ids,
                 start,
                 end,
             )
diff --git a/backend/onyx/connectors/google_drive/file_retrieval.py b/backend/onyx/connectors/google_drive/file_retrieval.py
index 2b432d0ae..2aaba823f 100644
--- a/backend/onyx/connectors/google_drive/file_retrieval.py
+++ b/backend/onyx/connectors/google_drive/file_retrieval.py
@@ -122,14 +122,17 @@ def crawl_folders_for_files(
                 start=start,
                 end=end,
             ):
-                found_files = True
                 logger.info(f"Found file: {file['name']}, user email: {user_email}")
+                found_files = True
                 yield RetrievedDriveFile(
                     drive_file=file,
                     user_email=user_email,
                     parent_id=parent_id,
                     completion_stage=DriveRetrievalStage.FOLDER_FILES,
                 )
+            # Only mark a folder as done if it was fully traversed without errors
+            if found_files:
+                update_traversed_ids_func(parent_id)
         except Exception as e:
             logger.error(f"Error getting files in parent {parent_id}: {e}")
             yield RetrievedDriveFile(
@@ -139,8 +142,6 @@ def crawl_folders_for_files(
                 completion_stage=DriveRetrievalStage.FOLDER_FILES,
                 error=e,
             )
-        if found_files:
-            update_traversed_ids_func(parent_id)
     else:
         logger.info(f"Skipping subfolder files since already traversed: {parent_id}")
 
diff --git a/backend/onyx/context/search/postprocessing/postprocessing.py b/backend/onyx/context/search/postprocessing/postprocessing.py
index 7043161f7..3ef613916 100644
--- a/backend/onyx/context/search/postprocessing/postprocessing.py
+++ b/backend/onyx/context/search/postprocessing/postprocessing.py
@@ -374,14 +374,6 @@ def filter_sections(
     if query.evaluation_type == LLMEvaluationType.SKIP:
         return []
 
-    # Additional safeguard: Log a warning if this function is ever called with SKIP evaluation type
-    # This should never happen if our fast paths are working correctly
-    if query.evaluation_type == LLMEvaluationType.SKIP:
-        logger.warning(
-            "WARNING: filter_sections called with SKIP evaluation_type. This should never happen!"
-        )
-        return []
-
     sections_to_filter = sections_to_filter[: query.max_llm_filter_sections]
 
     contents = [
@@ -461,12 +453,10 @@
     llm_filter_task_id = None
 
     # Only add LLM filtering if not in SKIP mode and if LLM doc relevance is not disabled
-    if (
-        search_query.evaluation_type not in [LLMEvaluationType.SKIP]
-        and not DISABLE_LLM_DOC_RELEVANCE
-        and search_query.evaluation_type
-        in [LLMEvaluationType.BASIC, LLMEvaluationType.UNSPECIFIED]
-    ):
+    if not DISABLE_LLM_DOC_RELEVANCE and search_query.evaluation_type in [
+        LLMEvaluationType.BASIC,
+        LLMEvaluationType.UNSPECIFIED,
+    ]:
         logger.info("Adding LLM filtering task for document relevance evaluation")
         post_processing_tasks.append(
             FunctionCall(
@@ -479,8 +469,6 @@
             )
         )
         llm_filter_task_id = post_processing_tasks[-1].result_id
-    elif search_query.evaluation_type == LLMEvaluationType.SKIP:
-        logger.info("Fast path: Skipping LLM filtering task for ordering-only mode")
     elif DISABLE_LLM_DOC_RELEVANCE:
         logger.info("Skipping LLM filtering task because LLM doc relevance is disabled")
 
diff --git a/backend/onyx/server/features/tool/api.py b/backend/onyx/server/features/tool/api.py
index 8c10b87ec..31b9f4519 100644
--- a/backend/onyx/server/features/tool/api.py
+++ b/backend/onyx/server/features/tool/api.py
@@ -152,6 +152,6 @@ def list_tools(
     return [
         ToolSnapshot.from_model(tool)
         for tool in tools
-        if tool.in_code_tool_id != ImageGenerationTool.name
+        if tool.in_code_tool_id != ImageGenerationTool._NAME
         or is_image_generation_available(db_session=db_session)
     ]
diff --git a/backend/pyproject.toml b/backend/pyproject.toml
index e43ebd62e..05ab993e3 100644
--- a/backend/pyproject.toml
+++ b/backend/pyproject.toml
@@ -4,6 +4,7 @@ mypy_path = "$MYPY_CONFIG_FILE_DIR"
 explicit_package_bases = true
 disallow_untyped_defs = true
 enable_error_code = ["possibly-undefined"]
+strict_equality = true
 
 [[tool.mypy.overrides]]
 module = "alembic.versions.*"
diff --git a/backend/scripts/sources_selection_analysis.py b/backend/scripts/sources_selection_analysis.py
index addc7dc23..017c23ee5 100644
--- a/backend/scripts/sources_selection_analysis.py
+++ b/backend/scripts/sources_selection_analysis.py
@@ -186,7 +186,9 @@ class CompareAnalysis:
         )
         return changes
 
-    def check_config_changes(self, previous_doc_rank: int, new_doc_rank: int) -> None:
+    def check_config_changes(
+        self, previous_doc_rank: int | str, new_doc_rank: int
+    ) -> None:
         """Try to identify possible reasons why a change has been detected by
         checking the latest document update date or the boost value.
 
@@ -194,7 +196,7 @@
             previous_doc_rank (int): The document rank for the previous analysis
             new_doc_rank (int): The document rank for the new analysis
         """
-        if new_doc_rank == "not_ranked":
+        if isinstance(new_doc_rank, str) and new_doc_rank == "not_ranked":
             color_output(
                 (
                     "NOTE: The document is missing in the 'current' analysis file. "
" diff --git a/backend/tests/daily/connectors/salesforce/test_salesforce_connector.py b/backend/tests/daily/connectors/salesforce/test_salesforce_connector.py index 7c5395660..93379e34f 100644 --- a/backend/tests/daily/connectors/salesforce/test_salesforce_connector.py +++ b/backend/tests/daily/connectors/salesforce/test_salesforce_connector.py @@ -3,6 +3,7 @@ import os import time from pathlib import Path from typing import Any +from typing import cast import pytest @@ -24,7 +25,7 @@ def extract_key_value_pairs_to_set( def load_test_data( file_name: str = "test_salesforce_data.json", -) -> dict[str, list[str] | dict[str, Any]]: +) -> dict[str, str | list[str] | dict[str, Any] | list[dict[str, Any]]]: current_dir = Path(__file__).parent with open(current_dir / file_name, "r") as f: return json.load(f) @@ -90,7 +91,7 @@ def test_salesforce_connector_basic(salesforce_connector: SalesforceConnector) - if not isinstance(expected_text, list): raise ValueError("Expected text is not a list") - unparsed_expected_key_value_pairs: list[str] = expected_text + unparsed_expected_key_value_pairs: list[str] = cast(list[str], expected_text) received_key_value_pairs = extract_key_value_pairs_to_set(received_text) expected_key_value_pairs = extract_key_value_pairs_to_set( unparsed_expected_key_value_pairs @@ -110,7 +111,12 @@ def test_salesforce_connector_basic(salesforce_connector: SalesforceConnector) - assert primary_owner.first_name == expected_primary_owner["first_name"] assert primary_owner.last_name == expected_primary_owner["last_name"] - assert target_test_doc.secondary_owners == test_data["secondary_owners"] + secondary_owners = ( + [owner.model_dump() for owner in target_test_doc.secondary_owners] + if target_test_doc.secondary_owners + else None + ) + assert secondary_owners == test_data["secondary_owners"] assert target_test_doc.title == test_data["title"] diff --git a/backend/tests/integration/common_utils/managers/persona.py b/backend/tests/integration/common_utils/managers/persona.py index d4a436c2b..d7a01c795 100644 --- a/backend/tests/integration/common_utils/managers/persona.py +++ b/backend/tests/integration/common_utils/managers/persona.py @@ -243,7 +243,8 @@ class PersonaManager: and set(user.email for user in fetched_persona.users) == set(persona.users) and set(fetched_persona.groups) == set(persona.groups) - and set(fetched_persona.labels) == set(persona.label_ids) + and {label.id for label in fetched_persona.labels} + == set(persona.label_ids) ) return False diff --git a/backend/tests/unit/onyx/chat/stream_processing/test_quotes_processing.py b/backend/tests/unit/onyx/chat/stream_processing/test_quotes_processing.py index 7b406ae2e..df3dc7364 100644 --- a/backend/tests/unit/onyx/chat/stream_processing/test_quotes_processing.py +++ b/backend/tests/unit/onyx/chat/stream_processing/test_quotes_processing.py @@ -192,7 +192,7 @@ def test_fuzzy_match_quotes_to_docs() -> None: results = match_quotes_to_docs( test_quotes, [test_chunk_0, test_chunk_1], fuzzy_search=True ) - assert results == { + assert results.model_dump() == { "a doc with some": {"document": "test doc 0", "link": "doc 0 base"}, "a doc with some LINK": { "document": "test doc 0",