mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-04-23 23:10:24 +02:00
Id not set in checkpoint2 (#4468)
* unconditionally set completion * drive connector improvements * fixing broader typing issue * fix tests, CW comments * actual test fix
This commit is contained in:
parent
9c73099241
commit
17562f9b8f
@ -117,16 +117,16 @@ def generate_initial_answer(
|
||||
|
||||
consolidated_context_docs = structured_subquestion_docs.cited_documents
|
||||
counter = 0
|
||||
for original_doc_number, original_doc in enumerate(
|
||||
orig_question_retrieval_documents
|
||||
):
|
||||
if original_doc_number not in structured_subquestion_docs.cited_documents:
|
||||
if (
|
||||
counter <= AGENT_MIN_ORIG_QUESTION_DOCS
|
||||
or len(consolidated_context_docs) < AGENT_MAX_ANSWER_CONTEXT_DOCS
|
||||
):
|
||||
consolidated_context_docs.append(original_doc)
|
||||
counter += 1
|
||||
for original_doc in orig_question_retrieval_documents:
|
||||
if original_doc in structured_subquestion_docs.cited_documents:
|
||||
continue
|
||||
|
||||
if (
|
||||
counter <= AGENT_MIN_ORIG_QUESTION_DOCS
|
||||
or len(consolidated_context_docs) < AGENT_MAX_ANSWER_CONTEXT_DOCS
|
||||
):
|
||||
consolidated_context_docs.append(original_doc)
|
||||
counter += 1
|
||||
|
||||
# sort docs by their scores - though the scores refer to different questions
|
||||
relevant_docs = dedup_inference_section_list(consolidated_context_docs)
|
||||
|
@ -146,10 +146,8 @@ def generate_validate_refined_answer(
|
||||
consolidated_context_docs = structured_subquestion_docs.cited_documents
|
||||
|
||||
counter = 0
|
||||
for original_doc_number, original_doc in enumerate(
|
||||
original_question_verified_documents
|
||||
):
|
||||
if original_doc_number not in structured_subquestion_docs.cited_documents:
|
||||
for original_doc in original_question_verified_documents:
|
||||
if original_doc not in structured_subquestion_docs.cited_documents:
|
||||
if (
|
||||
counter <= AGENT_MIN_ORIG_QUESTION_DOCS
|
||||
or len(consolidated_context_docs)
|
||||
|
@ -57,7 +57,7 @@ def choose_tool(
|
||||
not agent_config.behavior.use_agentic_search
|
||||
and agent_config.tooling.search_tool is not None
|
||||
and (
|
||||
not force_use_tool.force_use or force_use_tool.tool_name == SearchTool.name
|
||||
not force_use_tool.force_use or force_use_tool.tool_name == SearchTool._NAME
|
||||
)
|
||||
):
|
||||
override_kwargs = SearchToolOverrideKwargs()
|
||||
|
@ -98,9 +98,6 @@ def _is_external_doc_permissions_sync_due(cc_pair: ConnectorCredentialPair) -> b
|
||||
if cc_pair.status != ConnectorCredentialPairStatus.ACTIVE:
|
||||
return False
|
||||
|
||||
if cc_pair.status == ConnectorCredentialPairStatus.DELETING:
|
||||
return False
|
||||
|
||||
# If the last sync is None, it has never been run so we run the sync
|
||||
last_perm_sync = cc_pair.last_time_perm_sync
|
||||
if last_perm_sync is None:
|
||||
|
@ -420,7 +420,7 @@ class GoogleDriveConnector(SlimConnector, CheckpointConnector[GoogleDriveCheckpo
|
||||
is_slim: bool,
|
||||
checkpoint: GoogleDriveCheckpoint,
|
||||
concurrent_drive_itr: Callable[[str], Iterator[str]],
|
||||
filtered_folder_ids: set[str],
|
||||
sorted_filtered_folder_ids: list[str],
|
||||
start: SecondsSinceUnixEpoch | None = None,
|
||||
end: SecondsSinceUnixEpoch | None = None,
|
||||
) -> Iterator[RetrievedDriveFile]:
|
||||
@ -509,6 +509,7 @@ class GoogleDriveConnector(SlimConnector, CheckpointConnector[GoogleDriveCheckpo
|
||||
yield from _yield_from_drive(drive_id, start)
|
||||
curr_stage.stage = DriveRetrievalStage.FOLDER_FILES
|
||||
resuming = False # we are starting the next stage for the first time
|
||||
|
||||
if curr_stage.stage == DriveRetrievalStage.FOLDER_FILES:
|
||||
|
||||
def _yield_from_folder_crawl(
|
||||
@ -526,16 +527,28 @@ class GoogleDriveConnector(SlimConnector, CheckpointConnector[GoogleDriveCheckpo
|
||||
)
|
||||
|
||||
# resume from a checkpoint
|
||||
last_processed_folder = None
|
||||
if resuming:
|
||||
folder_id = curr_stage.completed_until_parent_id
|
||||
assert folder_id is not None, "folder id not set in checkpoint"
|
||||
resume_start = curr_stage.completed_until
|
||||
yield from _yield_from_folder_crawl(folder_id, resume_start)
|
||||
last_processed_folder = folder_id
|
||||
|
||||
remaining_folders = filtered_folder_ids - self._retrieved_ids
|
||||
for folder_id in remaining_folders:
|
||||
skipping_seen_folders = last_processed_folder is not None
|
||||
for folder_id in sorted_filtered_folder_ids:
|
||||
if skipping_seen_folders:
|
||||
skipping_seen_folders = folder_id != last_processed_folder
|
||||
continue
|
||||
|
||||
if folder_id in self._retrieved_ids:
|
||||
continue
|
||||
|
||||
curr_stage.completed_until = 0
|
||||
curr_stage.completed_until_parent_id = folder_id
|
||||
logger.info(f"Getting files in folder '{folder_id}' as '{user_email}'")
|
||||
yield from _yield_from_folder_crawl(folder_id, start)
|
||||
|
||||
curr_stage.stage = DriveRetrievalStage.DONE
|
||||
|
||||
def _manage_service_account_retrieval(
|
||||
@ -584,11 +597,13 @@ class GoogleDriveConnector(SlimConnector, CheckpointConnector[GoogleDriveCheckpo
|
||||
drive_ids_to_retrieve, checkpoint
|
||||
)
|
||||
|
||||
sorted_filtered_folder_ids = sorted(folder_ids_to_retrieve)
|
||||
|
||||
# only process emails that we haven't already completed retrieval for
|
||||
non_completed_org_emails = [
|
||||
user_email
|
||||
for user_email, stage in checkpoint.completion_map.items()
|
||||
if stage != DriveRetrievalStage.DONE
|
||||
for user_email, stage_completion in checkpoint.completion_map.items()
|
||||
if stage_completion.stage != DriveRetrievalStage.DONE
|
||||
]
|
||||
|
||||
# don't process too many emails before returning a checkpoint. This is
|
||||
@ -609,7 +624,7 @@ class GoogleDriveConnector(SlimConnector, CheckpointConnector[GoogleDriveCheckpo
|
||||
is_slim,
|
||||
checkpoint,
|
||||
drive_id_iterator,
|
||||
folder_ids_to_retrieve,
|
||||
sorted_filtered_folder_ids,
|
||||
start,
|
||||
end,
|
||||
)
|
||||
|
@ -122,14 +122,17 @@ def crawl_folders_for_files(
|
||||
start=start,
|
||||
end=end,
|
||||
):
|
||||
found_files = True
|
||||
logger.info(f"Found file: {file['name']}, user email: {user_email}")
|
||||
found_files = True
|
||||
yield RetrievedDriveFile(
|
||||
drive_file=file,
|
||||
user_email=user_email,
|
||||
parent_id=parent_id,
|
||||
completion_stage=DriveRetrievalStage.FOLDER_FILES,
|
||||
)
|
||||
# Only mark a folder as done if it was fully traversed without errors
|
||||
if found_files:
|
||||
update_traversed_ids_func(parent_id)
|
||||
except Exception as e:
|
||||
logger.error(f"Error getting files in parent {parent_id}: {e}")
|
||||
yield RetrievedDriveFile(
|
||||
@ -139,8 +142,6 @@ def crawl_folders_for_files(
|
||||
completion_stage=DriveRetrievalStage.FOLDER_FILES,
|
||||
error=e,
|
||||
)
|
||||
if found_files:
|
||||
update_traversed_ids_func(parent_id)
|
||||
else:
|
||||
logger.info(f"Skipping subfolder files since already traversed: {parent_id}")
|
||||
|
||||
|
@ -374,14 +374,6 @@ def filter_sections(
|
||||
if query.evaluation_type == LLMEvaluationType.SKIP:
|
||||
return []
|
||||
|
||||
# Additional safeguard: Log a warning if this function is ever called with SKIP evaluation type
|
||||
# This should never happen if our fast paths are working correctly
|
||||
if query.evaluation_type == LLMEvaluationType.SKIP:
|
||||
logger.warning(
|
||||
"WARNING: filter_sections called with SKIP evaluation_type. This should never happen!"
|
||||
)
|
||||
return []
|
||||
|
||||
sections_to_filter = sections_to_filter[: query.max_llm_filter_sections]
|
||||
|
||||
contents = [
|
||||
@ -461,12 +453,10 @@ def search_postprocessing(
|
||||
|
||||
llm_filter_task_id = None
|
||||
# Only add LLM filtering if not in SKIP mode and if LLM doc relevance is not disabled
|
||||
if (
|
||||
search_query.evaluation_type not in [LLMEvaluationType.SKIP]
|
||||
and not DISABLE_LLM_DOC_RELEVANCE
|
||||
and search_query.evaluation_type
|
||||
in [LLMEvaluationType.BASIC, LLMEvaluationType.UNSPECIFIED]
|
||||
):
|
||||
if not DISABLE_LLM_DOC_RELEVANCE and search_query.evaluation_type in [
|
||||
LLMEvaluationType.BASIC,
|
||||
LLMEvaluationType.UNSPECIFIED,
|
||||
]:
|
||||
logger.info("Adding LLM filtering task for document relevance evaluation")
|
||||
post_processing_tasks.append(
|
||||
FunctionCall(
|
||||
@ -479,8 +469,6 @@ def search_postprocessing(
|
||||
)
|
||||
)
|
||||
llm_filter_task_id = post_processing_tasks[-1].result_id
|
||||
elif search_query.evaluation_type == LLMEvaluationType.SKIP:
|
||||
logger.info("Fast path: Skipping LLM filtering task for ordering-only mode")
|
||||
elif DISABLE_LLM_DOC_RELEVANCE:
|
||||
logger.info("Skipping LLM filtering task because LLM doc relevance is disabled")
|
||||
|
||||
|
@ -152,6 +152,6 @@ def list_tools(
|
||||
return [
|
||||
ToolSnapshot.from_model(tool)
|
||||
for tool in tools
|
||||
if tool.in_code_tool_id != ImageGenerationTool.name
|
||||
if tool.in_code_tool_id != ImageGenerationTool._NAME
|
||||
or is_image_generation_available(db_session=db_session)
|
||||
]
|
||||
|
@ -4,6 +4,7 @@ mypy_path = "$MYPY_CONFIG_FILE_DIR"
|
||||
explicit_package_bases = true
|
||||
disallow_untyped_defs = true
|
||||
enable_error_code = ["possibly-undefined"]
|
||||
strict_equality = true
|
||||
|
||||
[[tool.mypy.overrides]]
|
||||
module = "alembic.versions.*"
|
||||
|
@ -186,7 +186,9 @@ class CompareAnalysis:
|
||||
)
|
||||
return changes
|
||||
|
||||
def check_config_changes(self, previous_doc_rank: int, new_doc_rank: int) -> None:
|
||||
def check_config_changes(
|
||||
self, previous_doc_rank: int | str, new_doc_rank: int
|
||||
) -> None:
|
||||
"""Try to identify possible reasons why a change has been detected by
|
||||
checking the latest document update date or the boost value.
|
||||
|
||||
@ -194,7 +196,7 @@ class CompareAnalysis:
|
||||
previous_doc_rank (int): The document rank for the previous analysis
|
||||
new_doc_rank (int): The document rank for the new analysis
|
||||
"""
|
||||
if new_doc_rank == "not_ranked":
|
||||
if isinstance(new_doc_rank, str) and new_doc_rank == "not_ranked":
|
||||
color_output(
|
||||
(
|
||||
"NOTE: The document is missing in the 'current' analysis file. "
|
||||
|
@ -3,6 +3,7 @@ import os
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
from typing import cast
|
||||
|
||||
import pytest
|
||||
|
||||
@ -24,7 +25,7 @@ def extract_key_value_pairs_to_set(
|
||||
|
||||
def load_test_data(
|
||||
file_name: str = "test_salesforce_data.json",
|
||||
) -> dict[str, list[str] | dict[str, Any]]:
|
||||
) -> dict[str, str | list[str] | dict[str, Any] | list[dict[str, Any]]]:
|
||||
current_dir = Path(__file__).parent
|
||||
with open(current_dir / file_name, "r") as f:
|
||||
return json.load(f)
|
||||
@ -90,7 +91,7 @@ def test_salesforce_connector_basic(salesforce_connector: SalesforceConnector) -
|
||||
if not isinstance(expected_text, list):
|
||||
raise ValueError("Expected text is not a list")
|
||||
|
||||
unparsed_expected_key_value_pairs: list[str] = expected_text
|
||||
unparsed_expected_key_value_pairs: list[str] = cast(list[str], expected_text)
|
||||
received_key_value_pairs = extract_key_value_pairs_to_set(received_text)
|
||||
expected_key_value_pairs = extract_key_value_pairs_to_set(
|
||||
unparsed_expected_key_value_pairs
|
||||
@ -110,7 +111,12 @@ def test_salesforce_connector_basic(salesforce_connector: SalesforceConnector) -
|
||||
assert primary_owner.first_name == expected_primary_owner["first_name"]
|
||||
assert primary_owner.last_name == expected_primary_owner["last_name"]
|
||||
|
||||
assert target_test_doc.secondary_owners == test_data["secondary_owners"]
|
||||
secondary_owners = (
|
||||
[owner.model_dump() for owner in target_test_doc.secondary_owners]
|
||||
if target_test_doc.secondary_owners
|
||||
else None
|
||||
)
|
||||
assert secondary_owners == test_data["secondary_owners"]
|
||||
assert target_test_doc.title == test_data["title"]
|
||||
|
||||
|
||||
|
@ -243,7 +243,8 @@ class PersonaManager:
|
||||
and set(user.email for user in fetched_persona.users)
|
||||
== set(persona.users)
|
||||
and set(fetched_persona.groups) == set(persona.groups)
|
||||
and set(fetched_persona.labels) == set(persona.label_ids)
|
||||
and {label.id for label in fetched_persona.labels}
|
||||
== set(persona.label_ids)
|
||||
)
|
||||
return False
|
||||
|
||||
|
@ -192,7 +192,7 @@ def test_fuzzy_match_quotes_to_docs() -> None:
|
||||
results = match_quotes_to_docs(
|
||||
test_quotes, [test_chunk_0, test_chunk_1], fuzzy_search=True
|
||||
)
|
||||
assert results == {
|
||||
assert results.model_dump() == {
|
||||
"a doc with some": {"document": "test doc 0", "link": "doc 0 base"},
|
||||
"a doc with some LINK": {
|
||||
"document": "test doc 0",
|
||||
|
Loading…
x
Reference in New Issue
Block a user