Feature/log despam (#2022)

* Move a lot of log spam to debug level; consolidate some info-level logging

* Reformat more indexing logging
rkuo-danswer committed 2024-08-02 08:28:53 -07:00 (committed by GitHub)
parent 51731ad0dd
commit 6a61331cba
3 changed files with 37 additions and 21 deletions
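The change boils down to one logging discipline: per-step progress messages move to debug level, and each unit of work ends with a single consolidated info-level summary. A minimal standalone sketch of that pattern follows (the logger name, batch structure, and counts are illustrative, not taken from the codebase):

import logging
import time

logger = logging.getLogger("indexing_example")  # illustrative logger name


def index_documents(batches: list[list[str]]) -> None:
    """Hypothetical indexing loop showing the despam pattern."""
    start = time.monotonic()
    doc_count = 0

    for batch in batches:
        # Per-batch progress is noisy in production, so it is demoted to debug.
        logger.debug(f"Indexed batch of {len(batch)} documents")
        doc_count += len(batch)

    elapsed = time.monotonic() - start
    # One consolidated, grep-friendly summary line at info level.
    logger.info(f"Connector succeeded: docs={doc_count} elapsed={elapsed:.2f}s")

With the default WARNING level nothing prints; configure logging.basicConfig(level=logging.INFO) to see only the summary, or DEBUG to see every batch.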

View File

@@ -277,11 +277,9 @@ def _run_indexing(
             run_dt=run_end_dt,
         )
+        elapsed_time = time.time() - start_time
+
         logger.info(
-            f"Indexed or refreshed {document_count} total documents for a total of {chunk_count} indexed chunks"
-        )
-        logger.info(
-            f"Connector successfully finished, elapsed time: {time.time() - start_time} seconds"
+            f"Connector succeeded: docs={document_count} chunks={chunk_count} elapsed={elapsed_time:.2f}s"
         )
@@ -330,17 +328,19 @@ def run_indexing_entrypoint(index_attempt_id: int, is_ee: bool = False) -> None:
         attempt = _prepare_index_attempt(db_session, index_attempt_id)
         logger.info(
-            f"Running indexing attempt for connector: '{attempt.connector_credential_pair.connector.name}', "
-            f"with config: '{attempt.connector_credential_pair.connector.connector_specific_config}', and "
-            f"with credentials: '{attempt.connector_credential_pair.connector_id}'"
+            f"Indexing starting: "
+            f"connector='{attempt.connector_credential_pair.connector.name}' "
+            f"config='{attempt.connector_credential_pair.connector.connector_specific_config}' "
+            f"credentials='{attempt.connector_credential_pair.connector_id}'"
         )
         _run_indexing(db_session, attempt)
         logger.info(
-            f"Completed indexing attempt for connector: '{attempt.connector_credential_pair.connector.name}', "
-            f"with config: '{attempt.connector_credential_pair.connector.connector_specific_config}', and "
-            f"with credentials: '{attempt.connector_credential_pair.connector_id}'"
+            f"Indexing finished: "
+            f"connector='{attempt.connector_credential_pair.connector.name}' "
+            f"config='{attempt.connector_credential_pair.connector.connector_specific_config}' "
+            f"credentials='{attempt.connector_credential_pair.connector_id}'"
         )
     except Exception as e:
         logger.exception(f"Indexing job with ID '{index_attempt_id}' failed due to {e}")

View File

@@ -283,11 +283,13 @@ def kickoff_indexing_jobs(
         if attempt.id not in existing_jobs
     ]
-    logger.info(f"Found {len(new_indexing_attempts)} new indexing tasks.")
+    logger.debug(f"Found {len(new_indexing_attempts)} new indexing task(s).")
     if not new_indexing_attempts:
         return existing_jobs
+    indexing_attempt_count = 0
+
     for attempt, embedding_model in new_indexing_attempts:
         use_secondary_index = (
             embedding_model.status == IndexModelStatus.FUTURE
@@ -329,15 +331,29 @@ def kickoff_indexing_jobs(
         )
         if run:
-            secondary_str = "(secondary index) " if use_secondary_index else ""
+            if indexing_attempt_count == 0:
+                logger.info(
+                    f"Indexing dispatch starts: pending={len(new_indexing_attempts)}"
+                )
+
+            indexing_attempt_count += 1
+            secondary_str = " (secondary index)" if use_secondary_index else ""
             logger.info(
-                f"Kicked off {secondary_str}"
-                f"indexing attempt for connector: '{attempt.connector_credential_pair.connector.name}', "
-                f"with config: '{attempt.connector_credential_pair.connector.connector_specific_config}', and "
-                f"with credentials: '{attempt.connector_credential_pair.credential_id}'"
+                f"Indexing dispatched{secondary_str}: "
+                f"connector='{attempt.connector_credential_pair.connector.name}' "
+                f"config='{attempt.connector_credential_pair.connector.connector_specific_config}' "
+                f"credentials='{attempt.connector_credential_pair.credential_id}'"
             )
             existing_jobs_copy[attempt.id] = run
 
+    if indexing_attempt_count > 0:
+        logger.info(
+            f"Indexing dispatch results: "
+            f"initial_pending={len(new_indexing_attempts)} "
+            f"started={indexing_attempt_count} "
+            f"remaining={len(new_indexing_attempts) - indexing_attempt_count}"
+        )
+
     return existing_jobs_copy
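The dispatch bookkeeping added here only speaks when something actually starts: the "dispatch starts" line is emitted lazily on the first successful kickoff, each dispatch logs one line, and a "results" summary appears only if the counter is non-zero, so idle scheduler passes stay quiet. A self-contained sketch of the pattern (try_start and the job names are hypothetical stand-ins for submitting work to the job client):

import logging

logger = logging.getLogger("dispatch_example")


def try_start(job: str) -> bool:
    # Hypothetical stand-in for handing a job to a worker pool;
    # returns False when no worker is available.
    return True


def dispatch_jobs(pending: list[str]) -> list[str]:
    """Hypothetical dispatcher showing the count-and-summarize logging pattern."""
    started: list[str] = []

    for job in pending:
        if not try_start(job):
            continue
        if not started:
            # First successful dispatch: announce how much work was queued.
            logger.info(f"Indexing dispatch starts: pending={len(pending)}")
        started.append(job)
        logger.info(f"Indexing dispatched: job='{job}'")

    if started:
        logger.info(
            f"Indexing dispatch results: initial_pending={len(pending)} "
            f"started={len(started)} remaining={len(pending) - len(started)}"
        )
    return started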
@@ -355,7 +371,7 @@ def update_loop(
     # batch of documents indexed
     if db_embedding_model.cloud_provider_id is None:
-        logger.info("Running a first inference to warm up embedding model")
+        logger.debug("Running a first inference to warm up embedding model")
         warm_up_encoders(
             embedding_model=db_embedding_model,
             model_server_host=INDEXING_MODEL_SERVER_HOST,
@@ -392,11 +408,11 @@ def update_loop(
     while True:
         start = time.time()
         start_time_utc = datetime.utcfromtimestamp(start).strftime("%Y-%m-%d %H:%M:%S")
-        logger.info(f"Running update, current UTC time: {start_time_utc}")
+        logger.debug(f"Running update, current UTC time: {start_time_utc}")
 
         if existing_jobs:
             # TODO: make this debug level once the "no jobs are being scheduled" issue is resolved
-            logger.info(
+            logger.debug(
                 "Found existing indexing jobs: "
                 f"{[(attempt_id, job.status) for attempt_id, job in existing_jobs.items()]}"
             )
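One caveat with demoting the existing-jobs line to debug: the f-string still materializes the full (attempt_id, status) list on every loop pass even when debug output is discarded. If that ever becomes a concern, the standard-library guard is Logger.isEnabledFor, a general logging idiom rather than something this commit adds:

import logging
from typing import Any

logger = logging.getLogger("update_loop_example")


def log_existing_jobs(existing_jobs: dict[int, Any]) -> None:
    # Skip building the (attempt_id, status) list entirely unless debug logging is on;
    # assumes job objects expose a .status attribute, as the futures in the loop do.
    if logger.isEnabledFor(logging.DEBUG):
        logger.debug(
            "Found existing indexing jobs: "
            f"{[(attempt_id, job.status) for attempt_id, job in existing_jobs.items()]}"
        )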
@@ -422,7 +438,7 @@ def update__main() -> None:
     set_is_ee_based_on_env_variable()
     init_sqlalchemy_engine(POSTGRES_INDEXER_APP_NAME)
-    logger.info("Starting Indexing Loop")
+    logger.info("Starting indexing service")
     update_loop()

View File

@@ -217,7 +217,7 @@ def warm_up_encoders(
     )
     # May not be the exact same tokenizer used for the indexing flow
-    logger.info(f"Warming up encoder model: {model_name}")
+    logger.debug(f"Warming up encoder model: {model_name}")
     get_tokenizer(model_name=model_name, provider_type=provider_type).encode(
         warm_up_str
     )