From f52d1142ebdb77a1c78be10fe67ea6b2bc93b30c Mon Sep 17 00:00:00 2001
From: rkuo-danswer <rkuo@danswer.ai>
Date: Mon, 9 Sep 2024 20:10:25 -0700
Subject: [PATCH] =?UTF-8?q?Fail=20instead=20of=20continuing=20if=20vespa?=
 =?UTF-8?q?=20cannot=20be=20reached=20within=20the=20time=E2=80=A6=20(#237?=
 =?UTF-8?q?9)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Fail instead of continuing if vespa cannot be reached within the timeout period

* improve startup readability

---------

Co-authored-by: Richard Kuo <rkuo@rkuo.com>
---
 backend/danswer/main.py                       | 32 ++++++++++++++-----
 .../search_nlp_models.py                      |  4 +--
 .../tests/integration/common_utils/reset.py   |  4 ++-
 3 files changed, 29 insertions(+), 11 deletions(-)

diff --git a/backend/danswer/main.py b/backend/danswer/main.py
index c518e463e6d..a00826f11c8 100644
--- a/backend/danswer/main.py
+++ b/backend/danswer/main.py
@@ -320,21 +320,32 @@ def setup_vespa(
     document_index: DocumentIndex,
     index_setting: IndexingSetting,
     secondary_index_setting: IndexingSetting | None,
-) -> None:
+) -> bool:
     # Vespa startup is a bit slow, so give it a few seconds
-    wait_time = 5
-    for _ in range(5):
+    WAIT_SECONDS = 5
+    VESPA_ATTEMPTS = 5
+    for x in range(VESPA_ATTEMPTS):
         try:
+            logger.notice(f"Setting up Vespa (attempt {x+1}/{VESPA_ATTEMPTS})...")
             document_index.ensure_indices_exist(
                 index_embedding_dim=index_setting.model_dim,
                 secondary_index_embedding_dim=secondary_index_setting.model_dim
                 if secondary_index_setting
                 else None,
             )
-            break
+
+            logger.notice("Vespa setup complete.")
+            return True
         except Exception:
-            logger.notice(f"Waiting on Vespa, retrying in {wait_time} seconds...")
-            time.sleep(wait_time)
+            logger.notice(
+                f"Vespa setup did not succeed. The Vespa service may not be ready yet. Retrying in {WAIT_SECONDS} seconds."
+            )
+            time.sleep(WAIT_SECONDS)
+
+    logger.error(
+        f"Vespa setup did not succeed. Attempt limit reached. ({VESPA_ATTEMPTS})"
+    )
+    return False
 
 
 @asynccontextmanager
@@ -357,7 +368,7 @@ async def lifespan(app: FastAPI) -> AsyncGenerator:
     # fill up Postgres connection pools
     await warm_up_connections()
 
-    # We cache this at the beginning so there is no delay in the first telemtry
+    # We cache this at the beginning so there is no delay in the first telemetry
     get_or_generate_uuid()
 
     with Session(engine) as db_session:
@@ -419,13 +430,18 @@ async def lifespan(app: FastAPI) -> AsyncGenerator:
             if secondary_search_settings
             else None,
         )
-        setup_vespa(
+
+        success = setup_vespa(
             document_index,
             IndexingSetting.from_db_model(search_settings),
             IndexingSetting.from_db_model(secondary_search_settings)
             if secondary_search_settings
             else None,
         )
+        if not success:
+            raise RuntimeError(
+                "Could not connect to Vespa within the specified timeout."
+            )
 
         logger.notice(f"Model Server: http://{MODEL_SERVER_HOST}:{MODEL_SERVER_PORT}")
         if search_settings.provider_type is None:
diff --git a/backend/danswer/natural_language_processing/search_nlp_models.py b/backend/danswer/natural_language_processing/search_nlp_models.py
index 117205761fe..6dcec724345 100644
--- a/backend/danswer/natural_language_processing/search_nlp_models.py
+++ b/backend/danswer/natural_language_processing/search_nlp_models.py
@@ -352,8 +352,8 @@ def warm_up_retry(
                 return func(*args, **kwargs)
             except Exception as e:
                 exceptions.append(e)
-                logger.exception(
-                    f"Attempt {attempt + 1} failed; retrying in {delay} seconds..."
+                logger.info(
+                    f"Attempt {attempt + 1}/{tries} failed; retrying in {delay} seconds..."
                 )
                 time.sleep(delay)
         raise Exception(f"All retries failed: {exceptions}")
diff --git a/backend/tests/integration/common_utils/reset.py b/backend/tests/integration/common_utils/reset.py
index 0b13b96501f..a13ec184b45 100644
--- a/backend/tests/integration/common_utils/reset.py
+++ b/backend/tests/integration/common_utils/reset.py
@@ -131,11 +131,13 @@ def reset_vespa() -> None:
         search_settings = get_current_search_settings(db_session)
         index_name = search_settings.index_name
 
-    setup_vespa(
+    success = setup_vespa(
         document_index=VespaIndex(index_name=index_name, secondary_index_name=None),
         index_setting=IndexingSetting.from_db_model(search_settings),
         secondary_index_setting=None,
     )
+    if not success:
+        raise RuntimeError("Could not connect to Vespa within the specified timeout.")
 
     for _ in range(5):
         try: