Reworked chunking to support mega chunks (#2032)
@@ -17,14 +17,17 @@ This Python script automates the process of running search quality tests for a b
 1. Ensure you have the required dependencies installed.
 2. Configure the `search_test_config.yaml` file based on the `search_test_config.yaml.template` file.
 3. Configure the `.env_eval` file in `deployment/docker_compose` with the correct environment variables.
-4. Navigate to Danswer repo:
+4. Set up the PYTHONPATH permanently:
+   Add the following line to your shell configuration file (e.g., `~/.bashrc`, `~/.zshrc`, or `~/.bash_profile`):
+   ```
+   export PYTHONPATH=$PYTHONPATH:/path/to/danswer/backend
+   ```
+   Replace `/path/to/danswer` with the actual path to your Danswer repository.
+   After adding this line, restart your terminal or run `source ~/.bashrc` (or the appropriate config file) to apply the changes.
+5. Navigate to Danswer repo:
    ```
    cd path/to/danswer
    ```
-5. Set Python Path variable:
-   ```
-   export PYTHONPATH=$PYTHONPATH:$PWD/backend
-   ```
 6. Navigate to the answer_quality folder:
    ```
    cd backend/tests/regression/answer_quality
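Not part of the diff: once the PYTHONPATH step above is in place, a quick sanity check can confirm that the backend directory is actually on the path and that the `danswer` package resolves. This is a minimal sketch, assuming the layout described in the steps (the `danswer` package living under `<repo>/backend`):

```
# sanity_check_pythonpath.py -- illustrative only, not part of the repository.
# Assumes the danswer package lives under <repo>/backend, as described above.
import importlib.util
import os

pythonpath = os.environ.get("PYTHONPATH", "")
entries = [p for p in pythonpath.split(os.pathsep) if p]

# The export from step 4 should have added a path ending in "backend".
print("backend entry on PYTHONPATH:", any(p.rstrip("/").endswith("backend") for p in entries))

# If the path is correct, the danswer package should be importable.
print("danswer package found:", importlib.util.find_spec("danswer") is not None)
```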
@@ -66,7 +66,6 @@ def get_answer_from_query(
    except Exception as e:
        print("Failed to answer the questions:")
        print(f"\t {str(e)}")
        print("Try restarting vespa container and trying again")
        raise e

    return context_data_list, answer
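The message printed above suggests that failures here are often transient (e.g., the Vespa container needing a restart). Purely as an illustration, and not code from the repository, a caller could wrap the call in a small retry loop; the signature of `get_answer_from_query` is assumed here for the sketch:

```
import time

def answer_with_retries(query: str, retries: int = 3, backoff_s: float = 5.0):
    """Hypothetical helper (not in the diff): retry transient failures from
    get_answer_from_query, e.g. while the Vespa container recovers."""
    for attempt in range(1, retries + 1):
        try:
            return get_answer_from_query(query)  # assumed signature, for illustration
        except Exception:
            if attempt == retries:
                raise  # give up after the last attempt
            time.sleep(backoff_s * attempt)  # simple linear backoff between attempts
```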
@@ -52,6 +52,7 @@ def upload_test_files(zip_file_path: str, env_name: str) -> None:


 def manage_file_upload(zip_file_path: str, env_name: str) -> None:
+    start_time = time.time()
     unzipped_file_paths = unzip_and_get_file_paths(zip_file_path)
     total_file_count = len(unzipped_file_paths)
     problem_file_list: list[str] = []
@@ -84,15 +85,17 @@ def manage_file_upload(zip_file_path: str, env_name: str) -> None:

         time.sleep(10)

-    problem_file_csv_path = os.path.join(current_dir, "problem_files.csv")
-    with open(problem_file_csv_path, "w", newline="") as csvfile:
-        csvwriter = csv.writer(csvfile)
-        csvwriter.writerow(["Problematic File Paths"])
-        for problem_file in problem_file_list:
-            csvwriter.writerow([problem_file])
+    if problem_file_list:
+        problem_file_csv_path = os.path.join(current_dir, "problem_files.csv")
+        with open(problem_file_csv_path, "w", newline="") as csvfile:
+            csvwriter = csv.writer(csvfile)
+            csvwriter.writerow(["Problematic File Paths"])
+            for problem_file in problem_file_list:
+                csvwriter.writerow([problem_file])

     for file in unzipped_file_paths:
         os.unlink(file)
+    print(f"Total time taken: {(time.time() - start_time)/60} minutes")


 if __name__ == "__main__":
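The change above guards the report so that `problem_files.csv` is only written when at least one upload actually failed. As a standalone illustration of the same pattern (a sketch, not the repository's helper), the guarded write can be factored out like this:

```
import csv
import os

def write_problem_files_report(problem_files: list[str], out_dir: str) -> str | None:
    """Illustrative sketch of the guarded-write pattern from the hunk above."""
    if not problem_files:
        return None  # nothing failed, so skip creating an empty report
    path = os.path.join(out_dir, "problem_files.csv")
    with open(path, "w", newline="") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["Problematic File Paths"])
        for problem_file in problem_files:
            writer.writerow([problem_file])
    return path
```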