2024-08-14 22:18:53 -07:00

109 lines
4.0 KiB
Python

import csv
import os
import tempfile
import time
import zipfile
from pathlib import Path
from types import SimpleNamespace
import yaml
from tests.regression.answer_quality.api_utils import check_indexing_status
from tests.regression.answer_quality.api_utils import create_cc_pair
from tests.regression.answer_quality.api_utils import create_connector
from tests.regression.answer_quality.api_utils import create_credential
from tests.regression.answer_quality.api_utils import run_cc_once
from tests.regression.answer_quality.api_utils import upload_file
def unzip_and_get_file_paths(zip_file_path: str) -> list[str]:
    """Extract a zip archive into a fresh temp directory and list its files.

    Args:
        zip_file_path: Path of the zip archive to extract.

    Returns:
        The paths of every extracted file, gathered by walking the
        extraction directory. File names are sorted within each directory.
        The extraction directory is not removed here; cleanup is left to
        the caller.
    """
    extraction_root = tempfile.mkdtemp()
    with zipfile.ZipFile(zip_file_path, "r") as archive:
        archive.extractall(extraction_root)

    return [
        os.path.join(dir_path, file_name)
        for dir_path, _, file_names in os.walk(extraction_root)
        for file_name in sorted(file_names)
    ]
def create_temp_zip_from_files(file_paths: list[str]) -> str:
    """Bundle the given files into ``temp.zip`` inside a new temp directory.

    Each file is stored under its base name only, so the archive is flat;
    inputs that share a base name end up as duplicate entries.

    Args:
        file_paths: Paths of the files to add to the archive.

    Returns:
        The path of the archive that was written. Neither the archive nor
        its directory is removed here.
    """
    archive_path = os.path.join(tempfile.mkdtemp(), "temp.zip")
    with zipfile.ZipFile(archive_path, "w") as archive:
        for source_path in file_paths:
            entry_name = Path(source_path).name
            archive.write(source_path, entry_name)
    return archive_path
def upload_test_files(zip_file_path: str, env_name: str) -> None:
    """Upload a zip archive and start one indexing run for it.

    The steps run in a fixed order: upload the file, create a connector
    from the upload result, create a credential, pair the two, then
    trigger a single run of that pair.

    Args:
        zip_file_path: Path of the zip archive to upload.
        env_name: Environment name forwarded to every API helper.
    """
    print("zip:", zip_file_path)

    # NOTE(review): presumably the server-side paths of the uploaded files;
    # confirm against api_utils.upload_file.
    uploaded_paths = upload_file(env_name, zip_file_path)

    connector_id = create_connector(env_name, uploaded_paths)
    credential_id = create_credential(env_name)

    create_cc_pair(env_name, connector_id, credential_id)
    run_cc_once(env_name, connector_id, credential_id)
def _upload_file_batch(file_paths: list[str], env_name: str) -> None:
    """Zip ``file_paths`` into a temporary archive, upload it, then clean up."""
    temp_zip_file_path = create_temp_zip_from_files(file_paths)
    try:
        upload_test_files(temp_zip_file_path, env_name)
    finally:
        # Remove the archive even when the upload raises, so a failed run
        # does not leave temporary zips behind.
        os.unlink(temp_zip_file_path)
        try:
            # Best effort: the archive lives in its own temp directory, which
            # is empty once the archive is gone. rmdir refuses to delete a
            # non-empty directory, so this cannot remove anything else.
            os.rmdir(os.path.dirname(temp_zip_file_path))
        except OSError:
            pass


def manage_file_upload(zip_file_path: str, env_name: str) -> None:
    """Upload every file of a zip archive, retrying around files that stall.

    The archive is extracted once, then the indexing status is polled every
    10 seconds:

    * While index attempts are ongoing, just wait.
    * If nothing is indexed yet, upload all extracted files.
    * If indexing stopped short of the expected total, treat the first
      ``doc_count + len(problem_file_list)`` files as handled, record the
      next file as problematic, and upload the files after it.
    * Otherwise report success and stop.

    Problematic files, if any, are written to ``problem_files.csv`` in the
    directory containing this module. The extracted files are deleted at the
    end.

    Args:
        zip_file_path: Path of the zip archive holding the documents.
        env_name: Environment name forwarded to the API helpers.
    """
    start_time = time.time()
    unzipped_file_paths = unzip_and_get_file_paths(zip_file_path)
    total_file_count = len(unzipped_file_paths)
    problem_file_list: list[str] = []

    if not total_file_count:
        # With no files the loop below could re-upload an empty archive
        # forever while waiting for a document count that never arrives.
        print(f"No files found in {zip_file_path}, nothing to upload")
        return

    while True:
        doc_count, ongoing_index_attempts = check_indexing_status(env_name)

        if ongoing_index_attempts:
            print(
                f"{doc_count} docs indexed but waiting for ongoing indexing jobs to finish..."
            )
        elif not doc_count:
            print("No docs indexed, waiting for indexing to start")
            _upload_file_batch(unzipped_file_paths, env_name)
        elif (doc_count + len(problem_file_list)) < total_file_count:
            print(f"No ongooing indexing attempts but only {doc_count} docs indexed")
            remaining_files = unzipped_file_paths[doc_count + len(problem_file_list) :]
            # The first file that has not been accounted for is blamed for the stall.
            problem_file_list.append(remaining_files.pop(0))
            print(
                f"Removing first doc and grabbed last {len(remaining_files)} docs to try agian"
            )
            # Nothing is left to retry when the blamed file was the last one;
            # the next poll then falls through to the success branch.
            if remaining_files:
                _upload_file_batch(remaining_files, env_name)
        else:
            print(f"Successfully uploaded {doc_count} docs!")
            break

        time.sleep(10)

    if problem_file_list:
        # Resolve the output directory here rather than relying on a global
        # that only exists when this module is executed as a script.
        output_dir = os.path.dirname(os.path.abspath(__file__))
        problem_file_csv_path = os.path.join(output_dir, "problem_files.csv")
        with open(problem_file_csv_path, "w", newline="") as csvfile:
            csvwriter = csv.writer(csvfile)
            csvwriter.writerow(["Problematic File Paths"])
            for problem_file in problem_file_list:
                csvwriter.writerow([problem_file])

    for file in unzipped_file_paths:
        os.unlink(file)

    print(f"Total time taken: {(time.time() - start_time)/60} minutes")
if __name__ == "__main__":
    # Script entry point: read search_test_config.yaml from this script's
    # directory and upload the zipped documents it points at.
    # `current_dir` stays a module-level name because other code in this
    # module may look it up.
    current_dir = os.path.dirname(os.path.abspath(__file__))
    config_path = os.path.join(current_dir, "search_test_config.yaml")

    with open(config_path, "r") as config_file:
        raw_config = yaml.safe_load(config_file)
    config = SimpleNamespace(**raw_config)

    manage_file_upload(config.zipped_documents_file, config.environment_name)