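"""Regression-test helper for answer quality: uploads a zip of test documents
to a Danswer environment and polls indexing until every file is either indexed
or set aside as a problem file (recorded in problem_files.csv next to this
script).
"""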
import csv
import os
import tempfile
import time
import zipfile
from pathlib import Path
from types import SimpleNamespace

import yaml

from tests.regression.answer_quality.api_utils import check_indexing_status
from tests.regression.answer_quality.api_utils import create_cc_pair
from tests.regression.answer_quality.api_utils import create_connector
from tests.regression.answer_quality.api_utils import create_credential
from tests.regression.answer_quality.api_utils import run_cc_once
from tests.regression.answer_quality.api_utils import upload_file

def unzip_and_get_file_paths(zip_file_path: str) -> list[str]:
    """Extract the zip into a fresh temp dir and return the extracted file
    paths in a deterministic (sorted per directory) order."""
    persistent_dir = tempfile.mkdtemp()
    with zipfile.ZipFile(zip_file_path, "r") as zip_ref:
        zip_ref.extractall(persistent_dir)

    file_paths = []
    for root, _, files in os.walk(persistent_dir):
        for file in sorted(files):
            file_paths.append(os.path.join(root, file))

    return file_paths

def create_temp_zip_from_files(file_paths: list[str]) -> str:
    """Zip the given files (flattened to their basenames) into a temp
    archive and return the archive's path."""
    persistent_dir = tempfile.mkdtemp()
    zip_file_path = os.path.join(persistent_dir, "temp.zip")

    with zipfile.ZipFile(zip_file_path, "w") as zip_file:
        for file_path in file_paths:
            zip_file.write(file_path, Path(file_path).name)

    return zip_file_path

def upload_test_files(zip_file_path: str, env_name: str) -> None:
    """Upload the zip and wire it up: file connector plus credential ->
    connector-credential pair -> one indexing run."""
    print("zip:", zip_file_path)
    file_paths = upload_file(env_name, zip_file_path)

    conn_id = create_connector(env_name, file_paths)
    cred_id = create_credential(env_name)

    create_cc_pair(env_name, conn_id, cred_id)
    run_cc_once(env_name, conn_id, cred_id)

def manage_file_upload(zip_file_path: str, env_name: str) -> None:
    """Upload the documents and poll indexing status until every file is
    either indexed or set aside as a problem file."""
    start_time = time.time()
    unzipped_file_paths = unzip_and_get_file_paths(zip_file_path)
    total_file_count = len(unzipped_file_paths)
    problem_file_list: list[str] = []

    while True:
        doc_count, ongoing_index_attempts = check_indexing_status(env_name)

        if ongoing_index_attempts:
            print(
                f"{doc_count} docs indexed but waiting for ongoing indexing jobs to finish..."
            )
        elif not doc_count:
            # Nothing indexed yet: kick off the initial upload.
            print("No docs indexed, waiting for indexing to start")
            temp_zip_file_path = create_temp_zip_from_files(unzipped_file_paths)
            upload_test_files(temp_zip_file_path, env_name)
            os.unlink(temp_zip_file_path)
        elif (doc_count + len(problem_file_list)) < total_file_count:
            # Indexing stalled partway through: assume the first unindexed
            # file is the culprit, set it aside, and retry the rest.
            print(f"No ongoing indexing attempts but only {doc_count} docs indexed")
            remaining_files = unzipped_file_paths[doc_count + len(problem_file_list) :]
            problem_file_list.append(remaining_files.pop(0))
            print(
                f"Skipping the first unindexed doc and retrying the remaining {len(remaining_files)} docs"
            )
            temp_zip_file_path = create_temp_zip_from_files(remaining_files)
            upload_test_files(temp_zip_file_path, env_name)
            os.unlink(temp_zip_file_path)
        else:
            print(f"Successfully uploaded {doc_count} docs!")
            break

        time.sleep(10)

    if problem_file_list:
        current_dir = os.path.dirname(os.path.abspath(__file__))
        problem_file_csv_path = os.path.join(current_dir, "problem_files.csv")
        with open(problem_file_csv_path, "w", newline="") as csvfile:
            csvwriter = csv.writer(csvfile)
            csvwriter.writerow(["Problematic File Paths"])
            for problem_file in problem_file_list:
                csvwriter.writerow([problem_file])

    # Clean up the extracted files.
    for file in unzipped_file_paths:
        os.unlink(file)
    print(f"Total time taken: {(time.time() - start_time) / 60} minutes")

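# If any files are set aside, manage_file_upload writes them to a CSV next to
# this script. A sketch of that output (the path below is hypothetical; real
# entries point into the temp dir the zip was extracted to):
#
#   Problematic File Paths
#   /tmp/tmpabc123/some_document.pdf
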
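# The entrypoint below reads search_test_config.yaml from this script's
# directory. A minimal sketch of the two keys used here (values are
# placeholders; the real config may define further settings for the rest of
# the test suite):
#
#   zipped_documents_file: /path/to/test_documents.zip
#   environment_name: my_test_env
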
if __name__ == "__main__":
    current_dir = os.path.dirname(os.path.abspath(__file__))
    config_path = os.path.join(current_dir, "search_test_config.yaml")
    with open(config_path, "r") as file:
        config = SimpleNamespace(**yaml.safe_load(file))
    file_location = config.zipped_documents_file
    env_name = config.environment_name
    manage_file_upload(file_location, env_name)
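
# Usage sketch (assumptions: this module lives in tests/regression/answer_quality/
# alongside api_utils.py and search_test_config.yaml, and the directory holding
# the top-level tests/ package is on PYTHONPATH):
#
#   python -m tests.regression.answer_quality.<this_module>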