Mirror of https://github.com/danswer-ai/danswer.git (synced 2025-07-04 11:41:04 +02:00).
Commit: "Introduce Time Filters (#610)". The diff contained in this commit follows.
@ -1,3 +1,4 @@
|
||||
# This file is purely for development use, not included in any builds
|
||||
import subprocess
|
||||
import threading
|
||||
|
||||
|
@ -1,25 +0,0 @@
|
||||
from danswer.configs.app_configs import DOCUMENT_INDEX_NAME
|
||||
from danswer.utils.clients import get_typesense_client
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Dev utility: dump every document in the Typesense index to stdout,
    # walking the search results one page at a time.
    client = get_typesense_client()
    collection = client.collections[DOCUMENT_INDEX_NAME]

    page = 1
    while True:
        hits = collection.documents.search(
            {
                "q": "",
                "query_by": "content",
                "page": page,
                "per_page": 100,  # documents fetched per request
            }
        ).get("hits")
        if not hits:
            break  # no more documents — all pages consumed
        for hit in hits:
            print(hit)
        page += 1
|
@ -1,46 +1,13 @@
|
||||
# This file is purely for development use, not included in any builds
|
||||
import requests
|
||||
from qdrant_client.http.models import Distance
|
||||
from qdrant_client.http.models import VectorParams
|
||||
from typesense.exceptions import ObjectNotFound # type: ignore
|
||||
|
||||
from danswer.configs.app_configs import DOCUMENT_INDEX_NAME
|
||||
from danswer.configs.model_configs import DOC_EMBEDDING_DIM
|
||||
from danswer.datastores.document_index import get_default_document_index
|
||||
from danswer.datastores.document_index import SplitDocumentIndex
|
||||
from danswer.datastores.typesense.store import create_typesense_collection
|
||||
from danswer.datastores.vespa.store import DOCUMENT_ID_ENDPOINT
|
||||
from danswer.datastores.vespa.store import VespaIndex
|
||||
from danswer.utils.clients import get_qdrant_client
|
||||
from danswer.utils.clients import get_typesense_client
|
||||
from danswer.utils.logger import setup_logger
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
|
||||
def recreate_qdrant_collection(
    collection_name: str, embedding_dim: int = DOC_EMBEDDING_DIM
) -> None:
    """Drop and re-create the named Qdrant collection with cosine-distance vectors.

    Raises RuntimeError if Qdrant reports the recreation did not succeed.
    """
    logger.info(f"Attempting to recreate Qdrant collection {collection_name}")
    vector_config = VectorParams(size=embedding_dim, distance=Distance.COSINE)
    succeeded = get_qdrant_client().recreate_collection(
        collection_name=collection_name,
        vectors_config=vector_config,
    )
    if not succeeded:
        raise RuntimeError("Could not create Qdrant collection")
|
||||
|
||||
|
||||
def recreate_typesense_collection(collection_name: str) -> None:
    """Delete the named Typesense collection (if it exists) and create it fresh."""
    logger.info(f"Attempting to recreate Typesense collection {collection_name}")
    client = get_typesense_client()
    try:
        client.collections[collection_name].delete()
    except ObjectNotFound:
        # Nothing to drop — first-time creation.
        logger.debug(f"Collection {collection_name} does not already exist")
    create_typesense_collection(collection_name)
|
||||
|
||||
|
||||
def wipe_vespa_index() -> None:
|
||||
params = {"selection": "true", "cluster": DOCUMENT_INDEX_NAME}
|
||||
response = requests.delete(DOCUMENT_ID_ENDPOINT, params=params)
|
||||
@ -48,9 +15,4 @@ def wipe_vespa_index() -> None:
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
document_index = get_default_document_index()
|
||||
if isinstance(document_index, SplitDocumentIndex):
|
||||
recreate_qdrant_collection("danswer_index")
|
||||
recreate_typesense_collection("danswer_index")
|
||||
elif isinstance(document_index, VespaIndex):
|
||||
wipe_vespa_index()
|
||||
wipe_vespa_index()
|
||||
|
@ -3,28 +3,17 @@ import argparse
|
||||
import json
|
||||
import os
|
||||
import subprocess
|
||||
from datetime import datetime
|
||||
|
||||
import requests
|
||||
from qdrant_client.http.models.models import SnapshotDescription
|
||||
from typesense.exceptions import ObjectNotFound # type: ignore
|
||||
|
||||
from alembic import command
|
||||
from alembic.config import Config
|
||||
from danswer.configs.app_configs import DOCUMENT_INDEX_NAME
|
||||
from danswer.configs.app_configs import POSTGRES_DB
|
||||
from danswer.configs.app_configs import POSTGRES_HOST
|
||||
from danswer.configs.app_configs import POSTGRES_PASSWORD
|
||||
from danswer.configs.app_configs import POSTGRES_PORT
|
||||
from danswer.configs.app_configs import POSTGRES_USER
|
||||
from danswer.configs.app_configs import QDRANT_HOST
|
||||
from danswer.configs.app_configs import QDRANT_PORT
|
||||
from danswer.datastores.qdrant.utils import create_qdrant_collection
|
||||
from danswer.datastores.qdrant.utils import list_qdrant_collections
|
||||
from danswer.datastores.typesense.store import create_typesense_collection
|
||||
from danswer.datastores.vespa.store import DOCUMENT_ID_ENDPOINT
|
||||
from danswer.utils.clients import get_qdrant_client
|
||||
from danswer.utils.clients import get_typesense_client
|
||||
from danswer.utils.logger import setup_logger
|
||||
|
||||
logger = setup_logger()
|
||||
@ -51,80 +40,6 @@ def load_postgres(filename: str) -> None:
|
||||
)
|
||||
|
||||
|
||||
def snapshot_time_compare(snap: SnapshotDescription) -> datetime:
    """Sort key: parse a snapshot's creation timestamp into a datetime.

    Raises RuntimeError when the snapshot carries no creation time.
    """
    creation_time = getattr(snap, "creation_time", None)
    if creation_time is None:
        raise RuntimeError("Qdrant Snapshots Failed")
    return datetime.strptime(creation_time, "%Y-%m-%dT%H:%M:%S")
|
||||
|
||||
|
||||
def save_qdrant(filename: str) -> None:
    """Take a Qdrant snapshot of the document index and download it to *filename*.

    Creates a fresh snapshot, picks the newest snapshot by creation time, and
    streams it over HTTP into the local file.

    Raises:
        RuntimeError: if no usable snapshot exists or the download fails.
    """
    logger.info("Attempting to take Qdrant snapshot")
    qdrant_client = get_qdrant_client()
    qdrant_client.create_snapshot(collection_name=DOCUMENT_INDEX_NAME)
    snapshots = qdrant_client.list_snapshots(collection_name=DOCUMENT_INDEX_NAME)
    valid_snapshots = [snap for snap in snapshots if snap.creation_time is not None]
    if not valid_snapshots:
        # Guard: without this, an empty list would surface as a bare IndexError below
        raise RuntimeError("Qdrant Snapshots Failed")

    sorted_snapshots = sorted(valid_snapshots, key=snapshot_time_compare)
    last_snapshot_name = sorted_snapshots[-1].name
    url = f"http://{QDRANT_HOST}:{QDRANT_PORT}/collections/{DOCUMENT_INDEX_NAME}/snapshots/{last_snapshot_name}"

    # Stream the snapshot body; the context manager releases the connection
    # even when the download aborts partway (the original leaked it on error).
    with requests.get(url, stream=True) as response:
        if response.status_code != 200:
            raise RuntimeError("Qdrant Save Failed")

        with open(filename, "wb") as file:
            for chunk in response.iter_content(chunk_size=8192):
                file.write(chunk)
|
||||
|
||||
|
||||
def load_qdrant(filename: str) -> None:
    """Upload a local Qdrant snapshot file and recover the document index from it."""
    logger.info("Attempting to load Qdrant snapshot")
    existing_collections = {
        collection.name for collection in list_qdrant_collections().collections
    }
    if DOCUMENT_INDEX_NAME not in existing_collections:
        create_qdrant_collection(DOCUMENT_INDEX_NAME)
    snapshot_url = f"http://{QDRANT_HOST}:{QDRANT_PORT}/collections/{DOCUMENT_INDEX_NAME}/snapshots/"

    snapshot_name = os.path.basename(filename)
    with open(filename, "rb") as f:
        upload_response = requests.post(
            snapshot_url + "upload",
            files={"snapshot": (snapshot_name, f)},
        )
        if upload_response.status_code != 200:
            raise RuntimeError("Qdrant Snapshot Upload Failed")

    # Tell Qdrant to recover the collection from the uploaded snapshot.
    recover_response = requests.put(
        snapshot_url + "recover",
        data=json.dumps({"location": snapshot_url + snapshot_name}),
        headers={"Content-Type": "application/json"},
    )
    if recover_response.status_code != 200:
        raise RuntimeError("Loading Qdrant Snapshot Failed")
|
||||
|
||||
|
||||
def save_typesense(filename: str) -> None:
    """Export the entire Typesense document collection into *filename* (JSONL)."""
    logger.info("Attempting to take Typesense snapshot")
    client = get_typesense_client()
    exported_docs = client.collections[DOCUMENT_INDEX_NAME].documents.export()
    with open(filename, "w") as snapshot_file:
        snapshot_file.write(exported_docs)
|
||||
|
||||
|
||||
def load_typesense(filename: str) -> None:
    """Rebuild the Typesense document collection from a JSONL export file."""
    logger.info("Attempting to load Typesense snapshot")
    client = get_typesense_client()
    try:
        client.collections[DOCUMENT_INDEX_NAME].delete()
    except ObjectNotFound:
        # Collection absent — nothing to wipe before re-creating.
        pass

    create_typesense_collection(DOCUMENT_INDEX_NAME)

    with open(filename) as jsonl_file:
        payload = jsonl_file.read().encode("utf-8")
        client.collections[DOCUMENT_INDEX_NAME].documents.import_(
            payload, {"action": "create"}
        )
|
||||
|
||||
|
||||
def save_vespa(filename: str) -> None:
|
||||
logger.info("Attempting to take Vespa snapshot")
|
||||
continuation = ""
|
||||
@ -189,10 +104,6 @@ if __name__ == "__main__":
|
||||
if args.load:
|
||||
load_postgres(os.path.join(checkpoint_dir, "postgres_snapshot.tar"))
|
||||
load_vespa(os.path.join(checkpoint_dir, "vespa_snapshot.jsonl"))
|
||||
# load_qdrant(os.path.join(checkpoint_dir, "qdrant.snapshot"))
|
||||
# load_typesense(os.path.join(checkpoint_dir, "typesense_snapshot.jsonl"))
|
||||
else:
|
||||
save_postgres(os.path.join(checkpoint_dir, "postgres_snapshot.tar"))
|
||||
save_vespa(os.path.join(checkpoint_dir, "vespa_snapshot.jsonl"))
|
||||
# save_qdrant(os.path.join(checkpoint_dir, "qdrant.snapshot"))
|
||||
# save_typesense(os.path.join(checkpoint_dir, "typesense_snapshot.jsonl"))
|
||||
|
Reference in New Issue
Block a user