Mirror of https://github.com/danswer-ai/danswer.git, synced 2025-09-27 12:29:41 +02:00
DAN-81 Improve search round 2 (#82)
Includes:
- Multi vector indexing/search
- Ensemble model reranking
- Keyword search backend
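Of the three, ensemble reranking is the easiest to picture in isolation. The sketch below illustrates the general idea only, not this commit's implementation: several cross-encoders each score the candidate passages against the query, and candidates are re-sorted by the mean score. The model names are placeholders chosen for the example.

from sentence_transformers import CrossEncoder

# Placeholder models for illustration; the commit does not name its ensemble here.
ENSEMBLE = [
    CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2"),
    CrossEncoder("cross-encoder/ms-marco-TinyBERT-L-2-v2"),
]


def rerank(query: str, passages: list[str]) -> list[str]:
    pairs = [(query, passage) for passage in passages]
    # Each model scores every (query, passage) pair; average the scores per passage.
    per_model_scores = [model.predict(pairs) for model in ENSEMBLE]
    mean_scores = [sum(scores) / len(scores) for scores in zip(*per_model_scores)]
    ranked = sorted(zip(passages, mean_scores), key=lambda pair: pair[1], reverse=True)
    return [passage for passage, _ in ranked]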
backend/scripts/reset_indexes.py (new file, 39 lines)
@@ -0,0 +1,39 @@
+# This file is purely for development use, not included in any builds
+from danswer.configs.model_configs import DOC_EMBEDDING_DIM
+from danswer.datastores.typesense.store import create_typesense_collection
+from danswer.utils.clients import get_qdrant_client
+from danswer.utils.clients import get_typesense_client
+from danswer.utils.logging import setup_logger
+from qdrant_client.http.models import Distance
+from qdrant_client.http.models import VectorParams
+from typesense.exceptions import ObjectNotFound  # type: ignore
+
+logger = setup_logger()
+
+
+def recreate_qdrant_collection(
+    collection_name: str, embedding_dim: int = DOC_EMBEDDING_DIM
+) -> None:
+    logger.info(f"Attempting to recreate Qdrant collection {collection_name}")
+    result = get_qdrant_client().recreate_collection(
+        collection_name=collection_name,
+        vectors_config=VectorParams(size=embedding_dim, distance=Distance.COSINE),
+    )
+    if not result:
+        raise RuntimeError("Could not create Qdrant collection")
+
+
+def recreate_typesense_collection(collection_name: str) -> None:
+    logger.info(f"Attempting to recreate Typesense collection {collection_name}")
+    ts_client = get_typesense_client()
+    try:
+        ts_client.collections[collection_name].delete()
+    except ObjectNotFound:
+        logger.debug(f"Collection {collection_name} does not already exist")
+
+    create_typesense_collection(collection_name)
+
+
+if __name__ == "__main__":
+    recreate_qdrant_collection("danswer_index")
+    recreate_typesense_collection("danswer_index")
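After running the reset script (python backend/scripts/reset_indexes.py), a quick sanity check like the sketch below, which is not part of the commit, confirms that both stores recreated the collection. It reuses the same client helpers the script imports; both lookups raise if the collection is missing.

from danswer.utils.clients import get_qdrant_client
from danswer.utils.clients import get_typesense_client

collection = "danswer_index"

# qdrant-client raises if the collection is missing; a fresh one reports zero vectors.
info = get_qdrant_client().get_collection(collection_name=collection)
print(f"Qdrant vector count: {info.vectors_count}")

# typesense raises ObjectNotFound if the collection is missing.
schema = get_typesense_client().collections[collection].retrieve()
print(f"Typesense fields: {[field['name'] for field in schema['fields']]}")

The remaining hunks rework the CLI of the development query script.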
@@ -1,13 +1,12 @@
 # This file is purely for development use, not included in any builds
 import argparse
 import json
 import urllib
+from pprint import pprint
 
 import requests
 from danswer.configs.app_configs import APP_PORT
 from danswer.configs.app_configs import QDRANT_DEFAULT_COLLECTION
-from danswer.configs.constants import BLURB
-from danswer.configs.constants import SEMANTIC_IDENTIFIER
-from danswer.configs.constants import SOURCE_LINK
 from danswer.configs.constants import SOURCE_TYPE
 
 
@@ -16,35 +15,44 @@ if __name__ == "__main__":
     parser = argparse.ArgumentParser()
 
     parser.add_argument(
-        "-k",
-        "--keyword-search",
-        action="store_true",
-        help="Use keyword search if set, semantic search otherwise",
+        "-f",
+        "--flow",
+        type=str,
+        default="QA",
+        help='"Search" or "QA", defaults to "QA"',
     )
 
     parser.add_argument(
         "-t",
-        "--source-types",
+        "--type",
         type=str,
-        help="Comma separated list of source types to filter by",
+        default="Semantic",
+        help='"Semantic" or "Keyword", defaults to "Semantic"',
     )
 
     parser.add_argument(
         "-s",
         "--stream",
         action="store_true",
-        help="Enable streaming response",
+        help='Enable streaming response, only for flow="QA"',
    )
 
+    parser.add_argument(
+        "--filters",
+        type=str,
+        help="Comma separated list of source types to filter by (no spaces)",
+    )
+
     parser.add_argument("query", nargs="*", help="The query to process")
 
     previous_input = None
     while True:
         try:
             user_input = input(
                 "\n\nAsk any question:\n"
-                " - prefix with -t to add a filter by source type(s)\n"
-                " - prefix with -s to stream answer\n"
-                " - input an empty string to rerun last query\n\t"
+                " - Use -f (QA/Search) and -t (Semantic/Keyword) flags to set endpoint.\n"
+                " - prefix with -s to stream answer, --filters web,slack etc. for filters.\n"
+                " - input an empty string to rerun last query.\n\t"
             )
 
             if user_input:
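For reference, an illustrative prompt entry under the new flags (the query text is made up) looks like:

-f Search -t Keyword --filters web,slack how do I reset the indexes

which routes to the keyword-search endpoint with the source-type filter [web, slack], as the endpoint selection in the next hunk shows.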
@@ -58,62 +66,51 @@ if __name__ == "__main__":
             args = parser.parse_args(user_input.split())
 
-            keyword_search = args.keyword_search
-            source_types = args.source_types.split(",") if args.source_types else None
+            flow = str(args.flow).lower()
+            flow_type = str(args.type).lower()
+            stream = args.stream
+            source_types = args.filters.split(",") if args.filters else None
             if source_types and len(source_types) == 1:
                 source_types = source_types[0]
             query = " ".join(args.query)
 
-            endpoint = (
-                f"http://127.0.0.1:{APP_PORT}/direct-qa"
-                if not args.stream
-                else f"http://127.0.0.1:{APP_PORT}/stream-direct-qa"
-            )
-            if args.keyword_search:
-                endpoint = f"http://127.0.0.1:{APP_PORT}/keyword-search"
-                raise NotImplementedError("keyword search is not supported for now")
+            if flow not in ["qa", "search"]:
+                raise ValueError("Flow value must be QA or Search")
+            if flow_type not in ["keyword", "semantic"]:
+                raise ValueError("Type value must be keyword or semantic")
+            if flow != "qa" and stream:
+                raise ValueError("Can only stream results for QA")
+
+            if (flow, flow_type) == ("search", "keyword"):
+                path = "keyword-search"
+            elif (flow, flow_type) == ("search", "semantic"):
+                path = "semantic-search"
+            elif stream:
+                path = "stream-direct-qa"
+            else:
+                path = "direct-qa"
+
+            endpoint = f"http://127.0.0.1:{APP_PORT}/{path}"
 
             query_json = {
                 "query": query,
                 "collection": QDRANT_DEFAULT_COLLECTION,
-                "filters": [{SOURCE_TYPE: source_types}],
+                "use_keyword": flow_type == "keyword",  # Ignore if not QA Endpoints
+                "filters": json.dumps([{SOURCE_TYPE: source_types}]),
             }
-            if not args.stream:
-                response = requests.get(
-                    endpoint, params=urllib.parse.urlencode(query_json)
-                )
-                contents = json.loads(response.content)
-                if keyword_search:
-                    if contents["results"]:
-                        for link in contents["results"]:
-                            print(link)
-                    else:
-                        print("No matches found")
-                else:
-                    answer = contents.get("answer")
-                    if answer:
-                        print("Answer: " + answer)
-                    else:
-                        print("Answer: ?")
-                    if contents.get("quotes"):
-                        for ind, (quote, quote_info) in enumerate(
-                            contents["quotes"].items()
-                        ):
-                            print(f"Quote {str(ind + 1)}:\n{quote}")
-                            print(
-                                f"Semantic Identifier: {quote_info[SEMANTIC_IDENTIFIER]}"
-                            )
-                            print(f"Blurb: {quote_info[BLURB]}")
-                            print(f"Link: {quote_info[SOURCE_LINK]}")
-                            print(f"Source: {quote_info[SOURCE_TYPE]}")
-                    else:
-                        print("No quotes found")
-            else:
+
+            if args.stream:
                 with requests.get(
                     endpoint, params=urllib.parse.urlencode(query_json), stream=True
                 ) as r:
                     for json_response in r.iter_lines():
-                        print(json.loads(json_response.decode()))
+                        pprint(json.loads(json_response.decode()))
+            else:
+                response = requests.get(
+                    endpoint, params=urllib.parse.urlencode(query_json)
+                )
+                contents = json.loads(response.content)
+                pprint(contents)
 
         except Exception as e:
             print(f"Failed due to {e}, retrying")
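The same endpoints can also be exercised without the interactive loop. A minimal sketch, not part of the commit, reusing the config values the script imports (the query string and filter values here are made up):

import json
import urllib.parse

import requests
from danswer.configs.app_configs import APP_PORT
from danswer.configs.app_configs import QDRANT_DEFAULT_COLLECTION
from danswer.configs.constants import SOURCE_TYPE

query_json = {
    "query": "how does multi vector indexing work",
    "collection": QDRANT_DEFAULT_COLLECTION,
    "use_keyword": False,  # ignored by the search endpoints
    "filters": json.dumps([{SOURCE_TYPE: ["web", "slack"]}]),
}
# Hit the semantic search endpoint directly, mirroring the script's request shape.
response = requests.get(
    f"http://127.0.0.1:{APP_PORT}/semantic-search",
    params=urllib.parse.urlencode(query_json),
)
print(json.loads(response.content))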