DAN-81 Improve search round 2 (#82)

Includes:
- Multi vector indexing/search
- Ensemble model reranking
- Keyword Search backend
This commit is contained in:
Yuhong Sun
2023-06-04 20:02:32 -07:00
committed by GitHub
parent 7cc64efc3a
commit c4e8afe4d2
35 changed files with 1223 additions and 863 deletions

View File

@@ -0,0 +1,39 @@
# This file is purely for development use, not included in any builds
from danswer.configs.model_configs import DOC_EMBEDDING_DIM
from danswer.datastores.typesense.store import create_typesense_collection
from danswer.utils.clients import get_qdrant_client
from danswer.utils.clients import get_typesense_client
from danswer.utils.logging import setup_logger
from qdrant_client.http.models import Distance
from qdrant_client.http.models import VectorParams
from typesense.exceptions import ObjectNotFound # type: ignore
logger = setup_logger()
def recreate_qdrant_collection(
    collection_name: str, embedding_dim: int = DOC_EMBEDDING_DIM
) -> None:
    """Ask the Qdrant client to recreate ``collection_name``.

    The collection is configured with vectors of size ``embedding_dim``
    compared by cosine distance.

    Raises:
        RuntimeError: if ``recreate_collection`` returns a falsy result.
    """
    logger.info(f"Attempting to recreate Qdrant collection {collection_name}")

    qdrant_client = get_qdrant_client()
    vector_settings = VectorParams(size=embedding_dim, distance=Distance.COSINE)
    succeeded = qdrant_client.recreate_collection(
        collection_name=collection_name,
        vectors_config=vector_settings,
    )

    # Guard clause: nothing more to do when the client reports success.
    if succeeded:
        return
    raise RuntimeError("Could not create Qdrant collection")
def recreate_typesense_collection(collection_name: str) -> None:
    """Delete the Typesense collection ``collection_name`` if present, then create it.

    A missing collection is not an error: the ``ObjectNotFound`` raised by the
    delete call is logged at debug level and creation proceeds regardless.
    """
    logger.info(f"Attempting to recreate Typesense collection {collection_name}")

    typesense = get_typesense_client()
    try:
        # Lookup and delete stay together inside the try so that an
        # ObjectNotFound from either step is handled the same way.
        typesense.collections[collection_name].delete()
    except ObjectNotFound:
        logger.debug(f"Collection {collection_name} does not already exist")

    # Runs on both paths: after a successful delete and after "not found".
    create_typesense_collection(collection_name)
if __name__ == "__main__":
    # Development entry point: rebuild the same-named collection in both
    # backends, Qdrant first and then Typesense.
    for recreate in (recreate_qdrant_collection, recreate_typesense_collection):
        recreate("danswer_index")

View File

@@ -1,13 +1,12 @@
# This file is purely for development use, not included in any builds
import argparse
import json
import urllib
from pprint import pprint
import requests
from danswer.configs.app_configs import APP_PORT
from danswer.configs.app_configs import QDRANT_DEFAULT_COLLECTION
from danswer.configs.constants import BLURB
from danswer.configs.constants import SEMANTIC_IDENTIFIER
from danswer.configs.constants import SOURCE_LINK
from danswer.configs.constants import SOURCE_TYPE
@@ -16,35 +15,44 @@ if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"-k",
"--keyword-search",
action="store_true",
help="Use keyword search if set, semantic search otherwise",
"-f",
"--flow",
type=str,
default="QA",
help='"Search" or "QA", defaults to "QA"',
)
parser.add_argument(
"-t",
"--source-types",
"--type",
type=str,
help="Comma separated list of source types to filter by",
default="Semantic",
help='"Semantic" or "Keyword", defaults to "Semantic"',
)
parser.add_argument(
"-s",
"--stream",
action="store_true",
help="Enable streaming response",
help='Enable streaming response, only for flow="QA"',
)
parser.add_argument(
"--filters",
type=str,
help="Comma separated list of source types to filter by (no spaces)",
)
parser.add_argument("query", nargs="*", help="The query to process")
previous_input = None
while True:
try:
user_input = input(
"\n\nAsk any question:\n"
" - prefix with -t to add a filter by source type(s)\n"
" - prefix with -s to stream answer\n"
" - input an empty string to rerun last query\n\t"
" - Use -f (QA/Search) and -t (Semantic/Keyword) flags to set endpoint.\n"
" - prefix with -s to stream answer, --filters web,slack etc. for filters.\n"
" - input an empty string to rerun last query.\n\t"
)
if user_input:
@@ -58,62 +66,51 @@ if __name__ == "__main__":
args = parser.parse_args(user_input.split())
keyword_search = args.keyword_search
source_types = args.source_types.split(",") if args.source_types else None
flow = str(args.flow).lower()
flow_type = str(args.type).lower()
stream = args.stream
source_types = args.filters.split(",") if args.filters else None
if source_types and len(source_types) == 1:
source_types = source_types[0]
query = " ".join(args.query)
endpoint = (
f"http://127.0.0.1:{APP_PORT}/direct-qa"
if not args.stream
else f"http://127.0.0.1:{APP_PORT}/stream-direct-qa"
)
if args.keyword_search:
endpoint = f"http://127.0.0.1:{APP_PORT}/keyword-search"
raise NotImplementedError("keyword search is not supported for now")
if flow not in ["qa", "search"]:
raise ValueError("Flow value must be QA or Search")
if flow_type not in ["keyword", "semantic"]:
raise ValueError("Type value must be keyword or semantic")
if flow != "qa" and stream:
raise ValueError("Can only stream results for QA")
if (flow, flow_type) == ("search", "keyword"):
path = "keyword-search"
elif (flow, flow_type) == ("search", "semantic"):
path = "semantic-search"
elif stream:
path = "stream-direct-qa"
else:
path = "direct-qa"
endpoint = f"http://127.0.0.1:{APP_PORT}/{path}"
query_json = {
"query": query,
"collection": QDRANT_DEFAULT_COLLECTION,
"filters": [{SOURCE_TYPE: source_types}],
"use_keyword": flow_type == "keyword", # Ignore if not QA Endpoints
"filters": json.dumps([{SOURCE_TYPE: source_types}]),
}
if not args.stream:
response = requests.get(
endpoint, params=urllib.parse.urlencode(query_json)
)
contents = json.loads(response.content)
if keyword_search:
if contents["results"]:
for link in contents["results"]:
print(link)
else:
print("No matches found")
else:
answer = contents.get("answer")
if answer:
print("Answer: " + answer)
else:
print("Answer: ?")
if contents.get("quotes"):
for ind, (quote, quote_info) in enumerate(
contents["quotes"].items()
):
print(f"Quote {str(ind + 1)}:\n{quote}")
print(
f"Semantic Identifier: {quote_info[SEMANTIC_IDENTIFIER]}"
)
print(f"Blurb: {quote_info[BLURB]}")
print(f"Link: {quote_info[SOURCE_LINK]}")
print(f"Source: {quote_info[SOURCE_TYPE]}")
else:
print("No quotes found")
else:
if args.stream:
with requests.get(
endpoint, params=urllib.parse.urlencode(query_json), stream=True
) as r:
for json_response in r.iter_lines():
print(json.loads(json_response.decode()))
pprint(json.loads(json_response.decode()))
else:
response = requests.get(
endpoint, params=urllib.parse.urlencode(query_json)
)
contents = json.loads(response.content)
pprint(contents)
except Exception as e:
print(f"Failed due to {e}, retrying")