Mirror of https://github.com/danswer-ai/danswer.git (synced 2025-06-28 17:01:10 +02:00)

updated logging and basic search expansion procedure

This commit is contained in:
parent 85fa083717
commit 669b668463
@@ -293,6 +293,10 @@ def choose_tool(
             semantic_expansions=[semantic_expansion],
         )
 
+    logger.info(f"Original query: {agent_config.inputs.search_request.query}")
+    logger.info(f"Expanded keyword queries: {keyword_expansion}")
+    logger.info(f"Expanded semantic queries: {semantic_expansion}")
+
     return ToolChoiceUpdate(
         tool_choice=ToolChoice(
             tool=selected_tool,
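Note: the three new log lines make the query-expansion step observable at tool-choice time, emitting the original user query plus the keyword and semantic rewrites at INFO level. A minimal sketch of what that output looks like; the logger setup and the sample expansion strings below are assumptions for illustration, not part of the commit.

# Illustration only: the kind of output the new logger.info calls produce.
# The query and expansion values are made up.
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("tool_choice_demo")

original_query = "how do I rotate the service API keys?"       # hypothetical
keyword_expansion = ["service API key rotation"]                # hypothetical
semantic_expansion = ["process for rotating API credentials"]   # hypothetical

logger.info(f"Original query: {original_query}")
logger.info(f"Expanded keyword queries: {keyword_expansion}")
logger.info(f"Expanded semantic queries: {semantic_expansion}")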
@@ -213,8 +213,12 @@ def _apply_pruning(
     try:
         logger.debug(f"Number of documents after pruning: {ind + 1}")
         logger.debug("Number of tokens per document (pruned):")
 
+        log_tokens_per_document: dict[int, int] = {}
         for x, y in section_idx_token_count.items():
-            logger.debug(f"{x + 1}: {y}")
+            log_tokens_per_document[x + 1] = y
+
+        logger.debug(f"Tokens per document: {log_tokens_per_document}")
 
     except Exception as e:
         logger.error(f"Error logging prune statistics: {e}")
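Note: this hunk (and the _merge_sections hunk below) applies the same pattern: instead of one debug line per document inside the loop, per-document counts are collected into a dict and logged once. A small self-contained sketch of that pattern; the token counts are invented.

# Sketch of the aggregated-logging pattern: collect per-document counts into a
# 1-based dict, then emit a single debug line instead of one line per document.
# section_idx_token_count is a made-up stand-in for the real mapping.
import logging

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger("pruning_demo")

section_idx_token_count = {0: 512, 1: 348, 2: 97}  # hypothetical token counts

log_tokens_per_document: dict[int, int] = {}
for idx, tokens in section_idx_token_count.items():
    log_tokens_per_document[idx + 1] = tokens

logger.debug(f"Tokens per document: {log_tokens_per_document}")
# emits one line such as: Tokens per document: {1: 512, 2: 348, 3: 97}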
@@ -421,8 +425,14 @@ def _merge_sections(sections: list[InferenceSection]) -> list[InferenceSection]:
         )
 
         logger.debug("Number of chunks per document (new ranking):")
 
+        log_chunks_per_document: dict[int, int] = {}
         for x, y in enumerate(new_sections):
-            logger.debug(f"{x + 1}: {len(y.chunks)}")
+            log_chunks_per_document[x + 1] = len(y.chunks)
 
+        logger.debug(f"Chunks per document: {log_chunks_per_document}")
 
     except Exception as e:
         logger.error(f"Error logging merge statistics: {e}")
@@ -161,8 +161,12 @@ def doc_index_retrieval(
 
     keyword_embeddings_thread: TimeoutThread[list[Embedding]] | None = None
     semantic_embeddings_thread: TimeoutThread[list[Embedding]] | None = None
-    top_base_chunks_thread: TimeoutThread[list[InferenceChunkUncleaned]] | None = None
+    top_base_chunks_standard_ranking_thread: (
+        TimeoutThread[list[InferenceChunkUncleaned]] | None
+    ) = None
+    top_base_chunks_keyword_ranking_thread: (
+        TimeoutThread[list[InferenceChunkUncleaned]] | None
+    ) = None
     top_semantic_chunks_thread: TimeoutThread[list[InferenceChunkUncleaned]] | None = (
         None
     )
@@ -173,7 +177,7 @@ def doc_index_retrieval(
     top_semantic_chunks: list[InferenceChunkUncleaned] | None = None
 
         # original retrieveal method
-        top_base_chunks_thread = run_in_background(
+        top_base_chunks_standard_ranking_thread = run_in_background(
            document_index.hybrid_retrieval,
            query.query,
            query_embedding,
@@ -182,7 +186,21 @@ def doc_index_retrieval(
             query.hybrid_alpha,
             query.recency_bias_multiplier,
             query.num_hits,
-            "semantic",
+            QueryExpansionType.SEMANTIC,
+            query.offset,
+        )
+
+        # same query but with 1st vespa phase as keyword retrieval
+        top_base_chunks_keyword_ranking_thread = run_in_background(
+            document_index.hybrid_retrieval,
+            query.query,
+            query_embedding,
+            query.processed_keywords,
+            query.filters,
+            query.hybrid_alpha,
+            query.recency_bias_multiplier,
+            query.num_hits,
+            QueryExpansionType.KEYWORD,
             query.offset,
         )
 
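Note: the new code launches a second hybrid retrieval in parallel, identical to the first except that the first Vespa ranking phase is keyword-based (QueryExpansionType.KEYWORD) rather than semantic. A rough, self-contained sketch of that fan-out/join shape; run_in_background and wait_on_background below are simple thread-pool stand-ins for the project's helpers of the same name, and hybrid_retrieval is faked (the real call takes the embedding, filters, hybrid_alpha, recency bias, num_hits, offset, and more).

# Sketch of the parallel retrieval fan-out/join used in this hunk; all helpers
# are stand-ins that only mimic the call shape visible in the diff.
from concurrent.futures import Future, ThreadPoolExecutor
from enum import Enum

_executor = ThreadPoolExecutor(max_workers=4)


class QueryExpansionType(Enum):  # mirrors the enum referenced in the diff
    KEYWORD = "keyword"
    SEMANTIC = "semantic"


def run_in_background(fn, *args) -> Future:
    """Stand-in: schedule fn(*args) on a worker thread."""
    return _executor.submit(fn, *args)


def wait_on_background(future: Future):
    """Stand-in: block until the background call finishes and return its result."""
    return future.result()


def hybrid_retrieval(query: str, ranking_phase: QueryExpansionType) -> list[str]:
    # Hypothetical retrieval returning chunk ids for demonstration purposes.
    return [f"{ranking_phase.value}-chunk-{i}" for i in range(3)]


# Same query, two first-phase rankings, run concurrently and then joined.
standard_thread = run_in_background(
    hybrid_retrieval, "api key rotation", QueryExpansionType.SEMANTIC
)
keyword_thread = run_in_background(
    hybrid_retrieval, "api key rotation", QueryExpansionType.KEYWORD
)

top_standard = wait_on_background(standard_thread)
top_keyword = wait_on_background(keyword_thread)
print(top_standard + top_keyword)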
@@ -243,7 +261,12 @@ def doc_index_retrieval(
             query.offset,
         )
 
-        top_base_chunks = wait_on_background(top_base_chunks_thread)
+        top_base_chunks_standard_ranking = wait_on_background(
+            top_base_chunks_standard_ranking_thread
+        )
+        top_base_chunks_keyword_ranking = wait_on_background(
+            top_base_chunks_keyword_ranking_thread
+        )
         top_keyword_chunks = wait_on_background(top_keyword_chunks_thread)
 
@@ -251,7 +274,11 @@ def doc_index_retrieval(
         assert top_semantic_chunks_thread is not None
         top_semantic_chunks = wait_on_background(top_semantic_chunks_thread)
 
-        all_top_chunks = top_base_chunks + top_keyword_chunks
+        all_top_chunks = (
+            top_base_chunks_standard_ranking
+            + top_base_chunks_keyword_ranking
+            + top_keyword_chunks
+        )
 
         # use all three retrieval methods to retrieve top chunks
 
@@ -263,8 +290,17 @@ def doc_index_retrieval(
 
     else:
 
-        top_base_chunks = wait_on_background(top_base_chunks_thread)
-        top_chunks = _dedupe_chunks(top_base_chunks)
+        top_base_chunks_standard_ranking = wait_on_background(
+            top_base_chunks_standard_ranking_thread
+        )
+        top_base_chunks_keyword_ranking = wait_on_background(
+            top_base_chunks_keyword_ranking_thread
+        )
+        top_chunks = _dedupe_chunks(
+            top_base_chunks_standard_ranking + top_base_chunks_keyword_ranking
+        )
 
+    logger.info(f"Overall number of top initial retrieval chunks: {len(top_chunks)}")
+
     retrieval_requests: list[VespaChunkRequest] = []
     normal_chunks: list[InferenceChunkUncleaned] = []
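Note: because the standard-ranking and keyword-ranking retrievals can return overlapping chunks, the fallback branch now concatenates both result lists and de-duplicates them before the overall count is logged. The diff does not show _dedupe_chunks itself; the sketch below assumes de-duplication on a (document_id, chunk_id) key, which is an assumption rather than a copy of the real helper.

# Hypothetical de-duplication over the combined retrieval results; Chunk is a
# stand-in for InferenceChunkUncleaned and the (document_id, chunk_id) key is
# an assumption about how duplicates are identified.
from dataclasses import dataclass


@dataclass(frozen=True)
class Chunk:
    document_id: str
    chunk_id: int


def dedupe_chunks(chunks: list[Chunk]) -> list[Chunk]:
    seen: set[tuple[str, int]] = set()
    unique: list[Chunk] = []
    for chunk in chunks:
        key = (chunk.document_id, chunk.chunk_id)
        if key not in seen:
            seen.add(key)
            unique.append(chunk)
    return unique


standard_ranking = [Chunk("doc-a", 0), Chunk("doc-a", 1)]
keyword_ranking = [Chunk("doc-a", 1), Chunk("doc-b", 0)]

top_chunks = dedupe_chunks(standard_ranking + keyword_ranking)
print(f"Overall number of top initial retrieval chunks: {len(top_chunks)}")  # -> 3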
@@ -358,7 +358,7 @@ def query_vespa(
         num_retrieved_document_ids = len(
             set([chunk.document_id for chunk in inference_chunks])
         )
-        logger.debug(
+        logger.info(
            f"Retrieved {num_retrieved_inference_chunks} inference chunks for {num_retrieved_document_ids} documents"
        )
    except Exception as e:
|
@ -822,6 +822,8 @@ class VespaIndex(DocumentIndex):
|
|||||||
else:
|
else:
|
||||||
ranking_profile = f"hybrid_search_semantic_base_{len(query_embedding)}"
|
ranking_profile = f"hybrid_search_semantic_base_{len(query_embedding)}"
|
||||||
|
|
||||||
|
logger.info(f"Selected ranking profile: {ranking_profile}")
|
||||||
|
|
||||||
logger.debug(f"Query YQL: {yql}")
|
logger.debug(f"Query YQL: {yql}")
|
||||||
|
|
||||||
params: dict[str, str | int | float] = {
|
params: dict[str, str | int | float] = {
|
||||||
|
@@ -289,15 +289,21 @@ Rephrased query for search engine:
 
 
 QUERY_KEYWORD_EXPANSION_WITHOUT_HISTORY_PROMPT = """
-Please rephrase the following user question as a keyword query that would be appropriate for a \
-search engine.
+Please rephrase the following user question as a pure keyword query that would be appropriate for a \
+search engine. IMPORTANT: the rephrased query MUST ONLY use EXISTING KEYWORDS from the original query \
+(exception: critical verbs that are converted to nouns)!
+Also, keywords are usually nouns or adjectives, so you will likely need to drop \
+any verbs. IF AND ONLY IF you really think that a verb would be critical to FINDING the document, \
+convert the verb to a noun. \
+This will be rare though. Verbs like 'find, summarize, describe, etc. would NOT fall into this category, \
+for example, and should be omitted from the rephrased keyword query.
 
 Here is the user question:
 {question}
 
-Respond with EXACTLY and ONLY one rephrased query.
+Respond with EXACTLY and ONLY one rephrased keyword query.
 
-Rephrased query for search engine:
+Rephrased keyword query for search engine:
 """.strip()
 
 
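Note: the tightened keyword-expansion prompt keeps the same {question} placeholder, so it is presumably filled with str.format before being sent to the model; the exact call site is not part of this diff. A minimal sketch with the prompt text abbreviated and an invented question.

# Minimal sketch of filling the keyword-expansion template; prompt abbreviated,
# question hypothetical.
QUERY_KEYWORD_EXPANSION_WITHOUT_HISTORY_PROMPT = """
Please rephrase the following user question as a pure keyword query that would be appropriate for a \
search engine. [...]

Here is the user question:
{question}

Respond with EXACTLY and ONLY one rephrased keyword query.

Rephrased keyword query for search engine:
""".strip()

prompt = QUERY_KEYWORD_EXPANSION_WITHOUT_HISTORY_PROMPT.format(
    question="How do I configure SSO for the admin panel?"
)
print(prompt)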