mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-06-02 19:19:30 +02:00
Expert Recommendation Heuristic Only (#791)
This commit is contained in:
parent
006fd4c438
commit
fda89ac810
@ -23,6 +23,7 @@ from danswer.direct_qa.interfaces import StreamingError
|
||||
from danswer.direct_qa.models import LLMMetricsContainer
|
||||
from danswer.direct_qa.qa_utils import get_chunks_for_qa
|
||||
from danswer.document_index.factory import get_default_document_index
|
||||
from danswer.expert_recommendation.heuristics_based import extract_experts
|
||||
from danswer.indexing.models import InferenceChunk
|
||||
from danswer.search.models import QueryFlow
|
||||
from danswer.search.models import RerankMetricsContainer
|
||||
@ -33,6 +34,7 @@ from danswer.search.search_runner import chunks_to_search_docs
|
||||
from danswer.search.search_runner import full_chunk_search
|
||||
from danswer.search.search_runner import full_chunk_search_generator
|
||||
from danswer.secondary_llm_flows.answer_validation import get_answer_validity
|
||||
from danswer.server.models import ExpertsResponse
|
||||
from danswer.server.models import LLMRelevanceFilterResponse
|
||||
from danswer.server.models import NewMessageRequest
|
||||
from danswer.server.models import QADocsResponse
|
||||
@ -241,6 +243,11 @@ def answer_qa_query_stream(
|
||||
# first fetch and return to the UI the top chunks so the user can
|
||||
# immediately see some results
|
||||
top_chunks = cast(list[InferenceChunk], next(search_generator))
|
||||
|
||||
expert_emails = extract_experts(top_chunks)
|
||||
expert_response = ExpertsResponse(experts=expert_emails).dict()
|
||||
yield get_json_line(expert_response)
|
||||
|
||||
top_docs = chunks_to_search_docs(top_chunks)
|
||||
initial_response = QADocsResponse(
|
||||
top_documents=top_docs,
|
||||
|
@ -500,6 +500,8 @@ def _vespa_hit_to_inference_chunk(hit: dict[str, Any]) -> InferenceChunk:
|
||||
recency_bias=fields["matchfeatures"][RECENCY_BIAS],
|
||||
score=hit["relevance"],
|
||||
hidden=fields.get(HIDDEN, False),
|
||||
primary_owners=fields.get(PRIMARY_OWNERS),
|
||||
secondary_owners=fields.get(SECONDARY_OWNERS),
|
||||
metadata=metadata,
|
||||
match_highlights=match_highlights,
|
||||
updated_at=updated_at,
|
||||
@ -558,6 +560,8 @@ class VespaIndex(DocumentIndex):
|
||||
f"{BOOST}, "
|
||||
f"{HIDDEN}, "
|
||||
f"{DOC_UPDATED_AT}, "
|
||||
f"{PRIMARY_OWNERS}, "
|
||||
f"{SECONDARY_OWNERS}, "
|
||||
f"{METADATA}, "
|
||||
f"{CONTENT_SUMMARY} "
|
||||
f"from {DOCUMENT_INDEX_NAME} where "
|
||||
|
0
backend/danswer/expert_recommendation/__init__.py
Normal file
0
backend/danswer/expert_recommendation/__init__.py
Normal file
35
backend/danswer/expert_recommendation/heuristics_based.py
Normal file
35
backend/danswer/expert_recommendation/heuristics_based.py
Normal file
@ -0,0 +1,35 @@
|
||||
from collections import defaultdict
|
||||
|
||||
from danswer.indexing.models import InferenceChunk
|
||||
from danswer.utils.text_processing import is_valid_email
|
||||
|
||||
# What is the minimum cumulative score for a user to be considered an Expert
|
||||
# If a set of 50 results is shown, user needs a cumulative doc score of 2.5 to be an expert
|
||||
_EXPERT_SCORE_RATIO = 2.5 / 50
|
||||
# How much should a score be discounted if the user is not the primary owner
|
||||
_SECONDARY_OWNER_DISCOUNT = 0.5
|
||||
|
||||
|
||||
def extract_experts(
    chunks: list[InferenceChunk], score_ratio: float = _EXPERT_SCORE_RATIO
) -> list[str]:
    """Heuristically pick "expert" emails from retrieved chunks.

    Each chunk's relevance score is credited to its primary owners in full
    and to its secondary owners at ``_SECONDARY_OWNER_DISCOUNT``. An owner
    qualifies as an expert when their cumulative score reaches
    ``score_ratio * len(chunks)`` and their identifier looks like an email.

    Args:
        chunks: retrieved chunks, each carrying an optional ``score`` and
            optional primary/secondary owner lists.
        score_ratio: per-chunk score threshold factor; the default means a
            cumulative score of 2.5 over a 50-chunk result set.

    Returns:
        List of owner emails meeting the threshold (unordered).
    """
    target_score = score_ratio * len(chunks)

    expert_scores: dict[str, float] = defaultdict(float)

    for chunk in chunks:
        # A missing/zero score contributes nothing — skip the chunk once
        # here instead of re-checking inside every per-owner iteration.
        if not chunk.score:
            continue

        # Owner lists may be None; `or []` keeps the loops uniform.
        for p_owner in chunk.primary_owners or []:
            expert_scores[p_owner] += chunk.score

        for s_owner in chunk.secondary_owners or []:
            expert_scores[s_owner] += _SECONDARY_OWNER_DISCOUNT * chunk.score

    return [
        owner
        for owner, score in expert_scores.items()
        if score >= target_score and is_valid_email(owner)
    ]
|
@ -93,6 +93,8 @@ class InferenceChunk(BaseChunk):
|
||||
match_highlights: list[str]
|
||||
# when the doc was last updated
|
||||
updated_at: datetime | None
|
||||
primary_owners: list[str] | None = None
|
||||
secondary_owners: list[str] | None = None
|
||||
|
||||
@property
|
||||
def unique_id(self) -> str:
|
||||
|
@ -202,6 +202,10 @@ class SearchFeedbackRequest(BaseModel):
|
||||
search_feedback: SearchFeedbackType
|
||||
|
||||
|
||||
class ExpertsResponse(BaseModel):
|
||||
experts: list[str]
|
||||
|
||||
|
||||
class RetrievalDocs(BaseModel):
|
||||
top_documents: list[SearchDoc]
|
||||
|
||||
|
@ -60,3 +60,13 @@ def shared_precompare_cleanup(text: str) -> str:
|
||||
text = re.sub(r'\s|\*|\\"|[.,:`"#-]', "", text)
|
||||
|
||||
return text
|
||||
|
||||
|
||||
def is_valid_email(text: str) -> bool:
    """Return True if *text* looks like an email address.

    Intentionally lightweight — can use a library instead if more
    detailed checks are needed.
    """
    regex = r"^[a-zA-Z0-9._-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$"
    # re.match already anchors at the start; return the match result as a
    # bool directly instead of an if/else returning True/False.
    return re.match(regex, text) is not None
|
||||
|
Loading…
x
Reference in New Issue
Block a user