Expert Recommendation Heuristic Only (#791)

This commit is contained in:
Yuhong Sun 2023-11-29 15:53:57 -08:00 committed by GitHub
parent 006fd4c438
commit fda89ac810
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 62 additions and 0 deletions

View File

@ -23,6 +23,7 @@ from danswer.direct_qa.interfaces import StreamingError
from danswer.direct_qa.models import LLMMetricsContainer
from danswer.direct_qa.qa_utils import get_chunks_for_qa
from danswer.document_index.factory import get_default_document_index
from danswer.expert_recommendation.heuristics_based import extract_experts
from danswer.indexing.models import InferenceChunk
from danswer.search.models import QueryFlow
from danswer.search.models import RerankMetricsContainer
@ -33,6 +34,7 @@ from danswer.search.search_runner import chunks_to_search_docs
from danswer.search.search_runner import full_chunk_search
from danswer.search.search_runner import full_chunk_search_generator
from danswer.secondary_llm_flows.answer_validation import get_answer_validity
from danswer.server.models import ExpertsResponse
from danswer.server.models import LLMRelevanceFilterResponse
from danswer.server.models import NewMessageRequest
from danswer.server.models import QADocsResponse
@ -241,6 +243,11 @@ def answer_qa_query_stream(
# first fetch and return to the UI the top chunks so the user can
# immediately see some results
top_chunks = cast(list[InferenceChunk], next(search_generator))
expert_emails = extract_experts(top_chunks)
expert_response = ExpertsResponse(experts=expert_emails).dict()
yield get_json_line(expert_response)
top_docs = chunks_to_search_docs(top_chunks)
initial_response = QADocsResponse(
top_documents=top_docs,

View File

@ -500,6 +500,8 @@ def _vespa_hit_to_inference_chunk(hit: dict[str, Any]) -> InferenceChunk:
recency_bias=fields["matchfeatures"][RECENCY_BIAS],
score=hit["relevance"],
hidden=fields.get(HIDDEN, False),
primary_owners=fields.get(PRIMARY_OWNERS),
secondary_owners=fields.get(SECONDARY_OWNERS),
metadata=metadata,
match_highlights=match_highlights,
updated_at=updated_at,
@ -558,6 +560,8 @@ class VespaIndex(DocumentIndex):
f"{BOOST}, "
f"{HIDDEN}, "
f"{DOC_UPDATED_AT}, "
f"{PRIMARY_OWNERS}, "
f"{SECONDARY_OWNERS}, "
f"{METADATA}, "
f"{CONTENT_SUMMARY} "
f"from {DOCUMENT_INDEX_NAME} where "

View File

@ -0,0 +1,35 @@
from collections import defaultdict
from danswer.indexing.models import InferenceChunk
from danswer.utils.text_processing import is_valid_email
# What is the minimum cumulative score for a user to be considered an Expert
# If a set of 50 results is shown, user needs a cumulative doc score of 2.5 to be an expert
_EXPERT_SCORE_RATIO = 2.5 / 50
# How much should a score be discounted if the user is not the primary owner
_SECONDARY_OWNER_DISCOUNT = 0.5
def extract_experts(
    chunks: list[InferenceChunk], score_ratio: float = _EXPERT_SCORE_RATIO
) -> list[str]:
    """Identify likely experts from the owners of the retrieved chunks.

    Each chunk's relevance score is credited in full to its primary owners
    and at a discount (``_SECONDARY_OWNER_DISCOUNT``) to its secondary
    owners. Owners whose cumulative score reaches
    ``score_ratio * len(chunks)`` and whose identifier looks like an email
    address are returned.

    Args:
        chunks: Retrieved chunks, each optionally carrying owner lists and
            a relevance score.
        score_ratio: Minimum average per-chunk score an owner must
            accumulate to qualify as an expert.

    Returns:
        Email addresses of qualifying owners, in no particular order.
    """
    # Threshold scales with the number of results considered.
    target_score = score_ratio * len(chunks)
    expert_scores: dict[str, float] = defaultdict(float)
    for chunk in chunks:
        # A missing/zero score contributes nothing; check once per chunk
        # instead of re-testing inside each owner loop.
        if not chunk.score:
            continue
        for p_owner in chunk.primary_owners or []:
            expert_scores[p_owner] += chunk.score
        for s_owner in chunk.secondary_owners or []:
            expert_scores[s_owner] += _SECONDARY_OWNER_DISCOUNT * chunk.score
    return [
        owner
        for owner, score in expert_scores.items()
        if score >= target_score and is_valid_email(owner)
    ]

View File

@ -93,6 +93,8 @@ class InferenceChunk(BaseChunk):
match_highlights: list[str]
# when the doc was last updated
updated_at: datetime | None
primary_owners: list[str] | None = None
secondary_owners: list[str] | None = None
@property
def unique_id(self) -> str:

View File

@ -202,6 +202,10 @@ class SearchFeedbackRequest(BaseModel):
search_feedback: SearchFeedbackType
class ExpertsResponse(BaseModel):
    """Streamed response payload listing recommended expert email addresses."""

    # Emails of users deemed experts for the query (see extract_experts).
    experts: list[str]
class RetrievalDocs(BaseModel):
top_documents: list[SearchDoc]

View File

@ -60,3 +60,13 @@ def shared_precompare_cleanup(text: str) -> str:
text = re.sub(r'\s|\*|\\"|[.,:`"#-]', "", text)
return text
def is_valid_email(text: str) -> bool:
    """Return True if *text* looks like a plausible email address.

    Intentionally lenient: a single local@domain.tld shape check rather
    than full RFC 5322 validation. Can use a library instead if more
    detailed checks are needed.
    """
    # "+" is included in the local part so common tagged addresses such as
    # "user+tag@example.com" are accepted.
    regex = r"^[a-zA-Z0-9._+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$"
    # re.match with the ^...$ anchors gives a full-string match; convert
    # the Match-or-None result to a plain bool instead of an if/else.
    return re.match(regex, text) is not None