Expert Recommendation Heuristic Only (#791)

This commit is contained in:
Yuhong Sun 2023-11-29 15:53:57 -08:00 committed by GitHub
parent 006fd4c438
commit fda89ac810
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 62 additions and 0 deletions

View File

@ -23,6 +23,7 @@ from danswer.direct_qa.interfaces import StreamingError
from danswer.direct_qa.models import LLMMetricsContainer
from danswer.direct_qa.qa_utils import get_chunks_for_qa
from danswer.document_index.factory import get_default_document_index
from danswer.expert_recommendation.heuristics_based import extract_experts
from danswer.indexing.models import InferenceChunk
from danswer.search.models import QueryFlow
from danswer.search.models import RerankMetricsContainer
@ -33,6 +34,7 @@ from danswer.search.search_runner import chunks_to_search_docs
from danswer.search.search_runner import full_chunk_search
from danswer.search.search_runner import full_chunk_search_generator
from danswer.secondary_llm_flows.answer_validation import get_answer_validity
from danswer.server.models import ExpertsResponse
from danswer.server.models import LLMRelevanceFilterResponse
from danswer.server.models import NewMessageRequest
from danswer.server.models import QADocsResponse
@ -241,6 +243,11 @@ def answer_qa_query_stream(
# first fetch and return to the UI the top chunks so the user can
# immediately see some results
top_chunks = cast(list[InferenceChunk], next(search_generator))
expert_emails = extract_experts(top_chunks)
expert_response = ExpertsResponse(experts=expert_emails).dict()
yield get_json_line(expert_response)
top_docs = chunks_to_search_docs(top_chunks)
initial_response = QADocsResponse(
top_documents=top_docs,

View File

@ -500,6 +500,8 @@ def _vespa_hit_to_inference_chunk(hit: dict[str, Any]) -> InferenceChunk:
recency_bias=fields["matchfeatures"][RECENCY_BIAS],
score=hit["relevance"],
hidden=fields.get(HIDDEN, False),
primary_owners=fields.get(PRIMARY_OWNERS),
secondary_owners=fields.get(SECONDARY_OWNERS),
metadata=metadata,
match_highlights=match_highlights,
updated_at=updated_at,
@ -558,6 +560,8 @@ class VespaIndex(DocumentIndex):
f"{BOOST}, "
f"{HIDDEN}, "
f"{DOC_UPDATED_AT}, "
f"{PRIMARY_OWNERS}, "
f"{SECONDARY_OWNERS}, "
f"{METADATA}, "
f"{CONTENT_SUMMARY} "
f"from {DOCUMENT_INDEX_NAME} where "

View File

@ -0,0 +1,35 @@
from collections import defaultdict
from danswer.indexing.models import InferenceChunk
from danswer.utils.text_processing import is_valid_email
# What is the minimum cumulative score for a user to be considered an Expert
# If a set of 50 results is shown, user needs a cumulative doc score of 2.5 to be an expert
_EXPERT_SCORE_RATIO = 2.5 / 50
# How much should a score be discounted if the user is not the primary owner
_SECONDARY_OWNER_DISCOUNT = 0.5
def extract_experts(
    chunks: list[InferenceChunk], score_ratio: float = _EXPERT_SCORE_RATIO
) -> list[str]:
    """Identify likely experts from the owners of the retrieved chunks.

    Each chunk's relevance score is credited in full to its primary owners
    and at a discount (``_SECONDARY_OWNER_DISCOUNT``) to its secondary
    owners. Owners whose cumulative score reaches
    ``score_ratio * len(chunks)`` and whose identifier looks like an email
    address are returned.

    Args:
        chunks: Retrieved chunks, each optionally carrying owner lists and
            a relevance score.
        score_ratio: Minimum average per-chunk score an owner must
            accumulate to qualify as an expert.

    Returns:
        Email addresses of qualifying owners, in no particular order.
    """
    # Threshold scales with the number of results considered.
    target_score = score_ratio * len(chunks)
    expert_scores: dict[str, float] = defaultdict(float)
    for chunk in chunks:
        # A missing/zero score contributes nothing; check once per chunk
        # instead of re-testing inside each owner loop.
        if not chunk.score:
            continue
        for p_owner in chunk.primary_owners or []:
            expert_scores[p_owner] += chunk.score
        for s_owner in chunk.secondary_owners or []:
            expert_scores[s_owner] += _SECONDARY_OWNER_DISCOUNT * chunk.score
    return [
        owner
        for owner, score in expert_scores.items()
        if score >= target_score and is_valid_email(owner)
    ]

View File

@ -93,6 +93,8 @@ class InferenceChunk(BaseChunk):
match_highlights: list[str]
# when the doc was last updated
updated_at: datetime | None
primary_owners: list[str] | None = None
secondary_owners: list[str] | None = None
@property
def unique_id(self) -> str:

View File

@ -202,6 +202,10 @@ class SearchFeedbackRequest(BaseModel):
search_feedback: SearchFeedbackType
class ExpertsResponse(BaseModel):
    """Streamed response payload listing recommended expert email addresses."""

    # Emails of users deemed experts for the query (see extract_experts).
    experts: list[str]
class RetrievalDocs(BaseModel):
top_documents: list[SearchDoc]

View File

@ -60,3 +60,13 @@ def shared_precompare_cleanup(text: str) -> str:
text = re.sub(r'\s|\*|\\"|[.,:`"#-]', "", text)
return text
def is_valid_email(text: str) -> bool:
    """Return True if *text* looks like a plausible email address.

    Intentionally lenient: a single local@domain.tld shape check rather
    than full RFC 5322 validation. Can use a library instead if more
    detailed checks are needed.
    """
    # "+" is included in the local part so common tagged addresses such as
    # "user+tag@example.com" are accepted.
    regex = r"^[a-zA-Z0-9._+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$"
    # re.match with the ^...$ anchors gives a full-string match; convert
    # the Match-or-None result to a plain bool instead of an if/else.
    return re.match(regex, text) is not None