diff --git a/backend/danswer/direct_qa/answer_question.py b/backend/danswer/direct_qa/answer_question.py
index 6c8af0b131..18736b8db1 100644
--- a/backend/danswer/direct_qa/answer_question.py
+++ b/backend/danswer/direct_qa/answer_question.py
@@ -23,6 +23,7 @@ from danswer.direct_qa.interfaces import StreamingError
 from danswer.direct_qa.models import LLMMetricsContainer
 from danswer.direct_qa.qa_utils import get_chunks_for_qa
 from danswer.document_index.factory import get_default_document_index
+from danswer.expert_recommendation.heuristics_based import extract_experts
 from danswer.indexing.models import InferenceChunk
 from danswer.search.models import QueryFlow
 from danswer.search.models import RerankMetricsContainer
@@ -33,6 +34,7 @@ from danswer.search.search_runner import chunks_to_search_docs
 from danswer.search.search_runner import full_chunk_search
 from danswer.search.search_runner import full_chunk_search_generator
 from danswer.secondary_llm_flows.answer_validation import get_answer_validity
+from danswer.server.models import ExpertsResponse
 from danswer.server.models import LLMRelevanceFilterResponse
 from danswer.server.models import NewMessageRequest
 from danswer.server.models import QADocsResponse
@@ -241,6 +243,11 @@ def answer_qa_query_stream(
     # first fetch and return to the UI the top chunks so the user can
     # immediately see some results
     top_chunks = cast(list[InferenceChunk], next(search_generator))
+
+    expert_emails = extract_experts(top_chunks)
+    expert_response = ExpertsResponse(experts=expert_emails).dict()
+    yield get_json_line(expert_response)
+
     top_docs = chunks_to_search_docs(top_chunks)
     initial_response = QADocsResponse(
         top_documents=top_docs,
diff --git a/backend/danswer/document_index/vespa/index.py b/backend/danswer/document_index/vespa/index.py
index d82c59eb6c..54039acdd8 100644
--- a/backend/danswer/document_index/vespa/index.py
+++ b/backend/danswer/document_index/vespa/index.py
@@ -500,6 +500,8 @@ def _vespa_hit_to_inference_chunk(hit: dict[str, Any]) -> InferenceChunk:
         recency_bias=fields["matchfeatures"][RECENCY_BIAS],
         score=hit["relevance"],
         hidden=fields.get(HIDDEN, False),
+        primary_owners=fields.get(PRIMARY_OWNERS),
+        secondary_owners=fields.get(SECONDARY_OWNERS),
         metadata=metadata,
         match_highlights=match_highlights,
         updated_at=updated_at,
@@ -558,6 +560,8 @@ class VespaIndex(DocumentIndex):
         f"{BOOST}, "
         f"{HIDDEN}, "
         f"{DOC_UPDATED_AT}, "
+        f"{PRIMARY_OWNERS}, "
+        f"{SECONDARY_OWNERS}, "
         f"{METADATA}, "
         f"{CONTENT_SUMMARY} "
         f"from {DOCUMENT_INDEX_NAME} where "
diff --git a/backend/danswer/expert_recommendation/__init__.py b/backend/danswer/expert_recommendation/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/backend/danswer/expert_recommendation/heuristics_based.py b/backend/danswer/expert_recommendation/heuristics_based.py
new file mode 100644
index 0000000000..9f2bdb4b2d
--- /dev/null
+++ b/backend/danswer/expert_recommendation/heuristics_based.py
@@ -0,0 +1,39 @@
+from collections import defaultdict
+
+from danswer.indexing.models import InferenceChunk
+from danswer.utils.text_processing import is_valid_email
+
+# What is the minimum cumulative score for a user to be considered an Expert
+# If a set of 50 results is shown, user needs a cumulative doc score of 2.5 to be an expert
+_EXPERT_SCORE_RATIO = 2.5 / 50
+# How much should a score be discounted if the user is not the primary owner
+_SECONDARY_OWNER_DISCOUNT = 0.5
+
+
+def extract_experts(
+    chunks: list[InferenceChunk], score_ratio: float = _EXPERT_SCORE_RATIO
+) -> list[str]:
+    """Aggregate per-owner relevance scores across the retrieved chunks and
+    return the email addresses of owners whose cumulative score meets the
+    expert threshold (scaled by the number of results shown)."""
+    target_score = score_ratio * len(chunks)
+
+    expert_scores: dict[str, float] = defaultdict(float)
+
+    for chunk in chunks:
+        # A chunk without a relevance score cannot contribute to any owner
+        if not chunk.score:
+            continue
+
+        for p_owner in chunk.primary_owners or []:
+            expert_scores[p_owner] += chunk.score
+
+        # Secondary ownership counts for less than primary ownership
+        for s_owner in chunk.secondary_owners or []:
+            expert_scores[s_owner] += _SECONDARY_OWNER_DISCOUNT * chunk.score
+
+    return [
+        owner
+        for owner, score in expert_scores.items()
+        if score >= target_score and is_valid_email(owner)
+    ]
diff --git a/backend/danswer/indexing/models.py b/backend/danswer/indexing/models.py
index 8c9bc0e60c..c879ea8cd9 100644
--- a/backend/danswer/indexing/models.py
+++ b/backend/danswer/indexing/models.py
@@ -93,6 +93,8 @@ class InferenceChunk(BaseChunk):
     match_highlights: list[str]
     # when the doc was last updated
     updated_at: datetime | None
+    primary_owners: list[str] | None = None
+    secondary_owners: list[str] | None = None
 
     @property
     def unique_id(self) -> str:
diff --git a/backend/danswer/server/models.py b/backend/danswer/server/models.py
index d4bf75ee66..5e53453e7c 100644
--- a/backend/danswer/server/models.py
+++ b/backend/danswer/server/models.py
@@ -202,6 +202,10 @@ class SearchFeedbackRequest(BaseModel):
     search_feedback: SearchFeedbackType
 
 
+class ExpertsResponse(BaseModel):
+    experts: list[str]
+
+
 class RetrievalDocs(BaseModel):
     top_documents: list[SearchDoc]
 
diff --git a/backend/danswer/utils/text_processing.py b/backend/danswer/utils/text_processing.py
index e1ea34c17d..dcbbb6c5f7 100644
--- a/backend/danswer/utils/text_processing.py
+++ b/backend/danswer/utils/text_processing.py
@@ -60,3 +60,12 @@ def shared_precompare_cleanup(text: str) -> str:
     text = re.sub(r'\s|\*|\\"|[.,:`"#-]', "", text)
 
     return text
+
+
+def is_valid_email(text: str) -> bool:
+    """Can use a library instead if more detailed checks are needed"""
+    # '+' is allowed in the local part so plus-addressed emails
+    # (e.g. user+tag@example.com) are accepted
+    regex = r"^[a-zA-Z0-9._+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$"
+
+    return bool(re.match(regex, text))