diff --git a/backend/danswer/background/indexing/checkpointing.py b/backend/danswer/background/indexing/checkpointing.py index 673540b0e..d1d4c8cf4 100644 --- a/backend/danswer/background/indexing/checkpointing.py +++ b/backend/danswer/background/indexing/checkpointing.py @@ -1,11 +1,11 @@ """Experimental functionality related to splitting up indexing -into a series of checkpoints to better handle intermmittent failures +into a series of checkpoints to better handle intermittent failures / jobs being killed by cloud providers.""" import datetime from danswer.configs.app_configs import EXPERIMENTAL_CHECKPOINTING_ENABLED from danswer.configs.constants import DocumentSource -from danswer.connectors.cross_connector_utils.time_utils import datetime_to_utc +from danswer.connectors.cross_connector_utils.miscellaneous_utils import datetime_to_utc def _2010_dt() -> datetime.datetime: diff --git a/backend/danswer/connectors/confluence/connector.py b/backend/danswer/connectors/confluence/connector.py index 4f017f036..2438fa9d6 100644 --- a/backend/danswer/connectors/confluence/connector.py +++ b/backend/danswer/connectors/confluence/connector.py @@ -18,6 +18,7 @@ from danswer.connectors.interfaces import GenerateDocumentsOutput from danswer.connectors.interfaces import LoadConnector from danswer.connectors.interfaces import PollConnector from danswer.connectors.interfaces import SecondsSinceUnixEpoch +from danswer.connectors.models import BasicExpertInfo from danswer.connectors.models import ConnectorMissingCredentialError from danswer.connectors.models import Document from danswer.connectors.models import Section @@ -294,7 +295,9 @@ class ConfluenceConnector(LoadConnector, PollConnector): source=DocumentSource.CONFLUENCE, semantic_identifier=page["title"], doc_updated_at=last_modified, - primary_owners=[author] if author else None, + primary_owners=[BasicExpertInfo(email=author)] + if author + else None, metadata={ "Wiki Space Name": self.space, }, diff --git 
a/backend/danswer/connectors/cross_connector_utils/miscellaneous_utils.py b/backend/danswer/connectors/cross_connector_utils/miscellaneous_utils.py new file mode 100644 index 000000000..10c831560 --- /dev/null +++ b/backend/danswer/connectors/cross_connector_utils/miscellaneous_utils.py @@ -0,0 +1,45 @@ +from datetime import datetime +from datetime import timezone + +from dateutil.parser import parse + +from danswer.connectors.models import BasicExpertInfo +from danswer.utils.text_processing import is_valid_email + + +def datetime_to_utc(dt: datetime) -> datetime: + if dt.tzinfo is None or dt.tzinfo.utcoffset(dt) is None: + dt = dt.replace(tzinfo=timezone.utc) + + return dt.astimezone(timezone.utc) + + +def time_str_to_utc(datetime_str: str) -> datetime: + dt = parse(datetime_str) + return datetime_to_utc(dt) + + +def basic_expert_info_representation(info: BasicExpertInfo) -> str | None: + if info.first_name and info.last_name: + return f"{info.first_name} {info.middle_initial} {info.last_name}" + + if info.display_name: + return info.display_name + + if info.email and is_valid_email(info.email): + return info.email + + if info.first_name: + return info.first_name + + return None + + +def get_experts_stores_representations( + experts: list[BasicExpertInfo] | None, +) -> list[str] | None: + if not experts: + return None + + reps = [basic_expert_info_representation(owner) for owner in experts] + return [owner for owner in reps if owner is not None] diff --git a/backend/danswer/connectors/cross_connector_utils/time_utils.py b/backend/danswer/connectors/cross_connector_utils/time_utils.py deleted file mode 100644 index bab6d2e1f..000000000 --- a/backend/danswer/connectors/cross_connector_utils/time_utils.py +++ /dev/null @@ -1,16 +0,0 @@ -from datetime import datetime -from datetime import timezone - -from dateutil.parser import parse - - -def datetime_to_utc(dt: datetime) -> datetime: - if dt.tzinfo is None or dt.tzinfo.utcoffset(dt) is None: - dt = 
dt.replace(tzinfo=timezone.utc) - - return dt.astimezone(timezone.utc) - - -def time_str_to_utc(datetime_str: str) -> datetime: - dt = parse(datetime_str) - return datetime_to_utc(dt) diff --git a/backend/danswer/connectors/file/connector.py b/backend/danswer/connectors/file/connector.py index 8d73fe95e..fe570d55d 100644 --- a/backend/danswer/connectors/file/connector.py +++ b/backend/danswer/connectors/file/connector.py @@ -12,7 +12,7 @@ from danswer.connectors.cross_connector_utils.file_utils import detect_encoding from danswer.connectors.cross_connector_utils.file_utils import load_files_from_zip from danswer.connectors.cross_connector_utils.file_utils import read_file from danswer.connectors.cross_connector_utils.file_utils import read_pdf_file -from danswer.connectors.cross_connector_utils.time_utils import time_str_to_utc +from danswer.connectors.cross_connector_utils.miscellaneous_utils import time_str_to_utc from danswer.connectors.file.utils import check_file_ext_is_valid from danswer.connectors.file.utils import get_file_ext from danswer.connectors.interfaces import GenerateDocumentsOutput diff --git a/backend/danswer/connectors/guru/connector.py b/backend/danswer/connectors/guru/connector.py index d60ccc50c..dff2c366e 100644 --- a/backend/danswer/connectors/guru/connector.py +++ b/backend/danswer/connectors/guru/connector.py @@ -8,7 +8,7 @@ import requests from danswer.configs.app_configs import INDEX_BATCH_SIZE from danswer.configs.constants import DocumentSource from danswer.connectors.cross_connector_utils.html_utils import parse_html_page_basic -from danswer.connectors.cross_connector_utils.time_utils import time_str_to_utc +from danswer.connectors.cross_connector_utils.miscellaneous_utils import time_str_to_utc from danswer.connectors.interfaces import GenerateDocumentsOutput from danswer.connectors.interfaces import LoadConnector from danswer.connectors.interfaces import PollConnector diff --git a/backend/danswer/connectors/models.py 
b/backend/danswer/connectors/models.py index 4a9eac0d2..871382f49 100644 --- a/backend/danswer/connectors/models.py +++ b/backend/danswer/connectors/models.py @@ -27,6 +27,22 @@ class Section(BaseModel): link: str | None +class BasicExpertInfo(BaseModel): + """Basic information for the owner of a document; any of the fields can be left as None + Display fallback goes as follows: + - first_name + (optional middle_initial) + last_name + - display_name + - email + - first_name + """ + + display_name: str | None = None + first_name: str | None = None + middle_initial: str | None = None + last_name: str | None = None + email: str | None = None + + class DocumentBase(BaseModel): """Used for Danswer ingestion api, the ID is inferred before use if not provided""" @@ -38,9 +54,9 @@ class DocumentBase(BaseModel): # UTC time doc_updated_at: datetime | None = None # Owner, creator, etc. - primary_owners: list[str] | None = None + primary_owners: list[BasicExpertInfo] | None = None # Assignee, space owner, etc.
- secondary_owners: list[str] | None = None + secondary_owners: list[BasicExpertInfo] | None = None # `title` is used when computing best matches for a query # if `None`, then we will use the `semantic_identifier` as the title in Vespa title: str | None = None diff --git a/backend/danswer/connectors/productboard/connector.py b/backend/danswer/connectors/productboard/connector.py index 14a54c1ac..c5003951b 100644 --- a/backend/danswer/connectors/productboard/connector.py +++ b/backend/danswer/connectors/productboard/connector.py @@ -10,7 +10,7 @@ from retry import retry from danswer.configs.app_configs import INDEX_BATCH_SIZE from danswer.configs.constants import DocumentSource -from danswer.connectors.cross_connector_utils.time_utils import time_str_to_utc +from danswer.connectors.cross_connector_utils.miscellaneous_utils import time_str_to_utc from danswer.connectors.interfaces import GenerateDocumentsOutput from danswer.connectors.interfaces import PollConnector from danswer.connectors.interfaces import SecondsSinceUnixEpoch diff --git a/backend/danswer/direct_qa/answer_question.py b/backend/danswer/direct_qa/answer_question.py index 18736b8db..adf62ceb6 100644 --- a/backend/danswer/direct_qa/answer_question.py +++ b/backend/danswer/direct_qa/answer_question.py @@ -23,7 +23,6 @@ from danswer.direct_qa.interfaces import StreamingError from danswer.direct_qa.models import LLMMetricsContainer from danswer.direct_qa.qa_utils import get_chunks_for_qa from danswer.document_index.factory import get_default_document_index -from danswer.expert_recommendation.heuristics_based import extract_experts from danswer.indexing.models import InferenceChunk from danswer.search.models import QueryFlow from danswer.search.models import RerankMetricsContainer @@ -34,7 +33,6 @@ from danswer.search.search_runner import chunks_to_search_docs from danswer.search.search_runner import full_chunk_search from danswer.search.search_runner import full_chunk_search_generator from 
danswer.secondary_llm_flows.answer_validation import get_answer_validity -from danswer.server.models import ExpertsResponse from danswer.server.models import LLMRelevanceFilterResponse from danswer.server.models import NewMessageRequest from danswer.server.models import QADocsResponse @@ -244,10 +242,6 @@ def answer_qa_query_stream( # immediately see some results top_chunks = cast(list[InferenceChunk], next(search_generator)) - expert_emails = extract_experts(top_chunks) - expert_response = ExpertsResponse(experts=expert_emails).dict() - yield get_json_line(expert_response) - top_docs = chunks_to_search_docs(top_chunks) initial_response = QADocsResponse( top_documents=top_docs, diff --git a/backend/danswer/document_index/vespa/index.py b/backend/danswer/document_index/vespa/index.py index 54039acdd..e4e299220 100644 --- a/backend/danswer/document_index/vespa/index.py +++ b/backend/danswer/document_index/vespa/index.py @@ -47,6 +47,9 @@ from danswer.configs.constants import SOURCE_LINKS from danswer.configs.constants import SOURCE_TYPE from danswer.configs.constants import TITLE from danswer.configs.model_configs import SEARCH_DISTANCE_CUTOFF +from danswer.connectors.cross_connector_utils.miscellaneous_utils import ( + get_experts_stores_representations, +) from danswer.document_index.document_index_utils import get_uuid_from_chunk from danswer.document_index.interfaces import DocumentIndex from danswer.document_index.interfaces import DocumentInsertionRecord @@ -240,8 +243,8 @@ def _index_vespa_chunk(chunk: DocMetadataAwareIndexChunk) -> None: EMBEDDINGS: embeddings_name_vector_map, BOOST: DEFAULT_BOOST, DOC_UPDATED_AT: _vespa_get_updated_at_attribute(document.doc_updated_at), - PRIMARY_OWNERS: document.primary_owners, - SECONDARY_OWNERS: document.secondary_owners, + PRIMARY_OWNERS: get_experts_stores_representations(document.primary_owners), + SECONDARY_OWNERS: get_experts_stores_representations(document.secondary_owners), # the only `set` vespa has is 
`weightedset`, so we have to give each # element an arbitrary weight ACCESS_CONTROL_LIST: {acl_entry: 1 for acl_entry in chunk.access.to_acl()}, diff --git a/backend/danswer/expert_recommendation/__init__.py b/backend/danswer/expert_recommendation/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/backend/danswer/expert_recommendation/heuristics_based.py b/backend/danswer/expert_recommendation/heuristics_based.py deleted file mode 100644 index 9f2bdb4b2..000000000 --- a/backend/danswer/expert_recommendation/heuristics_based.py +++ /dev/null @@ -1,35 +0,0 @@ -from collections import defaultdict - -from danswer.indexing.models import InferenceChunk -from danswer.utils.text_processing import is_valid_email - -# What is the minimum cumulative score for a user to be considered an Expert -# If a set of 50 results is shown, user needs a cumulative doc score of 2.5 to be an expert -_EXPERT_SCORE_RATIO = 2.5 / 50 -# How much should a score be discounted if the user is not the primary owner -_SECONDARY_OWNER_DISCOUNT = 0.5 - - -def extract_experts( - chunks: list[InferenceChunk], score_ratio: float = _EXPERT_SCORE_RATIO -) -> list[str]: - target_score = score_ratio * len(chunks) - - expert_scores: dict[str, float] = defaultdict(float) - - for chunk in chunks: - if chunk.primary_owners: - for p_owner in chunk.primary_owners: - if chunk.score: - expert_scores[p_owner] += chunk.score - - if chunk.secondary_owners: - for s_owner in chunk.secondary_owners: - if chunk.score: - expert_scores[s_owner] += _SECONDARY_OWNER_DISCOUNT * chunk.score - - return [ - owner - for owner, score in expert_scores.items() - if score >= target_score and is_valid_email(owner) - ] diff --git a/backend/danswer/indexing/indexing_pipeline.py b/backend/danswer/indexing/indexing_pipeline.py index f94e637e1..dbf13bc80 100644 --- a/backend/danswer/indexing/indexing_pipeline.py +++ b/backend/danswer/indexing/indexing_pipeline.py @@ -5,6 +5,9 @@ from typing import Protocol from 
sqlalchemy.orm import Session from danswer.access.access import get_access_for_documents +from danswer.connectors.cross_connector_utils.miscellaneous_utils import ( + get_experts_stores_representations, +) from danswer.connectors.models import Document from danswer.connectors.models import IndexAttemptMetadata from danswer.db.document import get_documents_by_ids @@ -50,8 +53,8 @@ def upsert_documents_in_db( document_id=doc.id, semantic_identifier=doc.semantic_identifier, first_link=first_link, - primary_owners=doc.primary_owners, - secondary_owners=doc.secondary_owners, + primary_owners=get_experts_stores_representations(doc.primary_owners), + secondary_owners=get_experts_stores_representations(doc.secondary_owners), from_ingestion_api=doc.from_ingestion_api, ) doc_m_batch.append(db_doc_metadata) diff --git a/backend/danswer/server/models.py b/backend/danswer/server/models.py index 5e53453e7..d4bf75ee6 100644 --- a/backend/danswer/server/models.py +++ b/backend/danswer/server/models.py @@ -202,10 +202,6 @@ class SearchFeedbackRequest(BaseModel): search_feedback: SearchFeedbackType -class ExpertsResponse(BaseModel): - experts: list[str] - - class RetrievalDocs(BaseModel): top_documents: list[SearchDoc]