Update connector interface with optional Owners information (#798)

This commit is contained in:
Yuhong Sun 2023-11-30 23:08:16 -08:00 committed by GitHub
parent 17c2f06338
commit 8020db9e9a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
14 changed files with 82 additions and 73 deletions

View File

@@ -1,11 +1,11 @@
"""Experimental functionality related to splitting up indexing
into a series of checkpoints to better handle intermmittent failures
into a series of checkpoints to better handle intermittent failures
/ jobs being killed by cloud providers."""
import datetime
from danswer.configs.app_configs import EXPERIMENTAL_CHECKPOINTING_ENABLED
from danswer.configs.constants import DocumentSource
from danswer.connectors.cross_connector_utils.time_utils import datetime_to_utc
from danswer.connectors.cross_connector_utils.miscellaneous_utils import datetime_to_utc
def _2010_dt() -> datetime.datetime:

View File

@@ -18,6 +18,7 @@ from danswer.connectors.interfaces import GenerateDocumentsOutput
from danswer.connectors.interfaces import LoadConnector
from danswer.connectors.interfaces import PollConnector
from danswer.connectors.interfaces import SecondsSinceUnixEpoch
from danswer.connectors.models import BasicExpertInfo
from danswer.connectors.models import ConnectorMissingCredentialError
from danswer.connectors.models import Document
from danswer.connectors.models import Section
@@ -294,7 +295,9 @@ class ConfluenceConnector(LoadConnector, PollConnector):
source=DocumentSource.CONFLUENCE,
semantic_identifier=page["title"],
doc_updated_at=last_modified,
primary_owners=[author] if author else None,
primary_owners=[BasicExpertInfo(email=author)]
if author
else None,
metadata={
"Wiki Space Name": self.space,
},

View File

@@ -0,0 +1,45 @@
from datetime import datetime
from datetime import timezone
from dateutil.parser import parse
from danswer.connectors.models import BasicExpertInfo
from danswer.utils.text_processing import is_valid_email
def datetime_to_utc(dt: datetime) -> datetime:
    """Normalize *dt* to a timezone-aware UTC datetime.

    Naive datetimes (no tzinfo, or a tzinfo that yields no UTC offset)
    are assumed to already represent UTC; aware ones are converted.
    """
    offset_known = dt.tzinfo is not None and dt.tzinfo.utcoffset(dt) is not None
    normalized = dt if offset_known else dt.replace(tzinfo=timezone.utc)
    return normalized.astimezone(timezone.utc)
def time_str_to_utc(datetime_str: str) -> datetime:
    """Parse a datetime string (via dateutil) and normalize it to UTC."""
    return datetime_to_utc(parse(datetime_str))
def basic_expert_info_representation(info: BasicExpertInfo) -> str | None:
    """Build a single display string for an expert, or None if nothing usable.

    Fallback order (mirrors the BasicExpertInfo docstring):
    full name (with optional middle initial), display name, valid email,
    bare first name.
    """
    if info.first_name and info.last_name:
        # Only interpolate the middle initial when it is actually set;
        # otherwise the f-string would render a literal "None" in the name.
        if info.middle_initial:
            return f"{info.first_name} {info.middle_initial} {info.last_name}"
        return f"{info.first_name} {info.last_name}"

    if info.display_name:
        return info.display_name

    if info.email and is_valid_email(info.email):
        return info.email

    if info.first_name:
        return info.first_name

    return None
def get_experts_stores_representations(
    experts: list[BasicExpertInfo] | None,
) -> list[str] | None:
    """Map each expert to its display string, dropping unrepresentable ones.

    Returns None when no experts are provided (None or empty list).
    """
    if not experts:
        return None
    representations = (basic_expert_info_representation(expert) for expert in experts)
    return [rep for rep in representations if rep is not None]

View File

@@ -1,16 +0,0 @@
from datetime import datetime
from datetime import timezone
from dateutil.parser import parse
def datetime_to_utc(dt: datetime) -> datetime:
    """Return *dt* as an aware UTC datetime; naive inputs are assumed to be UTC."""
    # A tzinfo that returns no utcoffset is treated the same as no tzinfo at all.
    if dt.tzinfo is None or dt.tzinfo.utcoffset(dt) is None:
        dt = dt.replace(tzinfo=timezone.utc)
    return dt.astimezone(timezone.utc)
def time_str_to_utc(datetime_str: str) -> datetime:
    """Parse *datetime_str* with dateutil and normalize the result to UTC."""
    dt = parse(datetime_str)
    return datetime_to_utc(dt)

View File

@@ -12,7 +12,7 @@ from danswer.connectors.cross_connector_utils.file_utils import detect_encoding
from danswer.connectors.cross_connector_utils.file_utils import load_files_from_zip
from danswer.connectors.cross_connector_utils.file_utils import read_file
from danswer.connectors.cross_connector_utils.file_utils import read_pdf_file
from danswer.connectors.cross_connector_utils.time_utils import time_str_to_utc
from danswer.connectors.cross_connector_utils.miscellaneous_utils import time_str_to_utc
from danswer.connectors.file.utils import check_file_ext_is_valid
from danswer.connectors.file.utils import get_file_ext
from danswer.connectors.interfaces import GenerateDocumentsOutput

View File

@@ -8,7 +8,7 @@ import requests
from danswer.configs.app_configs import INDEX_BATCH_SIZE
from danswer.configs.constants import DocumentSource
from danswer.connectors.cross_connector_utils.html_utils import parse_html_page_basic
from danswer.connectors.cross_connector_utils.time_utils import time_str_to_utc
from danswer.connectors.cross_connector_utils.miscellaneous_utils import time_str_to_utc
from danswer.connectors.interfaces import GenerateDocumentsOutput
from danswer.connectors.interfaces import LoadConnector
from danswer.connectors.interfaces import PollConnector

View File

@@ -27,6 +27,22 @@ class Section(BaseModel):
link: str | None
class BasicExpertInfo(BaseModel):
    """Basic Information for the owner of a document, any of the fields can be left as None

    Display fallback goes as follows:
    - first_name + (optional middle_initial) + last_name
    - display_name
    - email
    - first_name
    """

    # Free-form name used when the structured name parts are unavailable
    display_name: str | None = None
    first_name: str | None = None
    middle_initial: str | None = None
    last_name: str | None = None
    email: str | None = None
class DocumentBase(BaseModel):
"""Used for Danswer ingestion api, the ID is inferred before use if not provided"""
@@ -38,9 +54,9 @@ class DocumentBase(BaseModel):
# UTC time
doc_updated_at: datetime | None = None
# Owner, creator, etc.
primary_owners: list[str] | None = None
primary_owners: list[BasicExpertInfo] | None = None
# Assignee, space owner, etc.
secondary_owners: list[str] | None = None
secondary_owners: list[BasicExpertInfo] | None = None
# `title` is used when computing best matches for a query
# if `None`, then we will use the `semantic_identifier` as the title in Vespa
title: str | None = None

View File

@@ -10,7 +10,7 @@ from retry import retry
from danswer.configs.app_configs import INDEX_BATCH_SIZE
from danswer.configs.constants import DocumentSource
from danswer.connectors.cross_connector_utils.time_utils import time_str_to_utc
from danswer.connectors.cross_connector_utils.miscellaneous_utils import time_str_to_utc
from danswer.connectors.interfaces import GenerateDocumentsOutput
from danswer.connectors.interfaces import PollConnector
from danswer.connectors.interfaces import SecondsSinceUnixEpoch

View File

@@ -23,7 +23,6 @@ from danswer.direct_qa.interfaces import StreamingError
from danswer.direct_qa.models import LLMMetricsContainer
from danswer.direct_qa.qa_utils import get_chunks_for_qa
from danswer.document_index.factory import get_default_document_index
from danswer.expert_recommendation.heuristics_based import extract_experts
from danswer.indexing.models import InferenceChunk
from danswer.search.models import QueryFlow
from danswer.search.models import RerankMetricsContainer
@@ -34,7 +33,6 @@ from danswer.search.search_runner import chunks_to_search_docs
from danswer.search.search_runner import full_chunk_search
from danswer.search.search_runner import full_chunk_search_generator
from danswer.secondary_llm_flows.answer_validation import get_answer_validity
from danswer.server.models import ExpertsResponse
from danswer.server.models import LLMRelevanceFilterResponse
from danswer.server.models import NewMessageRequest
from danswer.server.models import QADocsResponse
@@ -244,10 +242,6 @@ def answer_qa_query_stream(
# immediately see some results
top_chunks = cast(list[InferenceChunk], next(search_generator))
expert_emails = extract_experts(top_chunks)
expert_response = ExpertsResponse(experts=expert_emails).dict()
yield get_json_line(expert_response)
top_docs = chunks_to_search_docs(top_chunks)
initial_response = QADocsResponse(
top_documents=top_docs,

View File

@@ -47,6 +47,9 @@ from danswer.configs.constants import SOURCE_LINKS
from danswer.configs.constants import SOURCE_TYPE
from danswer.configs.constants import TITLE
from danswer.configs.model_configs import SEARCH_DISTANCE_CUTOFF
from danswer.connectors.cross_connector_utils.miscellaneous_utils import (
get_experts_stores_representations,
)
from danswer.document_index.document_index_utils import get_uuid_from_chunk
from danswer.document_index.interfaces import DocumentIndex
from danswer.document_index.interfaces import DocumentInsertionRecord
@@ -240,8 +243,8 @@ def _index_vespa_chunk(chunk: DocMetadataAwareIndexChunk) -> None:
EMBEDDINGS: embeddings_name_vector_map,
BOOST: DEFAULT_BOOST,
DOC_UPDATED_AT: _vespa_get_updated_at_attribute(document.doc_updated_at),
PRIMARY_OWNERS: document.primary_owners,
SECONDARY_OWNERS: document.secondary_owners,
PRIMARY_OWNERS: get_experts_stores_representations(document.primary_owners),
SECONDARY_OWNERS: get_experts_stores_representations(document.secondary_owners),
# the only `set` vespa has is `weightedset`, so we have to give each
# element an arbitrary weight
ACCESS_CONTROL_LIST: {acl_entry: 1 for acl_entry in chunk.access.to_acl()},

View File

@@ -1,35 +0,0 @@
from collections import defaultdict
from danswer.indexing.models import InferenceChunk
from danswer.utils.text_processing import is_valid_email
# What is the minimum cumulative score for a user to be considered an Expert
# If a set of 50 results is shown, user needs a cumulative doc score of 2.5 to be an expert
_EXPERT_SCORE_RATIO = 2.5 / 50
# How much should a score be discounted if the user is not the primary owner
_SECONDARY_OWNER_DISCOUNT = 0.5


def extract_experts(
    chunks: list[InferenceChunk], score_ratio: float = _EXPERT_SCORE_RATIO
) -> list[str]:
    """Return owners whose cumulative relevance score qualifies them as experts.

    Primary owners accrue each chunk's full score, secondary owners a
    discounted share; only owners that are valid emails are returned.
    """
    target_score = score_ratio * len(chunks)
    expert_scores: dict[str, float] = defaultdict(float)

    for chunk in chunks:
        if not chunk.score:
            # Chunks without a (non-zero) score contribute nothing.
            continue
        for owner in chunk.primary_owners or []:
            expert_scores[owner] += chunk.score
        for owner in chunk.secondary_owners or []:
            expert_scores[owner] += _SECONDARY_OWNER_DISCOUNT * chunk.score

    return [
        owner
        for owner, score in expert_scores.items()
        if score >= target_score and is_valid_email(owner)
    ]

View File

@@ -5,6 +5,9 @@ from typing import Protocol
from sqlalchemy.orm import Session
from danswer.access.access import get_access_for_documents
from danswer.connectors.cross_connector_utils.miscellaneous_utils import (
get_experts_stores_representations,
)
from danswer.connectors.models import Document
from danswer.connectors.models import IndexAttemptMetadata
from danswer.db.document import get_documents_by_ids
@@ -50,8 +53,8 @@ def upsert_documents_in_db(
document_id=doc.id,
semantic_identifier=doc.semantic_identifier,
first_link=first_link,
primary_owners=doc.primary_owners,
secondary_owners=doc.secondary_owners,
primary_owners=get_experts_stores_representations(doc.primary_owners),
secondary_owners=get_experts_stores_representations(doc.secondary_owners),
from_ingestion_api=doc.from_ingestion_api,
)
doc_m_batch.append(db_doc_metadata)

View File

@@ -202,10 +202,6 @@ class SearchFeedbackRequest(BaseModel):
search_feedback: SearchFeedbackType
class ExpertsResponse(BaseModel):
experts: list[str]
class RetrievalDocs(BaseModel):
top_documents: list[SearchDoc]