mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-04-07 19:38:19 +02:00
Update connector interface with optional Owners information (#798)
This commit is contained in:
parent
17c2f06338
commit
8020db9e9a
@ -1,11 +1,11 @@
|
||||
"""Experimental functionality related to splitting up indexing
|
||||
into a series of checkpoints to better handle intermmittent failures
|
||||
into a series of checkpoints to better handle intermittent failures
|
||||
/ jobs being killed by cloud providers."""
|
||||
import datetime
|
||||
|
||||
from danswer.configs.app_configs import EXPERIMENTAL_CHECKPOINTING_ENABLED
|
||||
from danswer.configs.constants import DocumentSource
|
||||
from danswer.connectors.cross_connector_utils.time_utils import datetime_to_utc
|
||||
from danswer.connectors.cross_connector_utils.miscellaneous_utils import datetime_to_utc
|
||||
|
||||
|
||||
def _2010_dt() -> datetime.datetime:
|
||||
|
@ -18,6 +18,7 @@ from danswer.connectors.interfaces import GenerateDocumentsOutput
|
||||
from danswer.connectors.interfaces import LoadConnector
|
||||
from danswer.connectors.interfaces import PollConnector
|
||||
from danswer.connectors.interfaces import SecondsSinceUnixEpoch
|
||||
from danswer.connectors.models import BasicExpertInfo
|
||||
from danswer.connectors.models import ConnectorMissingCredentialError
|
||||
from danswer.connectors.models import Document
|
||||
from danswer.connectors.models import Section
|
||||
@ -294,7 +295,9 @@ class ConfluenceConnector(LoadConnector, PollConnector):
|
||||
source=DocumentSource.CONFLUENCE,
|
||||
semantic_identifier=page["title"],
|
||||
doc_updated_at=last_modified,
|
||||
primary_owners=[author] if author else None,
|
||||
primary_owners=[BasicExpertInfo(email=author)]
|
||||
if author
|
||||
else None,
|
||||
metadata={
|
||||
"Wiki Space Name": self.space,
|
||||
},
|
||||
|
@ -0,0 +1,45 @@
|
||||
from datetime import datetime
|
||||
from datetime import timezone
|
||||
|
||||
from dateutil.parser import parse
|
||||
|
||||
from danswer.connectors.models import BasicExpertInfo
|
||||
from danswer.utils.text_processing import is_valid_email
|
||||
|
||||
|
||||
def datetime_to_utc(dt: datetime) -> datetime:
|
||||
if dt.tzinfo is None or dt.tzinfo.utcoffset(dt) is None:
|
||||
dt = dt.replace(tzinfo=timezone.utc)
|
||||
|
||||
return dt.astimezone(timezone.utc)
|
||||
|
||||
|
||||
def time_str_to_utc(datetime_str: str) -> datetime:
|
||||
dt = parse(datetime_str)
|
||||
return datetime_to_utc(dt)
|
||||
|
||||
|
||||
def basic_expert_info_representation(info: BasicExpertInfo) -> str | None:
|
||||
if info.first_name and info.last_name:
|
||||
return f"{info.first_name} {info.middle_initial} {info.last_name}"
|
||||
|
||||
if info.display_name:
|
||||
return info.display_name
|
||||
|
||||
if info.email and is_valid_email(info.email):
|
||||
return info.email
|
||||
|
||||
if info.first_name:
|
||||
return info.first_name
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def get_experts_stores_representations(
|
||||
experts: list[BasicExpertInfo] | None,
|
||||
) -> list[str] | None:
|
||||
if not experts:
|
||||
return None
|
||||
|
||||
reps = [basic_expert_info_representation(owner) for owner in experts]
|
||||
return [owner for owner in reps if owner is not None]
|
@ -1,16 +0,0 @@
|
||||
from datetime import datetime
|
||||
from datetime import timezone
|
||||
|
||||
from dateutil.parser import parse
|
||||
|
||||
|
||||
def datetime_to_utc(dt: datetime) -> datetime:
|
||||
if dt.tzinfo is None or dt.tzinfo.utcoffset(dt) is None:
|
||||
dt = dt.replace(tzinfo=timezone.utc)
|
||||
|
||||
return dt.astimezone(timezone.utc)
|
||||
|
||||
|
||||
def time_str_to_utc(datetime_str: str) -> datetime:
|
||||
dt = parse(datetime_str)
|
||||
return datetime_to_utc(dt)
|
@ -12,7 +12,7 @@ from danswer.connectors.cross_connector_utils.file_utils import detect_encoding
|
||||
from danswer.connectors.cross_connector_utils.file_utils import load_files_from_zip
|
||||
from danswer.connectors.cross_connector_utils.file_utils import read_file
|
||||
from danswer.connectors.cross_connector_utils.file_utils import read_pdf_file
|
||||
from danswer.connectors.cross_connector_utils.time_utils import time_str_to_utc
|
||||
from danswer.connectors.cross_connector_utils.miscellaneous_utils import time_str_to_utc
|
||||
from danswer.connectors.file.utils import check_file_ext_is_valid
|
||||
from danswer.connectors.file.utils import get_file_ext
|
||||
from danswer.connectors.interfaces import GenerateDocumentsOutput
|
||||
|
@ -8,7 +8,7 @@ import requests
|
||||
from danswer.configs.app_configs import INDEX_BATCH_SIZE
|
||||
from danswer.configs.constants import DocumentSource
|
||||
from danswer.connectors.cross_connector_utils.html_utils import parse_html_page_basic
|
||||
from danswer.connectors.cross_connector_utils.time_utils import time_str_to_utc
|
||||
from danswer.connectors.cross_connector_utils.miscellaneous_utils import time_str_to_utc
|
||||
from danswer.connectors.interfaces import GenerateDocumentsOutput
|
||||
from danswer.connectors.interfaces import LoadConnector
|
||||
from danswer.connectors.interfaces import PollConnector
|
||||
|
@ -27,6 +27,22 @@ class Section(BaseModel):
|
||||
link: str | None
|
||||
|
||||
|
||||
class BasicExpertInfo(BaseModel):
|
||||
"""Basic Information for the owner of a document, any of the fields can be left as None
|
||||
Display fallback goes as follows:
|
||||
- first_name + (optional middle_initial) + last_name
|
||||
- display_name
|
||||
- email
|
||||
- first_name
|
||||
"""
|
||||
|
||||
display_name: str | None = None
|
||||
first_name: str | None = None
|
||||
middle_initial: str | None = None
|
||||
last_name: str | None = None
|
||||
email: str | None = None
|
||||
|
||||
|
||||
class DocumentBase(BaseModel):
|
||||
"""Used for Danswer ingestion api, the ID is inferred before use if not provided"""
|
||||
|
||||
@ -38,9 +54,9 @@ class DocumentBase(BaseModel):
|
||||
# UTC time
|
||||
doc_updated_at: datetime | None = None
|
||||
# Owner, creator, etc.
|
||||
primary_owners: list[str] | None = None
|
||||
primary_owners: list[BasicExpertInfo] | None = None
|
||||
# Assignee, space owner, etc.
|
||||
secondary_owners: list[str] | None = None
|
||||
secondary_owners: list[BasicExpertInfo] | None = None
|
||||
# `title` is used when computing best matches for a query
|
||||
# if `None`, then we will use the `semantic_identifier` as the title in Vespa
|
||||
title: str | None = None
|
||||
|
@ -10,7 +10,7 @@ from retry import retry
|
||||
|
||||
from danswer.configs.app_configs import INDEX_BATCH_SIZE
|
||||
from danswer.configs.constants import DocumentSource
|
||||
from danswer.connectors.cross_connector_utils.time_utils import time_str_to_utc
|
||||
from danswer.connectors.cross_connector_utils.miscellaneous_utils import time_str_to_utc
|
||||
from danswer.connectors.interfaces import GenerateDocumentsOutput
|
||||
from danswer.connectors.interfaces import PollConnector
|
||||
from danswer.connectors.interfaces import SecondsSinceUnixEpoch
|
||||
|
@ -23,7 +23,6 @@ from danswer.direct_qa.interfaces import StreamingError
|
||||
from danswer.direct_qa.models import LLMMetricsContainer
|
||||
from danswer.direct_qa.qa_utils import get_chunks_for_qa
|
||||
from danswer.document_index.factory import get_default_document_index
|
||||
from danswer.expert_recommendation.heuristics_based import extract_experts
|
||||
from danswer.indexing.models import InferenceChunk
|
||||
from danswer.search.models import QueryFlow
|
||||
from danswer.search.models import RerankMetricsContainer
|
||||
@ -34,7 +33,6 @@ from danswer.search.search_runner import chunks_to_search_docs
|
||||
from danswer.search.search_runner import full_chunk_search
|
||||
from danswer.search.search_runner import full_chunk_search_generator
|
||||
from danswer.secondary_llm_flows.answer_validation import get_answer_validity
|
||||
from danswer.server.models import ExpertsResponse
|
||||
from danswer.server.models import LLMRelevanceFilterResponse
|
||||
from danswer.server.models import NewMessageRequest
|
||||
from danswer.server.models import QADocsResponse
|
||||
@ -244,10 +242,6 @@ def answer_qa_query_stream(
|
||||
# immediately see some results
|
||||
top_chunks = cast(list[InferenceChunk], next(search_generator))
|
||||
|
||||
expert_emails = extract_experts(top_chunks)
|
||||
expert_response = ExpertsResponse(experts=expert_emails).dict()
|
||||
yield get_json_line(expert_response)
|
||||
|
||||
top_docs = chunks_to_search_docs(top_chunks)
|
||||
initial_response = QADocsResponse(
|
||||
top_documents=top_docs,
|
||||
|
@ -47,6 +47,9 @@ from danswer.configs.constants import SOURCE_LINKS
|
||||
from danswer.configs.constants import SOURCE_TYPE
|
||||
from danswer.configs.constants import TITLE
|
||||
from danswer.configs.model_configs import SEARCH_DISTANCE_CUTOFF
|
||||
from danswer.connectors.cross_connector_utils.miscellaneous_utils import (
|
||||
get_experts_stores_representations,
|
||||
)
|
||||
from danswer.document_index.document_index_utils import get_uuid_from_chunk
|
||||
from danswer.document_index.interfaces import DocumentIndex
|
||||
from danswer.document_index.interfaces import DocumentInsertionRecord
|
||||
@ -240,8 +243,8 @@ def _index_vespa_chunk(chunk: DocMetadataAwareIndexChunk) -> None:
|
||||
EMBEDDINGS: embeddings_name_vector_map,
|
||||
BOOST: DEFAULT_BOOST,
|
||||
DOC_UPDATED_AT: _vespa_get_updated_at_attribute(document.doc_updated_at),
|
||||
PRIMARY_OWNERS: document.primary_owners,
|
||||
SECONDARY_OWNERS: document.secondary_owners,
|
||||
PRIMARY_OWNERS: get_experts_stores_representations(document.primary_owners),
|
||||
SECONDARY_OWNERS: get_experts_stores_representations(document.secondary_owners),
|
||||
# the only `set` vespa has is `weightedset`, so we have to give each
|
||||
# element an arbitrary weight
|
||||
ACCESS_CONTROL_LIST: {acl_entry: 1 for acl_entry in chunk.access.to_acl()},
|
||||
|
@ -1,35 +0,0 @@
|
||||
from collections import defaultdict
|
||||
|
||||
from danswer.indexing.models import InferenceChunk
|
||||
from danswer.utils.text_processing import is_valid_email
|
||||
|
||||
# What is the minimum cumulative score for a user to be considered an Expert
|
||||
# If a set of 50 results is shown, user needs a cumulative doc score of 2.5 to be an expert
|
||||
_EXPERT_SCORE_RATIO = 2.5 / 50
|
||||
# How much should a score be discounted if the user is not the primary owner
|
||||
_SECONDARY_OWNER_DISCOUNT = 0.5
|
||||
|
||||
|
||||
def extract_experts(
|
||||
chunks: list[InferenceChunk], score_ratio: float = _EXPERT_SCORE_RATIO
|
||||
) -> list[str]:
|
||||
target_score = score_ratio * len(chunks)
|
||||
|
||||
expert_scores: dict[str, float] = defaultdict(float)
|
||||
|
||||
for chunk in chunks:
|
||||
if chunk.primary_owners:
|
||||
for p_owner in chunk.primary_owners:
|
||||
if chunk.score:
|
||||
expert_scores[p_owner] += chunk.score
|
||||
|
||||
if chunk.secondary_owners:
|
||||
for s_owner in chunk.secondary_owners:
|
||||
if chunk.score:
|
||||
expert_scores[s_owner] += _SECONDARY_OWNER_DISCOUNT * chunk.score
|
||||
|
||||
return [
|
||||
owner
|
||||
for owner, score in expert_scores.items()
|
||||
if score >= target_score and is_valid_email(owner)
|
||||
]
|
@ -5,6 +5,9 @@ from typing import Protocol
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from danswer.access.access import get_access_for_documents
|
||||
from danswer.connectors.cross_connector_utils.miscellaneous_utils import (
|
||||
get_experts_stores_representations,
|
||||
)
|
||||
from danswer.connectors.models import Document
|
||||
from danswer.connectors.models import IndexAttemptMetadata
|
||||
from danswer.db.document import get_documents_by_ids
|
||||
@ -50,8 +53,8 @@ def upsert_documents_in_db(
|
||||
document_id=doc.id,
|
||||
semantic_identifier=doc.semantic_identifier,
|
||||
first_link=first_link,
|
||||
primary_owners=doc.primary_owners,
|
||||
secondary_owners=doc.secondary_owners,
|
||||
primary_owners=get_experts_stores_representations(doc.primary_owners),
|
||||
secondary_owners=get_experts_stores_representations(doc.secondary_owners),
|
||||
from_ingestion_api=doc.from_ingestion_api,
|
||||
)
|
||||
doc_m_batch.append(db_doc_metadata)
|
||||
|
@ -202,10 +202,6 @@ class SearchFeedbackRequest(BaseModel):
|
||||
search_feedback: SearchFeedbackType
|
||||
|
||||
|
||||
class ExpertsResponse(BaseModel):
|
||||
experts: list[str]
|
||||
|
||||
|
||||
class RetrievalDocs(BaseModel):
|
||||
top_documents: list[SearchDoc]
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user