Update connector interface with optional Owners information (#798)

Commit 8020db9e9a (parent 17c2f06338)
Mirror of https://github.com/danswer-ai/danswer.git
@@ -1,11 +1,11 @@
 """Experimental functionality related to splitting up indexing
-into a series of checkpoints to better handle intermmittent failures
+into a series of checkpoints to better handle intermittent failures
 / jobs being killed by cloud providers."""
 import datetime
 
 from danswer.configs.app_configs import EXPERIMENTAL_CHECKPOINTING_ENABLED
 from danswer.configs.constants import DocumentSource
-from danswer.connectors.cross_connector_utils.time_utils import datetime_to_utc
+from danswer.connectors.cross_connector_utils.miscellaneous_utils import datetime_to_utc
 
 
 def _2010_dt() -> datetime.datetime:
@@ -18,6 +18,7 @@ from danswer.connectors.interfaces import GenerateDocumentsOutput
 from danswer.connectors.interfaces import LoadConnector
 from danswer.connectors.interfaces import PollConnector
 from danswer.connectors.interfaces import SecondsSinceUnixEpoch
+from danswer.connectors.models import BasicExpertInfo
 from danswer.connectors.models import ConnectorMissingCredentialError
 from danswer.connectors.models import Document
 from danswer.connectors.models import Section
@@ -294,7 +295,9 @@ class ConfluenceConnector(LoadConnector, PollConnector):
                     source=DocumentSource.CONFLUENCE,
                     semantic_identifier=page["title"],
                     doc_updated_at=last_modified,
-                    primary_owners=[author] if author else None,
+                    primary_owners=[BasicExpertInfo(email=author)]
+                    if author
+                    else None,
                     metadata={
                         "Wiki Space Name": self.space,
                     },
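Rather than storing the page author as a bare string, the connector now wraps it in the new BasicExpertInfo model. A minimal sketch of the before/after, assuming a hypothetical author email taken from the Confluence page metadata:

    from danswer.connectors.models import BasicExpertInfo

    author = "jane.doe@example.com"  # hypothetical value from the page version history

    # Before this commit: owners were plain strings
    primary_owners_before = [author] if author else None

    # After this commit: owners are structured objects; only the email is known here
    primary_owners_after = [BasicExpertInfo(email=author)] if author else None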
@@ -0,0 +1,45 @@
+from datetime import datetime
+from datetime import timezone
+
+from dateutil.parser import parse
+
+from danswer.connectors.models import BasicExpertInfo
+from danswer.utils.text_processing import is_valid_email
+
+
+def datetime_to_utc(dt: datetime) -> datetime:
+    if dt.tzinfo is None or dt.tzinfo.utcoffset(dt) is None:
+        dt = dt.replace(tzinfo=timezone.utc)
+
+    return dt.astimezone(timezone.utc)
+
+
+def time_str_to_utc(datetime_str: str) -> datetime:
+    dt = parse(datetime_str)
+    return datetime_to_utc(dt)
+
+
+def basic_expert_info_representation(info: BasicExpertInfo) -> str | None:
+    if info.first_name and info.last_name:
+        return f"{info.first_name} {info.middle_initial} {info.last_name}"
+
+    if info.display_name:
+        return info.display_name
+
+    if info.email and is_valid_email(info.email):
+        return info.email
+
+    if info.first_name:
+        return info.first_name
+
+    return None
+
+
+def get_experts_stores_representations(
+    experts: list[BasicExpertInfo] | None,
+) -> list[str] | None:
+    if not experts:
+        return None
+
+    reps = [basic_expert_info_representation(owner) for owner in experts]
+    return [owner for owner in reps if owner is not None]
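Taken together, these helpers collapse each owner to a single display string, preferring full name, then display name, then a valid email, then first name alone, and dropping owners with nothing usable. A small sketch of the expected behaviour with made-up values:

    from danswer.connectors.models import BasicExpertInfo
    from danswer.connectors.cross_connector_utils.miscellaneous_utils import (
        get_experts_stores_representations,
    )

    owners = [
        BasicExpertInfo(first_name="Ada", middle_initial="B", last_name="Lovelace"),
        BasicExpertInfo(display_name="charlie"),
        BasicExpertInfo(email="dev@example.com"),
        BasicExpertInfo(),  # nothing usable -> filtered out
    ]

    # Expected (assuming is_valid_email accepts the address):
    # ["Ada B Lovelace", "charlie", "dev@example.com"]
    print(get_experts_stores_representations(owners))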
@@ -1,16 +0,0 @@
-from datetime import datetime
-from datetime import timezone
-
-from dateutil.parser import parse
-
-
-def datetime_to_utc(dt: datetime) -> datetime:
-    if dt.tzinfo is None or dt.tzinfo.utcoffset(dt) is None:
-        dt = dt.replace(tzinfo=timezone.utc)
-
-    return dt.astimezone(timezone.utc)
-
-
-def time_str_to_utc(datetime_str: str) -> datetime:
-    dt = parse(datetime_str)
-    return datetime_to_utc(dt)
@@ -12,7 +12,7 @@ from danswer.connectors.cross_connector_utils.file_utils import detect_encoding
 from danswer.connectors.cross_connector_utils.file_utils import load_files_from_zip
 from danswer.connectors.cross_connector_utils.file_utils import read_file
 from danswer.connectors.cross_connector_utils.file_utils import read_pdf_file
-from danswer.connectors.cross_connector_utils.time_utils import time_str_to_utc
+from danswer.connectors.cross_connector_utils.miscellaneous_utils import time_str_to_utc
 from danswer.connectors.file.utils import check_file_ext_is_valid
 from danswer.connectors.file.utils import get_file_ext
 from danswer.connectors.interfaces import GenerateDocumentsOutput
@@ -8,7 +8,7 @@ import requests
 from danswer.configs.app_configs import INDEX_BATCH_SIZE
 from danswer.configs.constants import DocumentSource
 from danswer.connectors.cross_connector_utils.html_utils import parse_html_page_basic
-from danswer.connectors.cross_connector_utils.time_utils import time_str_to_utc
+from danswer.connectors.cross_connector_utils.miscellaneous_utils import time_str_to_utc
 from danswer.connectors.interfaces import GenerateDocumentsOutput
 from danswer.connectors.interfaces import LoadConnector
 from danswer.connectors.interfaces import PollConnector
@@ -27,6 +27,22 @@ class Section(BaseModel):
     link: str | None
 
 
+class BasicExpertInfo(BaseModel):
+    """Basic Information for the owner of a document, any of the fields can be left as None
+    Display fallback goes as follows:
+    - first_name + (optional middle_initial) + last_name
+    - display_name
+    - email
+    - first_name
+    """
+
+    display_name: str | None = None
+    first_name: str | None = None
+    middle_initial: str | None = None
+    last_name: str | None = None
+    email: str | None = None
+
+
 class DocumentBase(BaseModel):
     """Used for Danswer ingestion api, the ID is inferred before use if not provided"""
 
@@ -38,9 +54,9 @@ class DocumentBase(BaseModel):
     # UTC time
     doc_updated_at: datetime | None = None
     # Owner, creator, etc.
-    primary_owners: list[str] | None = None
+    primary_owners: list[BasicExpertInfo] | None = None
     # Assignee, space owner, etc.
-    secondary_owners: list[str] | None = None
+    secondary_owners: list[BasicExpertInfo] | None = None
     # `title` is used when computing best matches for a query
     # if `None`, then we will use the `semantic_identifier` as the title in Vespa
     title: str | None = None
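With primary_owners and secondary_owners now typed as list[BasicExpertInfo] | None, a connector can attach whatever owner details it happens to know. A hedged sketch of a populated document (all values are made up, and the fields not shown in this diff, such as id, sections, and metadata, are assumed to keep their existing definitions):

    from datetime import datetime, timezone

    from danswer.configs.constants import DocumentSource
    from danswer.connectors.models import BasicExpertInfo, Document, Section

    doc = Document(
        id="confluence__hypothetical-page-id",
        sections=[Section(text="page body", link="https://wiki.example.com/page")],
        source=DocumentSource.CONFLUENCE,
        semantic_identifier="Example Page",
        doc_updated_at=datetime(2023, 11, 30, tzinfo=timezone.utc),
        metadata={"Wiki Space Name": "ENG"},
        # structured owners instead of bare strings
        primary_owners=[BasicExpertInfo(email="author@example.com")],
        secondary_owners=[BasicExpertInfo(display_name="Space Admin")],
    )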
@@ -10,7 +10,7 @@ from retry import retry
 
 from danswer.configs.app_configs import INDEX_BATCH_SIZE
 from danswer.configs.constants import DocumentSource
-from danswer.connectors.cross_connector_utils.time_utils import time_str_to_utc
+from danswer.connectors.cross_connector_utils.miscellaneous_utils import time_str_to_utc
 from danswer.connectors.interfaces import GenerateDocumentsOutput
 from danswer.connectors.interfaces import PollConnector
 from danswer.connectors.interfaces import SecondsSinceUnixEpoch
@@ -23,7 +23,6 @@ from danswer.direct_qa.interfaces import StreamingError
 from danswer.direct_qa.models import LLMMetricsContainer
 from danswer.direct_qa.qa_utils import get_chunks_for_qa
 from danswer.document_index.factory import get_default_document_index
-from danswer.expert_recommendation.heuristics_based import extract_experts
 from danswer.indexing.models import InferenceChunk
 from danswer.search.models import QueryFlow
 from danswer.search.models import RerankMetricsContainer
@@ -34,7 +33,6 @@ from danswer.search.search_runner import chunks_to_search_docs
 from danswer.search.search_runner import full_chunk_search
 from danswer.search.search_runner import full_chunk_search_generator
 from danswer.secondary_llm_flows.answer_validation import get_answer_validity
-from danswer.server.models import ExpertsResponse
 from danswer.server.models import LLMRelevanceFilterResponse
 from danswer.server.models import NewMessageRequest
 from danswer.server.models import QADocsResponse
@@ -244,10 +242,6 @@ def answer_qa_query_stream(
     # immediately see some results
     top_chunks = cast(list[InferenceChunk], next(search_generator))
 
-    expert_emails = extract_experts(top_chunks)
-    expert_response = ExpertsResponse(experts=expert_emails).dict()
-    yield get_json_line(expert_response)
-
     top_docs = chunks_to_search_docs(top_chunks)
     initial_response = QADocsResponse(
         top_documents=top_docs,
@@ -47,6 +47,9 @@ from danswer.configs.constants import SOURCE_LINKS
 from danswer.configs.constants import SOURCE_TYPE
 from danswer.configs.constants import TITLE
 from danswer.configs.model_configs import SEARCH_DISTANCE_CUTOFF
+from danswer.connectors.cross_connector_utils.miscellaneous_utils import (
+    get_experts_stores_representations,
+)
 from danswer.document_index.document_index_utils import get_uuid_from_chunk
 from danswer.document_index.interfaces import DocumentIndex
 from danswer.document_index.interfaces import DocumentInsertionRecord
@@ -240,8 +243,8 @@ def _index_vespa_chunk(chunk: DocMetadataAwareIndexChunk) -> None:
         EMBEDDINGS: embeddings_name_vector_map,
         BOOST: DEFAULT_BOOST,
         DOC_UPDATED_AT: _vespa_get_updated_at_attribute(document.doc_updated_at),
-        PRIMARY_OWNERS: document.primary_owners,
-        SECONDARY_OWNERS: document.secondary_owners,
+        PRIMARY_OWNERS: get_experts_stores_representations(document.primary_owners),
+        SECONDARY_OWNERS: get_experts_stores_representations(document.secondary_owners),
         # the only `set` vespa has is `weightedset`, so we have to give each
         # element an arbitrary weight
         ACCESS_CONTROL_LIST: {acl_entry: 1 for acl_entry in chunk.access.to_acl()},
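Since the Vespa schema keeps owners as plain string fields, the structured BasicExpertInfo objects are flattened back to their display representations at indexing time. A rough sketch of the effect, using hypothetical owner values:

    from danswer.connectors.models import BasicExpertInfo
    from danswer.connectors.cross_connector_utils.miscellaneous_utils import (
        get_experts_stores_representations,
    )

    primary_owners = [BasicExpertInfo(email="author@example.com")]
    secondary_owners = None  # e.g. a connector that knows nothing about secondary owners

    # Values that would land in the Vespa document fields:
    print(get_experts_stores_representations(primary_owners))    # ["author@example.com"]
    print(get_experts_stores_representations(secondary_owners))  # None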
@@ -1,35 +0,0 @@
-from collections import defaultdict
-
-from danswer.indexing.models import InferenceChunk
-from danswer.utils.text_processing import is_valid_email
-
-# What is the minimum cumulative score for a user to be considered an Expert
-# If a set of 50 results is shown, user needs a cumulative doc score of 2.5 to be an expert
-_EXPERT_SCORE_RATIO = 2.5 / 50
-# How much should a score be discounted if the user is not the primary owner
-_SECONDARY_OWNER_DISCOUNT = 0.5
-
-
-def extract_experts(
-    chunks: list[InferenceChunk], score_ratio: float = _EXPERT_SCORE_RATIO
-) -> list[str]:
-    target_score = score_ratio * len(chunks)
-
-    expert_scores: dict[str, float] = defaultdict(float)
-
-    for chunk in chunks:
-        if chunk.primary_owners:
-            for p_owner in chunk.primary_owners:
-                if chunk.score:
-                    expert_scores[p_owner] += chunk.score
-
-        if chunk.secondary_owners:
-            for s_owner in chunk.secondary_owners:
-                if chunk.score:
-                    expert_scores[s_owner] += _SECONDARY_OWNER_DISCOUNT * chunk.score
-
-    return [
-        owner
-        for owner, score in expert_scores.items()
-        if score >= target_score and is_valid_email(owner)
-    ]
@@ -5,6 +5,9 @@ from typing import Protocol
 from sqlalchemy.orm import Session
 
 from danswer.access.access import get_access_for_documents
+from danswer.connectors.cross_connector_utils.miscellaneous_utils import (
+    get_experts_stores_representations,
+)
 from danswer.connectors.models import Document
 from danswer.connectors.models import IndexAttemptMetadata
 from danswer.db.document import get_documents_by_ids
@@ -50,8 +53,8 @@ def upsert_documents_in_db(
             document_id=doc.id,
             semantic_identifier=doc.semantic_identifier,
             first_link=first_link,
-            primary_owners=doc.primary_owners,
-            secondary_owners=doc.secondary_owners,
+            primary_owners=get_experts_stores_representations(doc.primary_owners),
+            secondary_owners=get_experts_stores_representations(doc.secondary_owners),
             from_ingestion_api=doc.from_ingestion_api,
         )
         doc_m_batch.append(db_doc_metadata)
@@ -202,10 +202,6 @@ class SearchFeedbackRequest(BaseModel):
     search_feedback: SearchFeedbackType
 
 
-class ExpertsResponse(BaseModel):
-    experts: list[str]
-
-
 class RetrievalDocs(BaseModel):
     top_documents: list[SearchDoc]
 