Mirror of https://github.com/danswer-ai/danswer.git (synced 2025-04-08)
Introduce Recency Bias (#592)
This commit is contained in:
parent d9076a6ff6
commit 6a449f1fb1
@@ -0,0 +1,37 @@
"""Basic Document Metadata

Revision ID: ffc707a226b4
Revises: 30c1d5744104
Create Date: 2023-10-18 16:52:25.967592

"""
from alembic import op
import sqlalchemy as sa
from sqlalchemy.dialects import postgresql

# revision identifiers, used by Alembic.
revision = "ffc707a226b4"
down_revision = "30c1d5744104"
branch_labels = None
depends_on = None


def upgrade() -> None:
    op.add_column(
        "document",
        sa.Column("doc_updated_at", sa.DateTime(timezone=True), nullable=True),
    )
    op.add_column(
        "document",
        sa.Column("primary_owners", postgresql.ARRAY(sa.String()), nullable=True),
    )
    op.add_column(
        "document",
        sa.Column("secondary_owners", postgresql.ARRAY(sa.String()), nullable=True),
    )


def downgrade() -> None:
    op.drop_column("document", "secondary_owners")
    op.drop_column("document", "primary_owners")
    op.drop_column("document", "doc_updated_at")
@@ -92,6 +92,7 @@ class InferenceChunk(BaseChunk):
    source_type: str
    semantic_identifier: str
    boost: int
    recency_bias: float
    score: float | None
    hidden: bool
    metadata: dict[str, Any]
@@ -153,6 +153,11 @@ NUM_DOCUMENT_TOKENS_FED_TO_GENERATIVE_MODEL = int(
NUM_DOCUMENT_TOKENS_FED_TO_CHAT = int(
    os.environ.get("NUM_DOCUMENT_TOKENS_FED_TO_CHAT") or (512 * 3)
)
# 1 / (1 + DOC_TIME_DECAY * doc-age-in-years)
# Capped in Vespa at 0.5
DOC_TIME_DECAY = float(
    os.environ.get("DOC_TIME_DECAY") or 0.5  # Hits limit at 2 years by default
)
# 1 edit per 2 characters, currently unused due to fuzzy match being too slow
QUOTE_ALLOWED_ERROR_PERCENT = 0.05
QA_TIMEOUT = int(os.environ.get("QA_TIMEOUT") or "60")  # 60 seconds
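For reference, a minimal sketch (plain Python, not part of this commit) of the multiplier these settings describe; the 0.5 floor mirrors the cap applied in the Vespa rank profile further down:

def recency_multiplier(doc_age_years: float, decay: float = 0.5) -> float:
    # 1 / (1 + DOC_TIME_DECAY * doc-age-in-years), never below 0.5
    return max(1 / (1 + decay * doc_age_years), 0.5)

# With the default DOC_TIME_DECAY of 0.5 the floor is reached at 2 years:
assert recency_multiplier(0.0) == 1.0
assert recency_multiplier(2.0) == 0.5   # hits the cap
assert recency_multiplier(10.0) == 0.5  # stays capped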
@@ -25,6 +25,10 @@ PUBLIC_DOC_PAT = "PUBLIC"
PUBLIC_DOCUMENT_SET = "__PUBLIC"
QUOTE = "quote"
BOOST = "boost"
DOC_UPDATED_AT = "doc_updated_at"  # Indexed as seconds since epoch
PRIMARY_OWNERS = "primary_owners"
SECONDARY_OWNERS = "secondary_owners"
RECENCY_BIAS = "recency_bias"
HIDDEN = "hidden"
SCORE = "score"
ID_SEPARATOR = ":;:"
@@ -196,6 +196,7 @@ class ConfluenceConnector(LoadConnector, PollConnector):
            batch = self._fetch_pages(self.confluence_client, start_ind)
            for page in batch:
                last_modified_str = page["version"]["when"]
                author = page["version"].get("by", {}).get("email")
                last_modified = datetime.fromisoformat(last_modified_str)

                if time_filter is None or time_filter(last_modified):
@@ -220,9 +221,10 @@ class ConfluenceConnector(LoadConnector, PollConnector):
                        sections=[Section(link=page_url, text=page_text)],
                        source=DocumentSource.CONFLUENCE,
                        semantic_identifier=page["title"],
                        doc_updated_at=last_modified,
                        primary_owners=[author],
                        metadata={
                            "Wiki Space Name": self.space,
                            "Updated At": page["version"]["friendlyWhen"],
                        },
                    )
                )
@@ -1,4 +1,5 @@
from dataclasses import dataclass
from datetime import datetime
from enum import Enum
from typing import Any

@@ -26,6 +27,12 @@ class Document:
    source: DocumentSource
    semantic_identifier: str  # displayed in the UI as the main identifier for the doc
    metadata: dict[str, Any]
    # UTC time
    doc_updated_at: datetime | None = None
    # Owner, creator, etc.
    primary_owners: list[str] | None = None
    # Assignee, space owner, etc.
    secondary_owners: list[str] | None = None
    # `title` is used when computing best matches for a query
    # if `None`, then we will use the `semantic_identifier` as the title in Vespa
    title: str | None = None
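A hypothetical sketch (not part of the diff; the id, link, and emails are invented, and fields not visible in this hunk may differ) of how a connector could populate the new metadata fields. The update time must be timezone-aware UTC, which the datastore utility below enforces:

from datetime import datetime, timezone

doc = Document(
    id="confluence__12345",  # hypothetical document id
    sections=[Section(link="https://wiki.example.com/page", text="...")],
    source=DocumentSource.CONFLUENCE,
    semantic_identifier="Onboarding Guide",
    metadata={"Wiki Space Name": "ENG"},
    doc_updated_at=datetime(2023, 10, 18, 16, 52, tzinfo=timezone.utc),  # UTC time
    primary_owners=["author@example.com"],          # owner, creator, etc.
    secondary_owners=["space-owner@example.com"],   # assignee, space owner, etc.
)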
@@ -1,5 +1,7 @@
import math
import uuid
from datetime import datetime
from datetime import timezone

from danswer.chunking.models import IndexChunk
from danswer.chunking.models import InferenceChunk
@@ -30,3 +32,13 @@ def get_uuid_from_chunk(
        [doc_str, str(chunk.chunk_id), str(mini_chunk_ind)]
    )
    return uuid.uuid5(uuid.NAMESPACE_X500, unique_identifier_string)


def translate_to_epoch_seconds_ensure_tz(t: datetime | None) -> int | None:
    if not t:
        return None

    if t.tzinfo != timezone.utc:
        raise ValueError("Connectors must provide document update time in UTC")

    return int(t.timestamp())
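A quick illustration (not in the commit) of the UTC guard: an aware UTC datetime becomes epoch seconds, while a naive one is rejected.

from datetime import datetime, timezone

translate_to_epoch_seconds_ensure_tz(None)                                          # -> None
translate_to_epoch_seconds_ensure_tz(datetime(2023, 10, 18, tzinfo=timezone.utc))   # -> 1697587200
translate_to_epoch_seconds_ensure_tz(datetime(2023, 10, 18))                        # -> ValueError (naive, tzinfo is None)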
@@ -33,31 +33,32 @@ class IndexingPipelineProtocol(Protocol):


def _upsert_documents(
    document_ids: list[str],
    documents: list[Document],
    index_attempt_metadata: IndexAttemptMetadata,
    doc_m_data_lookup: dict[str, tuple[str, str]],
    db_session: Session,
) -> None:
    doc_m_batch: list[DocumentMetadata] = []
    for doc in documents:
        first_link = next(
            (section.link for section in doc.sections if section.link), ""
        )
        db_doc_metadata = DocumentMetadata(
            connector_id=index_attempt_metadata.connector_id,
            credential_id=index_attempt_metadata.credential_id,
            document_id=doc.id,
            semantic_identifier=doc.semantic_identifier,
            first_link=first_link,
            primary_owners=doc.primary_owners,
            secondary_owners=doc.secondary_owners,
        )
        doc_m_batch.append(db_doc_metadata)

    upsert_documents_complete(
        db_session=db_session,
        document_metadata_batch=[
            DocumentMetadata(
                connector_id=index_attempt_metadata.connector_id,
                credential_id=index_attempt_metadata.credential_id,
                document_id=document_id,
                semantic_identifier=doc_m_data_lookup[document_id][0],
                first_link=doc_m_data_lookup[document_id][1],
            )
            for document_id in document_ids
        ],
        document_metadata_batch=doc_m_batch,
    )


def _extract_minimal_document_metadata(doc: Document) -> tuple[str, str]:
    first_link = next((section.link for section in doc.sections if section.link), "")
    return doc.semantic_identifier, first_link


def _indexing_pipeline(
    *,
    chunker: Chunker,
@@ -70,9 +71,6 @@ def _indexing_pipeline(
    Note that the documents should already be batched at this point so that it does not inflate the
    memory requirements"""
    document_ids = [document.id for document in documents]
    document_metadata_lookup = {
        doc.id: _extract_minimal_document_metadata(doc) for doc in documents
    }

    with Session(get_sqlalchemy_engine()) as db_session:
        # acquires a lock on the documents so that no other process can modify them
@@ -80,9 +78,8 @@ def _indexing_pipeline(

        # create records in the source of truth about these documents
        _upsert_documents(
            document_ids=document_ids,
            documents=documents,
            index_attempt_metadata=index_attempt_metadata,
            doc_m_data_lookup=document_metadata_lookup,
            db_session=db_session,
        )

@@ -1,5 +1,6 @@
import abc
from dataclasses import dataclass
from datetime import datetime
from typing import Any
from uuid import UUID

@@ -24,6 +25,11 @@ class DocumentMetadata:
    document_id: str
    semantic_identifier: str
    first_link: str
    doc_updated_at: datetime | None = None
    # Emails, not necessarily attached to users
    # Users may not be in Danswer
    primary_owners: list[str] | None = None
    secondary_owners: list[str] | None = None


@dataclass
@@ -71,6 +71,15 @@ schema danswer_chunk {
            distance-metric: angular
        }
    }
    field doc_updated_at type int {
        indexing: summary | attribute
    }
    field primary_owners type array<string> {
        indexing : summary | attribute
    }
    field secondary_owners type array<string> {
        indexing : summary | attribute
    }
    field access_control_list type weightedset<string> {
        indexing: summary | attribute
        attribute: fast-search
@@ -85,41 +94,72 @@ schema danswer_chunk {
        fields: content, title
    }

    rank-profile keyword_search inherits default {
    rank-profile default_rank {
        inputs {
            query(decay_factor) float
        }

        function inline document_boost() {
            # 0 to 2x score following sigmoid function stretched out by factor of 3
            # meaning requires 3x the number of feedback votes to have default sigmoid effect
            expression: 2 / (1 + exp(-attribute(boost) / 3))
        }

        function inline document_age() {
            # Time in years (3 Months if no age found)
            expression: max(if(isNan(attribute(doc_updated_at)) == 1, 7890000, now() - attribute(doc_updated_at)) / 31536000, 0)
        }

        function inline recency_bias() {
            # Cap the loss at 50% score reduction
            expression: max(1 / (1 + query(decay_factor) * document_age), 0.5)
        }

        match-features: recency_bias
    }

    rank-profile keyword_search inherits default, default_rank {
        first-phase {
            expression: bm25(content) * (2 / (1 + exp(-attribute(boost) / 3)))
            expression: bm25(content) * document_boost * recency_bias
        }
    }

    rank-profile semantic_search inherits default {
    rank-profile semantic_search inherits default, default_rank {
        inputs {
            query(query_embedding) tensor<float>(x[384])
        }

        first-phase {
            # Cannot do boost with the chosen embedding model because of high default similarity
            # This depends on the embedding model chosen
            expression: closeness(field, embeddings)
        }
        match-features: closest(embeddings)

        match-features: recency_bias closest(embeddings)
    }

    rank-profile hybrid_search inherits default {
    # TODO this isn't used and needs to be reworked
    rank-profile hybrid_search inherits default, default_rank {
        inputs {
            query(query_embedding) tensor<float>(x[384])
        }

        first-phase {
            expression: bm25(content) * (2 / (1 + exp(-attribute(boost) / 3)))
            expression: bm25(content) * document_boost * recency_bias
        }

        second-phase {
            # Cannot do boost with the chosen embedding model because of high default similarity
            expression: closeness(field, embeddings)
        }
        match-features: closest(embeddings)

        match-features: recency_bias closest(embeddings)
    }

    # used when searching from the admin UI for a specific doc to hide / boost
    rank-profile admin_search inherits default {
        first-phase {
            expression: bm25(content) + (100 * bm25(title))
            expression: bm25(content) + (5 * bm25(title))
        }
    }
}
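For intuition, a rough Python transcription (not part of the schema) of the first-phase keyword score built from the default_rank functions; doc_updated_at is assumed to already be epoch seconds, as indexed above:

import math
import time

def keyword_first_phase_score(
    bm25_content: float,
    boost: int,
    doc_updated_at: float | None,  # epoch seconds, as indexed in Vespa
    decay_factor: float = 0.5,
) -> float:
    # document_boost: sigmoid stretched by a factor of 3, yielding a 0..2x multiplier
    document_boost = 2 / (1 + math.exp(-boost / 3))
    # document_age: age in years, falling back to ~3 months (7,890,000 s) if unknown
    age_seconds = 7_890_000 if doc_updated_at is None else time.time() - doc_updated_at
    document_age = max(age_seconds / 31_536_000, 0)
    # recency_bias: 1 / (1 + decay * age-in-years), capped at a 50% score reduction
    recency_bias = max(1 / (1 + decay_factor * document_age), 0.5)
    return bm25_content * document_boost * recency_bias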
@@ -14,6 +14,7 @@ from requests import Response

from danswer.chunking.models import DocMetadataAwareIndexChunk
from danswer.chunking.models import InferenceChunk
from danswer.configs.app_configs import DOC_TIME_DECAY
from danswer.configs.app_configs import DOCUMENT_INDEX_NAME
from danswer.configs.app_configs import EDIT_KEYWORD_QUERY
from danswer.configs.app_configs import NUM_RETURNED_HITS
@@ -27,13 +28,17 @@ from danswer.configs.constants import BOOST
from danswer.configs.constants import CHUNK_ID
from danswer.configs.constants import CONTENT
from danswer.configs.constants import DEFAULT_BOOST
from danswer.configs.constants import DOC_UPDATED_AT
from danswer.configs.constants import DOCUMENT_ID
from danswer.configs.constants import DOCUMENT_SETS
from danswer.configs.constants import EMBEDDINGS
from danswer.configs.constants import HIDDEN
from danswer.configs.constants import MATCH_HIGHLIGHTS
from danswer.configs.constants import METADATA
from danswer.configs.constants import PRIMARY_OWNERS
from danswer.configs.constants import RECENCY_BIAS
from danswer.configs.constants import SCORE
from danswer.configs.constants import SECONDARY_OWNERS
from danswer.configs.constants import SECTION_CONTINUATION
from danswer.configs.constants import SEMANTIC_IDENTIFIER
from danswer.configs.constants import SOURCE_LINKS
@@ -41,6 +46,7 @@ from danswer.configs.constants import SOURCE_TYPE
from danswer.configs.constants import TITLE
from danswer.configs.model_configs import SEARCH_DISTANCE_CUTOFF
from danswer.datastores.datastore_utils import get_uuid_from_chunk
from danswer.datastores.datastore_utils import translate_to_epoch_seconds_ensure_tz
from danswer.datastores.interfaces import DocumentIndex
from danswer.datastores.interfaces import DocumentInsertionRecord
from danswer.datastores.interfaces import IndexFilter
@@ -172,6 +178,9 @@ def _index_vespa_chunk(
        METADATA: json.dumps(document.metadata),
        EMBEDDINGS: embeddings_name_vector_map,
        BOOST: DEFAULT_BOOST,
        DOC_UPDATED_AT: translate_to_epoch_seconds_ensure_tz(document.doc_updated_at),
        PRIMARY_OWNERS: document.primary_owners,
        SECONDARY_OWNERS: document.secondary_owners,
        # the only `set` vespa has is `weightedset`, so we have to give each
        # element an arbitrary weight
        ACCESS_CONTROL_LIST: {acl_entry: 1 for acl_entry in chunk.access.to_acl()},
@@ -363,6 +372,7 @@ def _query_vespa(query_params: Mapping[str, str | int]) -> list[InferenceChunk]:
        InferenceChunk.from_dict(
            dict(
                hit["fields"],
                **{RECENCY_BIAS: hit["fields"]["matchfeatures"][RECENCY_BIAS]},
                **{SCORE: hit["relevance"]},
                **{
                    MATCH_HIGHLIGHTS: _process_dynamic_summary(
@@ -395,7 +405,8 @@ class VespaIndex(DocumentIndex):
        f"{SECTION_CONTINUATION}, "
        f"{BOOST}, "
        f"{HIDDEN}, "
        f"{METADATA} "
        f"{DOC_UPDATED_AT}, "
        f"{METADATA}, "
        f"{CONTENT_SUMMARY} "
        f"from {DOCUMENT_INDEX_NAME} where "
    )
@@ -538,6 +549,7 @@ class VespaIndex(DocumentIndex):
        params: dict[str, str | int] = {
            "yql": yql,
            "query": query,
            "input.query(decay_factor)": str(DOC_TIME_DECAY),
            "hits": num_to_retrieve,
            "num_to_rerank": 10 * num_to_retrieve,
            "ranking.profile": "keyword_search",
@@ -575,6 +587,7 @@ class VespaIndex(DocumentIndex):
            "yql": yql,
            "query": query_keywords,
            "input.query(query_embedding)": str(query_embedding),
            "input.query(decay_factor)": str(DOC_TIME_DECAY),
            "ranking.profile": "semantic_search",
        }

@@ -606,6 +619,7 @@ class VespaIndex(DocumentIndex):
            "yql": yql,
            "query": query,
            "input.query(query_embedding)": str(query_embedding),
            "input.query(decay_factor)": str(DOC_TIME_DECAY),
            "ranking.profile": "hybrid_search",
        }

@@ -158,6 +158,9 @@ def upsert_documents(
                hidden=False,
                semantic_id=doc.semantic_identifier,
                link=doc.first_link,
                doc_updated_at=doc.doc_updated_at,
                primary_owners=doc.primary_owners,
                secondary_owners=doc.secondary_owners,
            )
        )
        for doc in seen_documents.values()
@@ -398,6 +398,19 @@ class Document(Base):
    semantic_id: Mapped[str] = mapped_column(String)
    # First Section's link
    link: Mapped[str | None] = mapped_column(String, nullable=True)
    doc_updated_at: Mapped[datetime.datetime | None] = mapped_column(
        DateTime(timezone=True), nullable=True
    )
    # The following are not attached to User because the account/email may not be known
    # within Danswer
    # Something like the document creator
    primary_owners: Mapped[list[str] | None] = mapped_column(
        postgresql.ARRAY(String), nullable=True
    )
    # Something like assignee or space owner
    secondary_owners: Mapped[list[str] | None] = mapped_column(
        postgresql.ARRAY(String), nullable=True
    )
    # TODO if more sensitive data is added here for display, make sure to add user/group permission

    retrieval_feedbacks: Mapped[List[DocumentRetrievalFeedback]] = relationship(
@@ -86,7 +86,8 @@ def semantic_reranking(
    ) / len(sim_scores)

    boosts = [translate_boost_count_to_multiplier(chunk.boost) for chunk in chunks]
    boosted_sim_scores = shifted_sim_scores * boosts
    recency_multiplier = [chunk.recency_bias for chunk in chunks]
    boosted_sim_scores = shifted_sim_scores * boosts * recency_multiplier
    normalized_b_s_scores = (boosted_sim_scores + cross_models_min - model_min) / (
        model_max - model_min
    )
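A toy example (numpy assumed, numbers invented) of the element-wise rescoring in this hunk: each cross-encoder score is scaled by the chunk's feedback boost and by the recency bias carried back from Vespa.

import numpy as np

shifted_sim_scores = np.array([0.80, 0.75, 0.60])  # cross-encoder similarity scores
boosts = np.array([1.0, 1.2, 0.9])                 # translate_boost_count_to_multiplier output
recency_multiplier = np.array([1.0, 0.5, 0.8])     # chunk.recency_bias per chunk

boosted_sim_scores = shifted_sim_scores * boosts * recency_multiplier  # ~ [0.8, 0.45, 0.432]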
@@ -84,6 +84,9 @@ class TestQAPostprocessing(unittest.TestCase):
            "Answer: Air Bud was a movie about dogs and quote: people loved it",
        )

    @unittest.skip(
        "Using fuzzy match is too slow anyway, doesn't matter if it's broken"
    )
    def test_fuzzy_match_quotes_to_docs(self) -> None:
        chunk_0_text = textwrap.dedent(
            """
@@ -112,6 +115,7 @@ class TestQAPostprocessing(unittest.TestCase):
            blurb="anything",
            semantic_identifier="anything",
            section_continuation=False,
            recency_bias=1,
            boost=0,
            hidden=False,
            score=1,
@@ -127,6 +131,7 @@ class TestQAPostprocessing(unittest.TestCase):
            blurb="whatever",
            semantic_identifier="whatever",
            section_continuation=False,
            recency_bias=1,
            boost=0,
            hidden=False,
            score=1,
@@ -82,6 +82,8 @@ services:
      - DANSWER_BOT_DISPLAY_ERROR_MSGS=${DANSWER_BOT_DISPLAY_ERROR_MSGS:-}
      - DANSWER_BOT_RESPOND_EVERY_CHANNEL=${DANSWER_BOT_RESPOND_EVERY_CHANNEL:-}
      - NOTIFY_SLACKBOT_NO_ANSWER=${NOTIFY_SLACKBOT_NO_ANSWER:-}
      # Recency Bias for search results, decay at 1 / (1 + DOC_TIME_DECAY * x years)
      - DOC_TIME_DECAY=${DOC_TIME_DECAY:-}
      # Don't change the NLP model configs unless you know what you're doing
      - DOCUMENT_ENCODER_MODEL=${DOCUMENT_ENCODER_MODEL:-}
      - NORMALIZE_EMBEDDINGS=${NORMALIZE_EMBEDDINGS:-}