Introduce Recency Bias (#592)

Yuhong Sun 2023-10-19 12:54:35 -07:00 committed by GitHub
parent d9076a6ff6
commit 6a449f1fb1
16 changed files with 182 additions and 33 deletions

View File

@@ -0,0 +1,37 @@
"""Basic Document Metadata
Revision ID: ffc707a226b4
Revises: 30c1d5744104
Create Date: 2023-10-18 16:52:25.967592
"""
from alembic import op
import sqlalchemy as sa
from sqlalchemy.dialects import postgresql
# revision identifiers, used by Alembic.
revision = "ffc707a226b4"
down_revision = "30c1d5744104"
branch_labels = None
depends_on = None
def upgrade() -> None:
op.add_column(
"document",
sa.Column("doc_updated_at", sa.DateTime(timezone=True), nullable=True),
)
op.add_column(
"document",
sa.Column("primary_owners", postgresql.ARRAY(sa.String()), nullable=True),
)
op.add_column(
"document",
sa.Column("secondary_owners", postgresql.ARRAY(sa.String()), nullable=True),
)
def downgrade() -> None:
op.drop_column("document", "secondary_owners")
op.drop_column("document", "primary_owners")
op.drop_column("document", "doc_updated_at")

View File

@@ -92,6 +92,7 @@ class InferenceChunk(BaseChunk):
source_type: str
semantic_identifier: str
boost: int
recency_bias: float
score: float | None
hidden: bool
metadata: dict[str, Any]

View File

@@ -153,6 +153,11 @@ NUM_DOCUMENT_TOKENS_FED_TO_GENERATIVE_MODEL = int(
NUM_DOCUMENT_TOKENS_FED_TO_CHAT = int(
os.environ.get("NUM_DOCUMENT_TOKENS_FED_TO_CHAT") or (512 * 3)
)
# 1 / (1 + DOC_TIME_DECAY * doc-age-in-years)
# Capped in Vespa at 0.5
DOC_TIME_DECAY = float(
os.environ.get("DOC_TIME_DECAY") or 0.5 # Hits limit at 2 years by default
)
# 1 edit per 2 characters, currently unused due to fuzzy match being too slow
QUOTE_ALLOWED_ERROR_PERCENT = 0.05
QA_TIMEOUT = int(os.environ.get("QA_TIMEOUT") or "60") # 60 seconds
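A worked example of the decay comment above; the helper below is illustrative only (the real computation happens inside Vespa's rank profile, shown later in this diff). With the default of 0.5, a fresh document keeps its full score, a one-year-old document is scaled by 1 / (1 + 0.5 * 1) ≈ 0.67, and anything two years or older sits at the 0.5 floor.

DOC_TIME_DECAY = 0.5  # default from above

def recency_multiplier(doc_age_years: float, decay: float = DOC_TIME_DECAY) -> float:
    # Mirrors the Vespa expression: 1 / (1 + decay * age-in-years), floored at 0.5
    return max(1 / (1 + decay * doc_age_years), 0.5)

assert recency_multiplier(0) == 1.0              # brand-new doc: no penalty
assert round(recency_multiplier(1), 2) == 0.67   # one year old: roughly a third off
assert recency_multiplier(2) == 0.5              # two years old: hits the cap
assert recency_multiplier(10) == 0.5             # never loses more than half the score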

View File

@@ -25,6 +25,10 @@ PUBLIC_DOC_PAT = "PUBLIC"
PUBLIC_DOCUMENT_SET = "__PUBLIC"
QUOTE = "quote"
BOOST = "boost"
DOC_UPDATED_AT = "doc_updated_at" # Indexed as seconds since epoch
PRIMARY_OWNERS = "primary_owners"
SECONDARY_OWNERS = "secondary_owners"
RECENCY_BIAS = "recency_bias"
HIDDEN = "hidden"
SCORE = "score"
ID_SEPARATOR = ":;:"

View File

@@ -196,6 +196,7 @@ class ConfluenceConnector(LoadConnector, PollConnector):
batch = self._fetch_pages(self.confluence_client, start_ind)
for page in batch:
last_modified_str = page["version"]["when"]
author = page["version"].get("by", {}).get("email")
last_modified = datetime.fromisoformat(last_modified_str)
if time_filter is None or time_filter(last_modified):
@@ -220,9 +221,10 @@ class ConfluenceConnector(LoadConnector, PollConnector):
sections=[Section(link=page_url, text=page_text)],
source=DocumentSource.CONFLUENCE,
semantic_identifier=page["title"],
doc_updated_at=last_modified,
primary_owners=[author],
metadata={
"Wiki Space Name": self.space,
"Updated At": page["version"]["friendlyWhen"],
},
)
)
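The update time eventually reaches translate_to_epoch_seconds_ensure_tz (added in datastore_utils below), which only accepts UTC, while datetime.fromisoformat preserves whatever offset the Confluence API reports. A minimal sketch of normalizing such a timestamp, using a made-up sample value:

from datetime import datetime, timezone

# Hypothetical Confluence "version.when" value; the offset varies by site.
last_modified_str = "2023-10-18T16:52:25.967-07:00"

# Parse, then normalize to UTC so the downstream epoch conversion accepts it.
last_modified = datetime.fromisoformat(last_modified_str).astimezone(timezone.utc)
assert last_modified.tzinfo == timezone.utc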

View File

@@ -1,4 +1,5 @@
from dataclasses import dataclass
from datetime import datetime
from enum import Enum
from typing import Any
@@ -26,6 +27,12 @@ class Document:
source: DocumentSource
semantic_identifier: str # displayed in the UI as the main identifier for the doc
metadata: dict[str, Any]
# UTC time
doc_updated_at: datetime | None = None
# Owner, creator, etc.
primary_owners: list[str] | None = None
# Assignee, space owner, etc.
secondary_owners: list[str] | None = None
# `title` is used when computing best matches for a query
# if `None`, then we will use the `semantic_identifier` as the title in Vespa
title: str | None = None

View File

@@ -1,5 +1,7 @@
import math
import uuid
from datetime import datetime
from datetime import timezone
from danswer.chunking.models import IndexChunk
from danswer.chunking.models import InferenceChunk
@@ -30,3 +32,13 @@ def get_uuid_from_chunk(
[doc_str, str(chunk.chunk_id), str(mini_chunk_ind)]
)
return uuid.uuid5(uuid.NAMESPACE_X500, unique_identifier_string)
def translate_to_epoch_seconds_ensure_tz(t: datetime | None) -> int | None:
if not t:
return None
if t.tzinfo != timezone.utc:
raise ValueError("Connectors must provide document update time in UTC")
return int(t.timestamp())
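A quick usage sketch of the helper above (the timestamps are arbitrary):

from datetime import datetime, timezone

# A UTC-aware datetime converts cleanly to epoch seconds.
assert translate_to_epoch_seconds_ensure_tz(
    datetime(2023, 10, 19, 12, 0, tzinfo=timezone.utc)
) == 1697716800

# None passes through untouched.
assert translate_to_epoch_seconds_ensure_tz(None) is None

# Naive or non-UTC datetimes are rejected.
try:
    translate_to_epoch_seconds_ensure_tz(datetime(2023, 10, 19, 12, 0))
except ValueError:
    pass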

View File

@@ -33,31 +33,32 @@ class IndexingPipelineProtocol(Protocol):
def _upsert_documents(
document_ids: list[str],
documents: list[Document],
index_attempt_metadata: IndexAttemptMetadata,
doc_m_data_lookup: dict[str, tuple[str, str]],
db_session: Session,
) -> None:
doc_m_batch: list[DocumentMetadata] = []
for doc in documents:
first_link = next(
(section.link for section in doc.sections if section.link), ""
)
db_doc_metadata = DocumentMetadata(
connector_id=index_attempt_metadata.connector_id,
credential_id=index_attempt_metadata.credential_id,
document_id=doc.id,
semantic_identifier=doc.semantic_identifier,
first_link=first_link,
primary_owners=doc.primary_owners,
secondary_owners=doc.secondary_owners,
)
doc_m_batch.append(db_doc_metadata)
upsert_documents_complete(
db_session=db_session,
document_metadata_batch=[
DocumentMetadata(
connector_id=index_attempt_metadata.connector_id,
credential_id=index_attempt_metadata.credential_id,
document_id=document_id,
semantic_identifier=doc_m_data_lookup[document_id][0],
first_link=doc_m_data_lookup[document_id][1],
)
for document_id in document_ids
],
document_metadata_batch=doc_m_batch,
)
def _extract_minimal_document_metadata(doc: Document) -> tuple[str, str]:
first_link = next((section.link for section in doc.sections if section.link), "")
return doc.semantic_identifier, first_link
def _indexing_pipeline(
*,
chunker: Chunker,
@@ -70,9 +71,6 @@ def _indexing_pipeline(
Note that the documents should already be batched at this point so that it does not inflate the
memory requirements"""
document_ids = [document.id for document in documents]
document_metadata_lookup = {
doc.id: _extract_minimal_document_metadata(doc) for doc in documents
}
with Session(get_sqlalchemy_engine()) as db_session:
# acquires a lock on the documents so that no other process can modify them
@@ -80,9 +78,8 @@
# create records in the source of truth about these documents
_upsert_documents(
document_ids=document_ids,
documents=documents,
index_attempt_metadata=index_attempt_metadata,
doc_m_data_lookup=document_metadata_lookup,
db_session=db_session,
)

View File

@@ -1,5 +1,6 @@
import abc
from dataclasses import dataclass
from datetime import datetime
from typing import Any
from uuid import UUID
@@ -24,6 +25,11 @@ class DocumentMetadata:
document_id: str
semantic_identifier: str
first_link: str
doc_updated_at: datetime | None = None
# Emails, not necessarily attached to users
# Users may not be in Danswer
primary_owners: list[str] | None = None
secondary_owners: list[str] | None = None
@dataclass

View File

@@ -71,6 +71,15 @@ schema danswer_chunk {
distance-metric: angular
}
}
field doc_updated_at type int {
indexing: summary | attribute
}
field primary_owners type array<string> {
indexing : summary | attribute
}
field secondary_owners type array<string> {
indexing : summary | attribute
}
field access_control_list type weightedset<string> {
indexing: summary | attribute
attribute: fast-search
@@ -85,41 +94,72 @@ schema danswer_chunk {
fields: content, title
}
rank-profile keyword_search inherits default {
rank-profile default_rank {
inputs {
query(decay_factor) float
}
function inline document_boost() {
# 0 to 2x score following sigmoid function stretched out by factor of 3
# meaning requires 3x the number of feedback votes to have default sigmoid effect
expression: 2 / (1 + exp(-attribute(boost) / 3))
}
function inline document_age() {
# Time in years (3 Months if no age found)
expression: max(if(isNan(attribute(doc_updated_at)) == 1, 7890000, now() - attribute(doc_updated_at)) / 31536000, 0)
}
function inline recency_bias() {
# Cap the loss at 50% score reduction
expression: max(1 / (1 + query(decay_factor) * document_age), 0.5)
}
match-features: recency_bias
}
rank-profile keyword_search inherits default, default_rank {
first-phase {
expression: bm25(content) * (2 / (1 + exp(-attribute(boost) / 3)))
expression: bm25(content) * document_boost * recency_bias
}
}
rank-profile semantic_search inherits default {
rank-profile semantic_search inherits default, default_rank {
inputs {
query(query_embedding) tensor<float>(x[384])
}
first-phase {
# Cannot do boost with the chosen embedding model because of high default similarity
# This depends on the embedding model chosen
expression: closeness(field, embeddings)
}
match-features: closest(embeddings)
match-features: recency_bias closest(embeddings)
}
rank-profile hybrid_search inherits default {
# TODO this isn't used and needs to be reworked
rank-profile hybrid_search inherits default, default_rank {
inputs {
query(query_embedding) tensor<float>(x[384])
}
first-phase {
expression: bm25(content) * (2 / (1 + exp(-attribute(boost) / 3)))
expression: bm25(content) * document_boost * recency_bias
}
second-phase {
# Cannot do boost with the chosen embedding model because of high default similarity
expression: closeness(field, embeddings)
}
match-features: closest(embeddings)
match-features: recency_bias closest(embeddings)
}
# used when searching from the admin UI for a specific doc to hide / boost
rank-profile admin_search inherits default {
first-phase {
expression: bm25(content) + (100 * bm25(title))
expression: bm25(content) + (5 * bm25(title))
}
}
}
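To make the new default_rank functions concrete, an illustrative Python transcription of the three expressions above (this is not how Vespa evaluates them, just the same arithmetic; 7890000 and 31536000 are seconds in roughly three months and one year):

import math
import time

THREE_MONTHS_S = 7_890_000   # fallback age when doc_updated_at is unset
ONE_YEAR_S = 31_536_000

def document_boost(boost: int) -> float:
    # 0 to 2x score, sigmoid stretched out by a factor of 3
    return 2 / (1 + math.exp(-boost / 3))

def document_age_years(doc_updated_at: int | None, now: float | None = None) -> float:
    now = time.time() if now is None else now
    age_seconds = THREE_MONTHS_S if doc_updated_at is None else now - doc_updated_at
    return max(age_seconds / ONE_YEAR_S, 0)

def recency_bias(age_years: float, decay_factor: float) -> float:
    # Cap the loss at a 50% score reduction
    return max(1 / (1 + decay_factor * age_years), 0.5)

# Keyword first phase, mirroring `bm25(content) * document_boost * recency_bias`
def keyword_first_phase(
    bm25_content: float, boost: int, doc_updated_at: int | None, decay: float
) -> float:
    return (
        bm25_content
        * document_boost(boost)
        * recency_bias(document_age_years(doc_updated_at), decay)
    )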

View File

@@ -14,6 +14,7 @@ from requests import Response
from danswer.chunking.models import DocMetadataAwareIndexChunk
from danswer.chunking.models import InferenceChunk
from danswer.configs.app_configs import DOC_TIME_DECAY
from danswer.configs.app_configs import DOCUMENT_INDEX_NAME
from danswer.configs.app_configs import EDIT_KEYWORD_QUERY
from danswer.configs.app_configs import NUM_RETURNED_HITS
@@ -27,13 +28,17 @@ from danswer.configs.constants import BOOST
from danswer.configs.constants import CHUNK_ID
from danswer.configs.constants import CONTENT
from danswer.configs.constants import DEFAULT_BOOST
from danswer.configs.constants import DOC_UPDATED_AT
from danswer.configs.constants import DOCUMENT_ID
from danswer.configs.constants import DOCUMENT_SETS
from danswer.configs.constants import EMBEDDINGS
from danswer.configs.constants import HIDDEN
from danswer.configs.constants import MATCH_HIGHLIGHTS
from danswer.configs.constants import METADATA
from danswer.configs.constants import PRIMARY_OWNERS
from danswer.configs.constants import RECENCY_BIAS
from danswer.configs.constants import SCORE
from danswer.configs.constants import SECONDARY_OWNERS
from danswer.configs.constants import SECTION_CONTINUATION
from danswer.configs.constants import SEMANTIC_IDENTIFIER
from danswer.configs.constants import SOURCE_LINKS
@@ -41,6 +46,7 @@ from danswer.configs.constants import SOURCE_TYPE
from danswer.configs.constants import TITLE
from danswer.configs.model_configs import SEARCH_DISTANCE_CUTOFF
from danswer.datastores.datastore_utils import get_uuid_from_chunk
from danswer.datastores.datastore_utils import translate_to_epoch_seconds_ensure_tz
from danswer.datastores.interfaces import DocumentIndex
from danswer.datastores.interfaces import DocumentInsertionRecord
from danswer.datastores.interfaces import IndexFilter
@@ -172,6 +178,9 @@ def _index_vespa_chunk(
METADATA: json.dumps(document.metadata),
EMBEDDINGS: embeddings_name_vector_map,
BOOST: DEFAULT_BOOST,
DOC_UPDATED_AT: translate_to_epoch_seconds_ensure_tz(document.doc_updated_at),
PRIMARY_OWNERS: document.primary_owners,
SECONDARY_OWNERS: document.secondary_owners,
# the only `set` vespa has is `weightedset`, so we have to give each
# element an arbitrary weight
ACCESS_CONTROL_LIST: {acl_entry: 1 for acl_entry in chunk.access.to_acl()},
@@ -363,6 +372,7 @@ def _query_vespa(query_params: Mapping[str, str | int]) -> list[InferenceChunk]:
InferenceChunk.from_dict(
dict(
hit["fields"],
**{RECENCY_BIAS: hit["fields"]["matchfeatures"][RECENCY_BIAS]},
**{SCORE: hit["relevance"]},
**{
MATCH_HIGHLIGHTS: _process_dynamic_summary(
@@ -395,7 +405,8 @@ class VespaIndex(DocumentIndex):
f"{SECTION_CONTINUATION}, "
f"{BOOST}, "
f"{HIDDEN}, "
f"{METADATA} "
f"{DOC_UPDATED_AT}, "
f"{METADATA}, "
f"{CONTENT_SUMMARY} "
f"from {DOCUMENT_INDEX_NAME} where "
)
@@ -538,6 +549,7 @@ class VespaIndex(DocumentIndex):
params: dict[str, str | int] = {
"yql": yql,
"query": query,
"input.query(decay_factor)": str(DOC_TIME_DECAY),
"hits": num_to_retrieve,
"num_to_rerank": 10 * num_to_retrieve,
"ranking.profile": "keyword_search",
@@ -575,6 +587,7 @@ class VespaIndex(DocumentIndex):
"yql": yql,
"query": query_keywords,
"input.query(query_embedding)": str(query_embedding),
"input.query(decay_factor)": str(DOC_TIME_DECAY),
"ranking.profile": "semantic_search",
}
@@ -606,6 +619,7 @@ class VespaIndex(DocumentIndex):
"yql": yql,
"query": query,
"input.query(query_embedding)": str(query_embedding),
"input.query(decay_factor)": str(DOC_TIME_DECAY),
"ranking.profile": "hybrid_search",
}
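On the read path, the per-document recency multiplier comes back through Vespa's match-features rather than as a schema field. A minimal sketch of the hit handling, with a made-up hit whose shape matches what _query_vespa consumes (all numbers are invented):

# Hypothetical hit, trimmed to the fields used above.
hit = {
    "relevance": 0.8,
    "fields": {
        "matchfeatures": {"recency_bias": 0.71},
        # ... plus the summary fields requested in the yql above ...
    },
}

fields = dict(
    hit["fields"],
    recency_bias=hit["fields"]["matchfeatures"]["recency_bias"],
    score=hit["relevance"],
)
# `fields` now carries everything InferenceChunk.from_dict needs, including the
# recency multiplier Vespa computed from the query's decay_factor input.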

View File

@@ -158,6 +158,9 @@ def upsert_documents(
hidden=False,
semantic_id=doc.semantic_identifier,
link=doc.first_link,
doc_updated_at=doc.doc_updated_at,
primary_owners=doc.primary_owners,
secondary_owners=doc.secondary_owners,
)
)
for doc in seen_documents.values()

View File

@@ -398,6 +398,19 @@ class Document(Base):
semantic_id: Mapped[str] = mapped_column(String)
# First Section's link
link: Mapped[str | None] = mapped_column(String, nullable=True)
doc_updated_at: Mapped[datetime.datetime | None] = mapped_column(
DateTime(timezone=True), nullable=True
)
# The following are not attached to User because the account/email may not be known
# within Danswer
# Something like the document creator
primary_owners: Mapped[list[str] | None] = mapped_column(
postgresql.ARRAY(String), nullable=True
)
# Something like assignee or space owner
secondary_owners: Mapped[list[str] | None] = mapped_column(
postgresql.ARRAY(String), nullable=True
)
# TODO if more sensitive data is added here for display, make sure to add user/group permission
retrieval_feedbacks: Mapped[List[DocumentRetrievalFeedback]] = relationship(

View File

@@ -86,7 +86,8 @@ def semantic_reranking(
) / len(sim_scores)
boosts = [translate_boost_count_to_multiplier(chunk.boost) for chunk in chunks]
boosted_sim_scores = shifted_sim_scores * boosts
recency_multiplier = [chunk.recency_bias for chunk in chunks]
boosted_sim_scores = shifted_sim_scores * boosts * recency_multiplier
normalized_b_s_scores = (boosted_sim_scores + cross_models_min - model_min) / (
model_max - model_min
)
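The reranker now scales the cross-encoder similarities elementwise by both the feedback boost and the recency multiplier; a small numpy illustration with invented numbers:

import numpy as np

shifted_sim_scores = np.array([0.9, 0.8, 0.7])
boosts = np.array([1.0, 1.2, 1.0])               # translate_boost_count_to_multiplier output
recency_multiplier = np.array([1.0, 0.55, 0.5])  # chunk.recency_bias per chunk

boosted_sim_scores = shifted_sim_scores * boosts * recency_multiplier
# approx [0.9, 0.528, 0.35] -- a fresh chunk can now outrank a slightly
# better-matching but stale one.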

View File

@@ -84,6 +84,9 @@ class TestQAPostprocessing(unittest.TestCase):
"Answer: Air Bud was a movie about dogs and quote: people loved it",
)
@unittest.skip(
"Using fuzzy match is too slow anyway, doesn't matter if it's broken"
)
def test_fuzzy_match_quotes_to_docs(self) -> None:
chunk_0_text = textwrap.dedent(
"""
@@ -112,6 +115,7 @@ class TestQAPostprocessing(unittest.TestCase):
blurb="anything",
semantic_identifier="anything",
section_continuation=False,
recency_bias=1,
boost=0,
hidden=False,
score=1,
@@ -127,6 +131,7 @@ class TestQAPostprocessing(unittest.TestCase):
blurb="whatever",
semantic_identifier="whatever",
section_continuation=False,
recency_bias=1,
boost=0,
hidden=False,
score=1,

View File

@@ -82,6 +82,8 @@ services:
- DANSWER_BOT_DISPLAY_ERROR_MSGS=${DANSWER_BOT_DISPLAY_ERROR_MSGS:-}
- DANSWER_BOT_RESPOND_EVERY_CHANNEL=${DANSWER_BOT_RESPOND_EVERY_CHANNEL:-}
- NOTIFY_SLACKBOT_NO_ANSWER=${NOTIFY_SLACKBOT_NO_ANSWER:-}
# Recency Bias for search results, decay at 1 / (1 + DOC_TIME_DECAY * x years)
- DOC_TIME_DECAY=${DOC_TIME_DECAY:-}
# Don't change the NLP model configs unless you know what you're doing
- DOCUMENT_ENCODER_MODEL=${DOCUMENT_ENCODER_MODEL:-}
- NORMALIZE_EMBEDDINGS=${NORMALIZE_EMBEDDINGS:-}
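Leaving DOC_TIME_DECAY unset keeps the backend default of 0.5, which reaches the 0.5 score floor at roughly two years of document age; a larger value (for example a hypothetical DOC_TIME_DECAY=1.0) reaches that floor after one year, and DOC_TIME_DECAY=0 disables the recency bias altogether.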