mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-07-08 13:40:46 +02:00
add user files (#4152)
This commit is contained in:
@ -104,6 +104,16 @@ class VespaDocumentFields:
|
||||
aggregated_chunk_boost_factor: float | None = None
|
||||
|
||||
|
||||
@dataclass
|
||||
class VespaDocumentUserFields:
|
||||
"""
|
||||
Fields that are specific to the user who is indexing the document.
|
||||
"""
|
||||
|
||||
user_file_id: str | None = None
|
||||
user_folder_id: str | None = None
|
||||
|
||||
|
||||
@dataclass
|
||||
class UpdateRequest:
|
||||
"""
|
||||
@ -258,7 +268,8 @@ class Updatable(abc.ABC):
|
||||
*,
|
||||
tenant_id: str,
|
||||
chunk_count: int | None,
|
||||
fields: VespaDocumentFields,
|
||||
fields: VespaDocumentFields | None,
|
||||
user_fields: VespaDocumentUserFields | None,
|
||||
) -> int:
|
||||
"""
|
||||
Updates all chunks for a document with the specified fields.
|
||||
|
@ -120,12 +120,22 @@ schema DANSWER_CHUNK_NAME {
|
||||
indexing: summary | attribute
|
||||
rank: filter
|
||||
attribute: fast-search
|
||||
}
|
||||
}
|
||||
field document_sets type weightedset<string> {
|
||||
indexing: summary | attribute
|
||||
rank: filter
|
||||
attribute: fast-search
|
||||
}
|
||||
field user_file type int {
|
||||
indexing: summary | attribute
|
||||
rank: filter
|
||||
attribute: fast-search
|
||||
}
|
||||
field user_folder type int {
|
||||
indexing: summary | attribute
|
||||
rank: filter
|
||||
attribute: fast-search
|
||||
}
|
||||
}
|
||||
|
||||
# If using different tokenization settings, the fieldset has to be removed, and the field must
|
||||
|
@ -36,6 +36,7 @@ from onyx.document_index.interfaces import MinimalDocumentIndexingInfo
|
||||
from onyx.document_index.interfaces import UpdateRequest
|
||||
from onyx.document_index.interfaces import VespaChunkRequest
|
||||
from onyx.document_index.interfaces import VespaDocumentFields
|
||||
from onyx.document_index.interfaces import VespaDocumentUserFields
|
||||
from onyx.document_index.vespa.chunk_retrieval import batch_search_api_retrieval
|
||||
from onyx.document_index.vespa.chunk_retrieval import (
|
||||
parallel_visit_api_retrieval,
|
||||
@ -70,6 +71,8 @@ from onyx.document_index.vespa_constants import NUM_THREADS
|
||||
from onyx.document_index.vespa_constants import SEARCH_THREAD_NUMBER_PAT
|
||||
from onyx.document_index.vespa_constants import TENANT_ID_PAT
|
||||
from onyx.document_index.vespa_constants import TENANT_ID_REPLACEMENT
|
||||
from onyx.document_index.vespa_constants import USER_FILE
|
||||
from onyx.document_index.vespa_constants import USER_FOLDER
|
||||
from onyx.document_index.vespa_constants import VESPA_APPLICATION_ENDPOINT
|
||||
from onyx.document_index.vespa_constants import VESPA_DIM_REPLACEMENT_PAT
|
||||
from onyx.document_index.vespa_constants import VESPA_TIMEOUT
|
||||
@ -592,7 +595,8 @@ class VespaIndex(DocumentIndex):
|
||||
self,
|
||||
doc_chunk_id: UUID,
|
||||
index_name: str,
|
||||
fields: VespaDocumentFields,
|
||||
fields: VespaDocumentFields | None,
|
||||
user_fields: VespaDocumentUserFields | None,
|
||||
doc_id: str,
|
||||
http_client: httpx.Client,
|
||||
) -> None:
|
||||
@ -603,21 +607,31 @@ class VespaIndex(DocumentIndex):
|
||||
|
||||
update_dict: dict[str, dict] = {"fields": {}}
|
||||
|
||||
if fields.boost is not None:
|
||||
update_dict["fields"][BOOST] = {"assign": fields.boost}
|
||||
if fields is not None:
|
||||
if fields.boost is not None:
|
||||
update_dict["fields"][BOOST] = {"assign": fields.boost}
|
||||
|
||||
if fields.document_sets is not None:
|
||||
update_dict["fields"][DOCUMENT_SETS] = {
|
||||
"assign": {document_set: 1 for document_set in fields.document_sets}
|
||||
}
|
||||
if fields.document_sets is not None:
|
||||
update_dict["fields"][DOCUMENT_SETS] = {
|
||||
"assign": {document_set: 1 for document_set in fields.document_sets}
|
||||
}
|
||||
|
||||
if fields.access is not None:
|
||||
update_dict["fields"][ACCESS_CONTROL_LIST] = {
|
||||
"assign": {acl_entry: 1 for acl_entry in fields.access.to_acl()}
|
||||
}
|
||||
if fields.access is not None:
|
||||
update_dict["fields"][ACCESS_CONTROL_LIST] = {
|
||||
"assign": {acl_entry: 1 for acl_entry in fields.access.to_acl()}
|
||||
}
|
||||
|
||||
if fields.hidden is not None:
|
||||
update_dict["fields"][HIDDEN] = {"assign": fields.hidden}
|
||||
if fields.hidden is not None:
|
||||
update_dict["fields"][HIDDEN] = {"assign": fields.hidden}
|
||||
|
||||
if user_fields is not None:
|
||||
if user_fields.user_file_id is not None:
|
||||
update_dict["fields"][USER_FILE] = {"assign": user_fields.user_file_id}
|
||||
|
||||
if user_fields.user_folder_id is not None:
|
||||
update_dict["fields"][USER_FOLDER] = {
|
||||
"assign": user_fields.user_folder_id
|
||||
}
|
||||
|
||||
if not update_dict["fields"]:
|
||||
logger.error("Update request received but nothing to update.")
|
||||
@ -649,7 +663,8 @@ class VespaIndex(DocumentIndex):
|
||||
*,
|
||||
chunk_count: int | None,
|
||||
tenant_id: str,
|
||||
fields: VespaDocumentFields,
|
||||
fields: VespaDocumentFields | None,
|
||||
user_fields: VespaDocumentUserFields | None,
|
||||
) -> int:
|
||||
"""Note: if the document id does not exist, the update will be a no-op and the
|
||||
function will complete with no errors or exceptions.
|
||||
@ -682,7 +697,12 @@ class VespaIndex(DocumentIndex):
|
||||
|
||||
for doc_chunk_id in doc_chunk_ids:
|
||||
self._update_single_chunk(
|
||||
doc_chunk_id, index_name, fields, doc_id, httpx_client
|
||||
doc_chunk_id,
|
||||
index_name,
|
||||
fields,
|
||||
user_fields,
|
||||
doc_id,
|
||||
httpx_client,
|
||||
)
|
||||
|
||||
return doc_chunk_count
|
||||
@ -723,6 +743,7 @@ class VespaIndex(DocumentIndex):
|
||||
tenant_id=tenant_id,
|
||||
large_chunks_enabled=large_chunks_enabled,
|
||||
)
|
||||
|
||||
for doc_chunk_ids_batch in batch_generator(
|
||||
chunks_to_delete, BATCH_SIZE
|
||||
):
|
||||
|
@ -51,6 +51,8 @@ from onyx.document_index.vespa_constants import SOURCE_TYPE
|
||||
from onyx.document_index.vespa_constants import TENANT_ID
|
||||
from onyx.document_index.vespa_constants import TITLE
|
||||
from onyx.document_index.vespa_constants import TITLE_EMBEDDING
|
||||
from onyx.document_index.vespa_constants import USER_FILE
|
||||
from onyx.document_index.vespa_constants import USER_FOLDER
|
||||
from onyx.indexing.models import DocMetadataAwareIndexChunk
|
||||
from onyx.utils.logger import setup_logger
|
||||
|
||||
@ -205,6 +207,8 @@ def _index_vespa_chunk(
|
||||
ACCESS_CONTROL_LIST: {acl_entry: 1 for acl_entry in chunk.access.to_acl()},
|
||||
DOCUMENT_SETS: {document_set: 1 for document_set in chunk.document_sets},
|
||||
IMAGE_FILE_NAME: chunk.image_file_name,
|
||||
USER_FILE: chunk.user_file if chunk.user_file is not None else None,
|
||||
USER_FOLDER: chunk.user_folder if chunk.user_folder is not None else None,
|
||||
BOOST: chunk.boost,
|
||||
AGGREGATED_CHUNK_BOOST_FACTOR: chunk.aggregated_chunk_boost_factor,
|
||||
}
|
||||
|
@ -14,6 +14,8 @@ from onyx.document_index.vespa_constants import HIDDEN
|
||||
from onyx.document_index.vespa_constants import METADATA_LIST
|
||||
from onyx.document_index.vespa_constants import SOURCE_TYPE
|
||||
from onyx.document_index.vespa_constants import TENANT_ID
|
||||
from onyx.document_index.vespa_constants import USER_FILE
|
||||
from onyx.document_index.vespa_constants import USER_FOLDER
|
||||
from onyx.utils.logger import setup_logger
|
||||
from shared_configs.configs import MULTI_TENANT
|
||||
|
||||
@ -27,14 +29,26 @@ def build_vespa_filters(
|
||||
remove_trailing_and: bool = False, # Set to True when using as a complete Vespa query
|
||||
) -> str:
|
||||
def _build_or_filters(key: str, vals: list[str] | None) -> str:
|
||||
if vals is None:
|
||||
"""For string-based 'contains' filters, e.g. WSET fields or array<string> fields."""
|
||||
if not key or not vals:
|
||||
return ""
|
||||
eq_elems = [f'{key} contains "{val}"' for val in vals if val]
|
||||
if not eq_elems:
|
||||
return ""
|
||||
or_clause = " or ".join(eq_elems)
|
||||
return f"({or_clause}) and "
|
||||
|
||||
def _build_int_or_filters(key: str, vals: list[int] | None) -> str:
|
||||
"""
|
||||
For an integer field filter.
|
||||
If vals is not None, we want *only* docs whose key matches one of vals.
|
||||
"""
|
||||
# If `vals` is None => skip the filter entirely
|
||||
if vals is None or not vals:
|
||||
return ""
|
||||
|
||||
valid_vals = [val for val in vals if val]
|
||||
if not key or not valid_vals:
|
||||
return ""
|
||||
|
||||
eq_elems = [f'{key} contains "{elem}"' for elem in valid_vals]
|
||||
# Otherwise build the OR filter
|
||||
eq_elems = [f"{key} = {val}" for val in vals]
|
||||
or_clause = " or ".join(eq_elems)
|
||||
result = f"({or_clause}) and "
|
||||
|
||||
@ -42,53 +56,59 @@ def build_vespa_filters(
|
||||
|
||||
def _build_time_filter(
|
||||
cutoff: datetime | None,
|
||||
# Slightly over 3 Months, approximately 1 fiscal quarter
|
||||
untimed_doc_cutoff: timedelta = timedelta(days=92),
|
||||
) -> str:
|
||||
if not cutoff:
|
||||
return ""
|
||||
|
||||
# For Documents that don't have an updated at, filter them out for queries asking for
|
||||
# very recent documents (3 months) default. Documents that don't have an updated at
|
||||
# time are assigned 3 months for time decay value
|
||||
include_untimed = datetime.now(timezone.utc) - untimed_doc_cutoff > cutoff
|
||||
cutoff_secs = int(cutoff.timestamp())
|
||||
|
||||
if include_untimed:
|
||||
# Documents without updated_at are assigned -1 as their date
|
||||
return f"!({DOC_UPDATED_AT} < {cutoff_secs}) and "
|
||||
|
||||
return f"({DOC_UPDATED_AT} >= {cutoff_secs}) and "
|
||||
|
||||
# Start building the filter string
|
||||
filter_str = f"!({HIDDEN}=true) and " if not include_hidden else ""
|
||||
|
||||
# If running in multi-tenant mode, we may want to filter by tenant_id
|
||||
# If running in multi-tenant mode
|
||||
if filters.tenant_id and MULTI_TENANT:
|
||||
filter_str += f'({TENANT_ID} contains "{filters.tenant_id}") and '
|
||||
|
||||
# CAREFUL touching this one, currently there is no second ACL double-check post retrieval
|
||||
# ACL filters
|
||||
if filters.access_control_list is not None:
|
||||
filter_str += _build_or_filters(
|
||||
ACCESS_CONTROL_LIST, filters.access_control_list
|
||||
)
|
||||
|
||||
# Source type filters
|
||||
source_strs = (
|
||||
[s.value for s in filters.source_type] if filters.source_type else None
|
||||
)
|
||||
filter_str += _build_or_filters(SOURCE_TYPE, source_strs)
|
||||
|
||||
# Tag filters
|
||||
tag_attributes = None
|
||||
tags = filters.tags
|
||||
if tags:
|
||||
tag_attributes = [tag.tag_key + INDEX_SEPARATOR + tag.tag_value for tag in tags]
|
||||
if filters.tags:
|
||||
# build e.g. "tag_key|tag_value"
|
||||
tag_attributes = [
|
||||
f"{tag.tag_key}{INDEX_SEPARATOR}{tag.tag_value}" for tag in filters.tags
|
||||
]
|
||||
filter_str += _build_or_filters(METADATA_LIST, tag_attributes)
|
||||
|
||||
# Document sets
|
||||
filter_str += _build_or_filters(DOCUMENT_SETS, filters.document_set)
|
||||
|
||||
# New: user_file_ids as integer filters
|
||||
filter_str += _build_int_or_filters(USER_FILE, filters.user_file_ids)
|
||||
|
||||
filter_str += _build_int_or_filters(USER_FOLDER, filters.user_folder_ids)
|
||||
|
||||
# Time filter
|
||||
filter_str += _build_time_filter(filters.time_cutoff)
|
||||
|
||||
# Trim trailing " and "
|
||||
if remove_trailing_and and filter_str.endswith(" and "):
|
||||
filter_str = filter_str[:-5] # We remove the trailing " and "
|
||||
filter_str = filter_str[:-5]
|
||||
|
||||
return filter_str
|
||||
|
||||
|
@ -67,6 +67,8 @@ EMBEDDINGS = "embeddings"
|
||||
TITLE_EMBEDDING = "title_embedding"
|
||||
ACCESS_CONTROL_LIST = "access_control_list"
|
||||
DOCUMENT_SETS = "document_sets"
|
||||
USER_FILE = "user_file"
|
||||
USER_FOLDER = "user_folder"
|
||||
LARGE_CHUNK_REFERENCE_IDS = "large_chunk_reference_ids"
|
||||
METADATA = "metadata"
|
||||
METADATA_LIST = "metadata_list"
|
||||
|
Reference in New Issue
Block a user