add user files (#4152)

This commit is contained in:
pablonyx
2025-03-31 14:06:59 -07:00
committed by Weves
parent ccd372cc4a
commit 3a3b2a2f8d
166 changed files with 12892 additions and 1048 deletions

View File

@ -104,6 +104,16 @@ class VespaDocumentFields:
aggregated_chunk_boost_factor: float | None = None
@dataclass
class VespaDocumentUserFields:
"""
Fields that are specific to the user who is indexing the document.

Both fields default to None. When used in a chunk update, only the fields that
are not None are assigned in Vespa (as `user_file` / `user_folder`).
"""
# NOTE(review): annotated as str, but the Vespa schema declares `user_file` as int
# and the query filters take list[int] -- confirm callers pass numeric values.
user_file_id: str | None = None
# NOTE(review): same str-vs-int question as user_file_id (schema: `user_folder` int).
user_folder_id: str | None = None
@dataclass
class UpdateRequest:
"""
@ -258,7 +268,8 @@ class Updatable(abc.ABC):
*,
tenant_id: str,
chunk_count: int | None,
fields: VespaDocumentFields,
fields: VespaDocumentFields | None,
user_fields: VespaDocumentUserFields | None,
) -> int:
"""
Updates all chunks for a document with the specified fields.

View File

@ -120,12 +120,22 @@ schema DANSWER_CHUNK_NAME {
indexing: summary | attribute
rank: filter
attribute: fast-search
}
}
# Names of the document sets this chunk belongs to. Indexing writes each name
# with weight 1, and query filters match entries with `contains`.
field document_sets type weightedset<string> {
indexing: summary | attribute
rank: filter
attribute: fast-search
}
# Integer ID of the user file associated with this chunk. Kept as an attribute
# so queries can filter on it with `user_file = <id>` clauses.
field user_file type int {
indexing: summary | attribute
rank: filter
attribute: fast-search
}
# Integer ID of the user folder associated with this chunk. Kept as an attribute
# so queries can filter on it with `user_folder = <id>` clauses.
field user_folder type int {
indexing: summary | attribute
rank: filter
attribute: fast-search
}
}
# If using different tokenization settings, the fieldset has to be removed, and the field must

View File

@ -36,6 +36,7 @@ from onyx.document_index.interfaces import MinimalDocumentIndexingInfo
from onyx.document_index.interfaces import UpdateRequest
from onyx.document_index.interfaces import VespaChunkRequest
from onyx.document_index.interfaces import VespaDocumentFields
from onyx.document_index.interfaces import VespaDocumentUserFields
from onyx.document_index.vespa.chunk_retrieval import batch_search_api_retrieval
from onyx.document_index.vespa.chunk_retrieval import (
parallel_visit_api_retrieval,
@ -70,6 +71,8 @@ from onyx.document_index.vespa_constants import NUM_THREADS
from onyx.document_index.vespa_constants import SEARCH_THREAD_NUMBER_PAT
from onyx.document_index.vespa_constants import TENANT_ID_PAT
from onyx.document_index.vespa_constants import TENANT_ID_REPLACEMENT
from onyx.document_index.vespa_constants import USER_FILE
from onyx.document_index.vespa_constants import USER_FOLDER
from onyx.document_index.vespa_constants import VESPA_APPLICATION_ENDPOINT
from onyx.document_index.vespa_constants import VESPA_DIM_REPLACEMENT_PAT
from onyx.document_index.vespa_constants import VESPA_TIMEOUT
@ -592,7 +595,8 @@ class VespaIndex(DocumentIndex):
self,
doc_chunk_id: UUID,
index_name: str,
fields: VespaDocumentFields,
fields: VespaDocumentFields | None,
user_fields: VespaDocumentUserFields | None,
doc_id: str,
http_client: httpx.Client,
) -> None:
@ -603,21 +607,31 @@ class VespaIndex(DocumentIndex):
update_dict: dict[str, dict] = {"fields": {}}
if fields.boost is not None:
update_dict["fields"][BOOST] = {"assign": fields.boost}
if fields is not None:
if fields.boost is not None:
update_dict["fields"][BOOST] = {"assign": fields.boost}
if fields.document_sets is not None:
update_dict["fields"][DOCUMENT_SETS] = {
"assign": {document_set: 1 for document_set in fields.document_sets}
}
if fields.document_sets is not None:
update_dict["fields"][DOCUMENT_SETS] = {
"assign": {document_set: 1 for document_set in fields.document_sets}
}
if fields.access is not None:
update_dict["fields"][ACCESS_CONTROL_LIST] = {
"assign": {acl_entry: 1 for acl_entry in fields.access.to_acl()}
}
if fields.access is not None:
update_dict["fields"][ACCESS_CONTROL_LIST] = {
"assign": {acl_entry: 1 for acl_entry in fields.access.to_acl()}
}
if fields.hidden is not None:
update_dict["fields"][HIDDEN] = {"assign": fields.hidden}
if fields.hidden is not None:
update_dict["fields"][HIDDEN] = {"assign": fields.hidden}
if user_fields is not None:
if user_fields.user_file_id is not None:
update_dict["fields"][USER_FILE] = {"assign": user_fields.user_file_id}
if user_fields.user_folder_id is not None:
update_dict["fields"][USER_FOLDER] = {
"assign": user_fields.user_folder_id
}
if not update_dict["fields"]:
logger.error("Update request received but nothing to update.")
@ -649,7 +663,8 @@ class VespaIndex(DocumentIndex):
*,
chunk_count: int | None,
tenant_id: str,
fields: VespaDocumentFields,
fields: VespaDocumentFields | None,
user_fields: VespaDocumentUserFields | None,
) -> int:
"""Note: if the document id does not exist, the update will be a no-op and the
function will complete with no errors or exceptions.
@ -682,7 +697,12 @@ class VespaIndex(DocumentIndex):
for doc_chunk_id in doc_chunk_ids:
self._update_single_chunk(
doc_chunk_id, index_name, fields, doc_id, httpx_client
doc_chunk_id,
index_name,
fields,
user_fields,
doc_id,
httpx_client,
)
return doc_chunk_count
@ -723,6 +743,7 @@ class VespaIndex(DocumentIndex):
tenant_id=tenant_id,
large_chunks_enabled=large_chunks_enabled,
)
for doc_chunk_ids_batch in batch_generator(
chunks_to_delete, BATCH_SIZE
):

View File

@ -51,6 +51,8 @@ from onyx.document_index.vespa_constants import SOURCE_TYPE
from onyx.document_index.vespa_constants import TENANT_ID
from onyx.document_index.vespa_constants import TITLE
from onyx.document_index.vespa_constants import TITLE_EMBEDDING
from onyx.document_index.vespa_constants import USER_FILE
from onyx.document_index.vespa_constants import USER_FOLDER
from onyx.indexing.models import DocMetadataAwareIndexChunk
from onyx.utils.logger import setup_logger
@ -205,6 +207,8 @@ def _index_vespa_chunk(
ACCESS_CONTROL_LIST: {acl_entry: 1 for acl_entry in chunk.access.to_acl()},
DOCUMENT_SETS: {document_set: 1 for document_set in chunk.document_sets},
IMAGE_FILE_NAME: chunk.image_file_name,
USER_FILE: chunk.user_file if chunk.user_file is not None else None,
USER_FOLDER: chunk.user_folder if chunk.user_folder is not None else None,
BOOST: chunk.boost,
AGGREGATED_CHUNK_BOOST_FACTOR: chunk.aggregated_chunk_boost_factor,
}

View File

@ -14,6 +14,8 @@ from onyx.document_index.vespa_constants import HIDDEN
from onyx.document_index.vespa_constants import METADATA_LIST
from onyx.document_index.vespa_constants import SOURCE_TYPE
from onyx.document_index.vespa_constants import TENANT_ID
from onyx.document_index.vespa_constants import USER_FILE
from onyx.document_index.vespa_constants import USER_FOLDER
from onyx.utils.logger import setup_logger
from shared_configs.configs import MULTI_TENANT
@ -27,14 +29,26 @@ def build_vespa_filters(
remove_trailing_and: bool = False, # Set to True when using as a complete Vespa query
) -> str:
def _build_or_filters(key: str, vals: list[str] | None) -> str:
if vals is None:
"""For string-based 'contains' filters, e.g. WSET fields or array<string> fields."""
if not key or not vals:
return ""
eq_elems = [f'{key} contains "{val}"' for val in vals if val]
if not eq_elems:
return ""
or_clause = " or ".join(eq_elems)
return f"({or_clause}) and "
def _build_int_or_filters(key: str, vals: list[int] | None) -> str:
"""
For an integer field filter.
If vals is not None, we want *only* docs whose key matches one of vals.
"""
# If `vals` is None => skip the filter entirely
if vals is None or not vals:
return ""
valid_vals = [val for val in vals if val]
if not key or not valid_vals:
return ""
eq_elems = [f'{key} contains "{elem}"' for elem in valid_vals]
# Otherwise build the OR filter
eq_elems = [f"{key} = {val}" for val in vals]
or_clause = " or ".join(eq_elems)
result = f"({or_clause}) and "
@ -42,53 +56,59 @@ def build_vespa_filters(
def _build_time_filter(
    cutoff: datetime | None,
    # Slightly over 3 Months, approximately 1 fiscal quarter
    untimed_doc_cutoff: timedelta = timedelta(days=92),
) -> str:
    """Build the updated-at clause that limits results to documents at or after `cutoff`.

    Args:
        cutoff: Earliest update time to accept; a falsy value disables the filter.
        untimed_doc_cutoff: How far back `cutoff` must reach (relative to now, UTC)
            before the negated form of the comparison is used.

    Returns:
        "" when no cutoff is given, otherwise a clause ending in " and ".
    """
    if not cutoff:
        return ""

    # Per the original author's notes: documents without an updated-at time are
    # filtered out for queries asking for very recent documents (3 months by
    # default), and are assigned 3 months for their time decay value.
    untimed_horizon = datetime.now(timezone.utc) - untimed_doc_cutoff
    keep_untimed_docs = untimed_horizon > cutoff
    threshold_secs = int(cutoff.timestamp())

    if not keep_untimed_docs:
        return f"({DOC_UPDATED_AT} >= {threshold_secs}) and "

    # Documents without updated_at are assigned -1 as their date
    return f"!({DOC_UPDATED_AT} < {threshold_secs}) and "
# Start building the filter string
filter_str = f"!({HIDDEN}=true) and " if not include_hidden else ""
# If running in multi-tenant mode, we may want to filter by tenant_id
# If running in multi-tenant mode
if filters.tenant_id and MULTI_TENANT:
filter_str += f'({TENANT_ID} contains "{filters.tenant_id}") and '
# CAREFUL touching this one, currently there is no second ACL double-check post retrieval
# ACL filters
if filters.access_control_list is not None:
filter_str += _build_or_filters(
ACCESS_CONTROL_LIST, filters.access_control_list
)
# Source type filters
source_strs = (
[s.value for s in filters.source_type] if filters.source_type else None
)
filter_str += _build_or_filters(SOURCE_TYPE, source_strs)
# Tag filters
tag_attributes = None
tags = filters.tags
if tags:
tag_attributes = [tag.tag_key + INDEX_SEPARATOR + tag.tag_value for tag in tags]
if filters.tags:
# build e.g. "tag_key|tag_value"
tag_attributes = [
f"{tag.tag_key}{INDEX_SEPARATOR}{tag.tag_value}" for tag in filters.tags
]
filter_str += _build_or_filters(METADATA_LIST, tag_attributes)
# Document sets
filter_str += _build_or_filters(DOCUMENT_SETS, filters.document_set)
# User file and user folder IDs as integer filters
filter_str += _build_int_or_filters(USER_FILE, filters.user_file_ids)
filter_str += _build_int_or_filters(USER_FOLDER, filters.user_folder_ids)
# Time filter
filter_str += _build_time_filter(filters.time_cutoff)
# Trim trailing " and "
if remove_trailing_and and filter_str.endswith(" and "):
filter_str = filter_str[:-5] # We remove the trailing " and "
filter_str = filter_str[:-5]
return filter_str

View File

@ -67,6 +67,8 @@ EMBEDDINGS = "embeddings"
TITLE_EMBEDDING = "title_embedding"
ACCESS_CONTROL_LIST = "access_control_list"
DOCUMENT_SETS = "document_sets"
# Vespa field names for the user file / user folder IDs (declared as `int` fields
# in the chunk schema).
USER_FILE = "user_file"
USER_FOLDER = "user_folder"
LARGE_CHUNK_REFERENCE_IDS = "large_chunk_reference_ids"
METADATA = "metadata"
METADATA_LIST = "metadata_list"