add user files (#4152)

2025-07-08 13:40:46 +02:00 · 2025-03-31 14:06:59 -07:00
parent ccd372cc4a
commit 3a3b2a2f8d
166 changed files with 12892 additions and 1048 deletions
--- a/backend/onyx/document_index/interfaces.py
+++ b/backend/onyx/document_index/interfaces.py
@ -104,6 +104,16 @@ class VespaDocumentFields:
    aggregated_chunk_boost_factor: float | None = None


+@dataclass
+class VespaDocumentUserFields:
+    """
+    Optional user-file fields for a document update; a field left as None is skipped when the update is built.
+    """
+    # NOTE(review): annotated as str, but the Vespa schema declares user_file / user_folder as int - confirm.
+    user_file_id: str | None = None
+    user_folder_id: str | None = None
+
+
@dataclass
 class UpdateRequest:
    """
@ -258,7 +268,8 @@ class Updatable(abc.ABC):
        *,
        tenant_id: str,
        chunk_count: int | None,
-        fields: VespaDocumentFields,
+        fields: VespaDocumentFields | None,
+        user_fields: VespaDocumentUserFields | None,
    ) -> int:
        """
        Updates all chunks for a document with the specified fields.
--- a/backend/onyx/document_index/vespa/app_config/schemas/danswer_chunk.sd
+++ b/backend/onyx/document_index/vespa/app_config/schemas/danswer_chunk.sd
@ -120,12 +120,22 @@ schema DANSWER_CHUNK_NAME {
            indexing: summary | attribute
            rank: filter
            attribute: fast-search
-        }
+        } 
        field document_sets type weightedset<string> {
            indexing: summary | attribute
            rank: filter
            attribute: fast-search
        }
+        field user_file type int {
+            indexing: summary | attribute
+            rank: filter
+            attribute: fast-search
+        }
+        field user_folder type int {
+            indexing: summary | attribute
+            rank: filter
+            attribute: fast-search
+        }
    }

    # If using different tokenization settings, the fieldset has to be removed, and the field must
--- a/backend/onyx/document_index/vespa/index.py
+++ b/backend/onyx/document_index/vespa/index.py
@ -36,6 +36,7 @@ from onyx.document_index.interfaces import MinimalDocumentIndexingInfo
 from onyx.document_index.interfaces import UpdateRequest
 from onyx.document_index.interfaces import VespaChunkRequest
 from onyx.document_index.interfaces import VespaDocumentFields
+from onyx.document_index.interfaces import VespaDocumentUserFields
 from onyx.document_index.vespa.chunk_retrieval import batch_search_api_retrieval
 from onyx.document_index.vespa.chunk_retrieval import (
    parallel_visit_api_retrieval,
@ -70,6 +71,8 @@ from onyx.document_index.vespa_constants import NUM_THREADS
 from onyx.document_index.vespa_constants import SEARCH_THREAD_NUMBER_PAT
 from onyx.document_index.vespa_constants import TENANT_ID_PAT
 from onyx.document_index.vespa_constants import TENANT_ID_REPLACEMENT
+from onyx.document_index.vespa_constants import USER_FILE
+from onyx.document_index.vespa_constants import USER_FOLDER
 from onyx.document_index.vespa_constants import VESPA_APPLICATION_ENDPOINT
 from onyx.document_index.vespa_constants import VESPA_DIM_REPLACEMENT_PAT
 from onyx.document_index.vespa_constants import VESPA_TIMEOUT
@ -592,7 +595,8 @@ class VespaIndex(DocumentIndex):
        self,
        doc_chunk_id: UUID,
        index_name: str,
-        fields: VespaDocumentFields,
+        fields: VespaDocumentFields | None,
+        user_fields: VespaDocumentUserFields | None,
        doc_id: str,
        http_client: httpx.Client,
    ) -> None:
@ -603,21 +607,31 @@ class VespaIndex(DocumentIndex):

        update_dict: dict[str, dict] = {"fields": {}}

-        if fields.boost is not None:
-            update_dict["fields"][BOOST] = {"assign": fields.boost}
+        if fields is not None:
+            if fields.boost is not None:
+                update_dict["fields"][BOOST] = {"assign": fields.boost}

-        if fields.document_sets is not None:
-            update_dict["fields"][DOCUMENT_SETS] = {
-                "assign": {document_set: 1 for document_set in fields.document_sets}
-            }
+            if fields.document_sets is not None:
+                update_dict["fields"][DOCUMENT_SETS] = {
+                    "assign": {document_set: 1 for document_set in fields.document_sets}
+                }

-        if fields.access is not None:
-            update_dict["fields"][ACCESS_CONTROL_LIST] = {
-                "assign": {acl_entry: 1 for acl_entry in fields.access.to_acl()}
-            }
+            if fields.access is not None:
+                update_dict["fields"][ACCESS_CONTROL_LIST] = {
+                    "assign": {acl_entry: 1 for acl_entry in fields.access.to_acl()}
+                }

-        if fields.hidden is not None:
-            update_dict["fields"][HIDDEN] = {"assign": fields.hidden}
+            if fields.hidden is not None:
+                update_dict["fields"][HIDDEN] = {"assign": fields.hidden}
+
+        if user_fields is not None:
+            if user_fields.user_file_id is not None:
+                update_dict["fields"][USER_FILE] = {"assign": user_fields.user_file_id}
+
+            if user_fields.user_folder_id is not None:
+                update_dict["fields"][USER_FOLDER] = {
+                    "assign": user_fields.user_folder_id
+                }

        if not update_dict["fields"]:
            logger.error("Update request received but nothing to update.")
@ -649,7 +663,8 @@ class VespaIndex(DocumentIndex):
        *,
        chunk_count: int | None,
        tenant_id: str,
-        fields: VespaDocumentFields,
+        fields: VespaDocumentFields | None,
+        user_fields: VespaDocumentUserFields | None,
    ) -> int:
        """Note: if the document id does not exist, the update will be a no-op and the
        function will complete with no errors or exceptions.
@ -682,7 +697,12 @@ class VespaIndex(DocumentIndex):

                for doc_chunk_id in doc_chunk_ids:
                    self._update_single_chunk(
-                        doc_chunk_id, index_name, fields, doc_id, httpx_client
+                        doc_chunk_id,
+                        index_name,
+                        fields,
+                        user_fields,
+                        doc_id,
+                        httpx_client,
                    )

        return doc_chunk_count
@ -723,6 +743,7 @@ class VespaIndex(DocumentIndex):
                    tenant_id=tenant_id,
                    large_chunks_enabled=large_chunks_enabled,
                )
+
                for doc_chunk_ids_batch in batch_generator(
                    chunks_to_delete, BATCH_SIZE
                ):
--- a/backend/onyx/document_index/vespa/indexing_utils.py
+++ b/backend/onyx/document_index/vespa/indexing_utils.py
@ -51,6 +51,8 @@ from onyx.document_index.vespa_constants import SOURCE_TYPE
 from onyx.document_index.vespa_constants import TENANT_ID
 from onyx.document_index.vespa_constants import TITLE
 from onyx.document_index.vespa_constants import TITLE_EMBEDDING
+from onyx.document_index.vespa_constants import USER_FILE
+from onyx.document_index.vespa_constants import USER_FOLDER
 from onyx.indexing.models import DocMetadataAwareIndexChunk
 from onyx.utils.logger import setup_logger

@ -205,6 +207,8 @@ def _index_vespa_chunk(
        ACCESS_CONTROL_LIST: {acl_entry: 1 for acl_entry in chunk.access.to_acl()},
        DOCUMENT_SETS: {document_set: 1 for document_set in chunk.document_sets},
        IMAGE_FILE_NAME: chunk.image_file_name,
+        USER_FILE: chunk.user_file if chunk.user_file is not None else None,
+        USER_FOLDER: chunk.user_folder if chunk.user_folder is not None else None,
        BOOST: chunk.boost,
        AGGREGATED_CHUNK_BOOST_FACTOR: chunk.aggregated_chunk_boost_factor,
    }
--- a/backend/onyx/document_index/vespa/shared_utils/vespa_request_builders.py
+++ b/backend/onyx/document_index/vespa/shared_utils/vespa_request_builders.py
@ -14,6 +14,8 @@ from onyx.document_index.vespa_constants import HIDDEN
 from onyx.document_index.vespa_constants import METADATA_LIST
 from onyx.document_index.vespa_constants import SOURCE_TYPE
 from onyx.document_index.vespa_constants import TENANT_ID
+from onyx.document_index.vespa_constants import USER_FILE
+from onyx.document_index.vespa_constants import USER_FOLDER
 from onyx.utils.logger import setup_logger
 from shared_configs.configs import MULTI_TENANT

@ -27,14 +29,26 @@ def build_vespa_filters(
    remove_trailing_and: bool = False,  # Set to True when using as a complete Vespa query
 ) -> str:
    def _build_or_filters(key: str, vals: list[str] | None) -> str:
-        if vals is None:
+        """For string-based 'contains' filters, e.g. WSET fields or array<string> fields."""
+        if not key or not vals:
+            return ""
+        eq_elems = [f'{key} contains "{val}"' for val in vals if val]
+        if not eq_elems:
+            return ""
+        or_clause = " or ".join(eq_elems)
+        return f"({or_clause}) and "
+
+    def _build_int_or_filters(key: str, vals: list[int] | None) -> str:
+        """
+        For an integer field filter.
+        If vals is non-empty, we want *only* docs whose key matches one of vals.
+        """
+        # If `vals` is None or empty => skip the filter entirely
+        if vals is None or not vals:
            return ""

-        valid_vals = [val for val in vals if val]
-        if not key or not valid_vals:
-            return ""
-
-        eq_elems = [f'{key} contains "{elem}"' for elem in valid_vals]
+        # Otherwise build the OR filter
+        eq_elems = [f"{key} = {val}" for val in vals]
        or_clause = " or ".join(eq_elems)
        result = f"({or_clause}) and "

@ -42,53 +56,59 @@ def build_vespa_filters(

    def _build_time_filter(
        cutoff: datetime | None,
-        # Slightly over 3 Months, approximately 1 fiscal quarter
        untimed_doc_cutoff: timedelta = timedelta(days=92),
    ) -> str:
        if not cutoff:
            return ""
-
-        # For Documents that don't have an updated at, filter them out for queries asking for
-        # very recent documents (3 months) default. Documents that don't have an updated at
-        # time are assigned 3 months for time decay value
        include_untimed = datetime.now(timezone.utc) - untimed_doc_cutoff > cutoff
        cutoff_secs = int(cutoff.timestamp())

        if include_untimed:
-            # Documents without updated_at are assigned -1 as their date
            return f"!({DOC_UPDATED_AT} < {cutoff_secs}) and "
-
        return f"({DOC_UPDATED_AT} >= {cutoff_secs}) and "

+    # Start building the filter string
    filter_str = f"!({HIDDEN}=true) and " if not include_hidden else ""

-    # If running in multi-tenant mode, we may want to filter by tenant_id
+    # If running in multi-tenant mode
    if filters.tenant_id and MULTI_TENANT:
        filter_str += f'({TENANT_ID} contains "{filters.tenant_id}") and '

-    # CAREFUL touching this one, currently there is no second ACL double-check post retrieval
+    # ACL filters. CAREFUL touching this one, currently there is no second ACL double-check post retrieval
    if filters.access_control_list is not None:
        filter_str += _build_or_filters(
            ACCESS_CONTROL_LIST, filters.access_control_list
        )

+    # Source type filters
    source_strs = (
        [s.value for s in filters.source_type] if filters.source_type else None
    )
    filter_str += _build_or_filters(SOURCE_TYPE, source_strs)

+    # Tag filters
    tag_attributes = None
-    tags = filters.tags
-    if tags:
-        tag_attributes = [tag.tag_key + INDEX_SEPARATOR + tag.tag_value for tag in tags]
+    if filters.tags:
+        # build e.g. "<tag_key><INDEX_SEPARATOR><tag_value>"
+        tag_attributes = [
+            f"{tag.tag_key}{INDEX_SEPARATOR}{tag.tag_value}" for tag in filters.tags
+        ]
    filter_str += _build_or_filters(METADATA_LIST, tag_attributes)

+    # Document sets
    filter_str += _build_or_filters(DOCUMENT_SETS, filters.document_set)

+    # User file and user folder IDs as integer filters
+    filter_str += _build_int_or_filters(USER_FILE, filters.user_file_ids)
+
+    filter_str += _build_int_or_filters(USER_FOLDER, filters.user_folder_ids)
+
+    # Time filter
    filter_str += _build_time_filter(filters.time_cutoff)

+    # Trim trailing " and "
    if remove_trailing_and and filter_str.endswith(" and "):
-        filter_str = filter_str[:-5]  # We remove the trailing " and "
+        filter_str = filter_str[:-5]

    return filter_str

--- a/backend/onyx/document_index/vespa_constants.py
+++ b/backend/onyx/document_index/vespa_constants.py
@ -67,6 +67,8 @@ EMBEDDINGS = "embeddings"
 TITLE_EMBEDDING = "title_embedding"
 ACCESS_CONTROL_LIST = "access_control_list"
 DOCUMENT_SETS = "document_sets"
+USER_FILE = "user_file"
+USER_FOLDER = "user_folder"
 LARGE_CHUNK_REFERENCE_IDS = "large_chunk_reference_ids"
 METADATA = "metadata"
 METADATA_LIST = "metadata_list"