Add image support for search (#4090)

* add support for image search * quick fix up * k * k * k * k * nit * quick fix for connector tests
2025-07-03 11:11:45 +02:00 · 2025-03-05 09:44:18 -08:00
parent f731beca1f
commit 20f2b9b2bb
36 changed files with 1857 additions and 589 deletions
--- a/backend/onyx/document_index/vespa/app_config/schemas/danswer_chunk.sd
+++ b/backend/onyx/document_index/vespa/app_config/schemas/danswer_chunk.sd
@ -55,6 +55,9 @@ schema DANSWER_CHUNK_NAME {
        field blurb type string {
            indexing: summary | attribute
        }
+        field image_file_name type string {
+            indexing: summary | attribute
+        }
        # https://docs.vespa.ai/en/attributes.html potential enum store for speed, but probably not worth it
        field source_type type string {
            indexing: summary | attribute
--- a/backend/onyx/document_index/vespa/chunk_retrieval.py
+++ b/backend/onyx/document_index/vespa/chunk_retrieval.py
@ -31,6 +31,7 @@ from onyx.document_index.vespa_constants import DOC_UPDATED_AT
 from onyx.document_index.vespa_constants import DOCUMENT_ID
 from onyx.document_index.vespa_constants import DOCUMENT_ID_ENDPOINT
 from onyx.document_index.vespa_constants import HIDDEN
+from onyx.document_index.vespa_constants import IMAGE_FILE_NAME
 from onyx.document_index.vespa_constants import LARGE_CHUNK_REFERENCE_IDS
 from onyx.document_index.vespa_constants import MAX_ID_SEARCH_QUERY_SIZE
 from onyx.document_index.vespa_constants import MAX_OR_CONDITIONS
@ -130,6 +131,7 @@ def _vespa_hit_to_inference_chunk(
        section_continuation=fields[SECTION_CONTINUATION],
        document_id=fields[DOCUMENT_ID],
        source_type=fields[SOURCE_TYPE],
+        image_file_name=fields.get(IMAGE_FILE_NAME),
        title=fields.get(TITLE),
        semantic_identifier=fields[SEMANTIC_IDENTIFIER],
        boost=fields.get(BOOST, 1),
@ -211,6 +213,7 @@ def _get_chunks_via_visit_api(

        # Check if the response contains any documents
        response_data = response.json()
+
        if "documents" in response_data:
            for document in response_data["documents"]:
                if filters.access_control_list:
--- a/backend/onyx/document_index/vespa/indexing_utils.py
+++ b/backend/onyx/document_index/vespa/indexing_utils.py
@ -32,6 +32,7 @@ from onyx.document_index.vespa_constants import DOCUMENT_ID
 from onyx.document_index.vespa_constants import DOCUMENT_ID_ENDPOINT
 from onyx.document_index.vespa_constants import DOCUMENT_SETS
 from onyx.document_index.vespa_constants import EMBEDDINGS
+from onyx.document_index.vespa_constants import IMAGE_FILE_NAME
 from onyx.document_index.vespa_constants import LARGE_CHUNK_REFERENCE_IDS
 from onyx.document_index.vespa_constants import METADATA
 from onyx.document_index.vespa_constants import METADATA_LIST
@ -198,13 +199,13 @@ def _index_vespa_chunk(
        # which only calls VespaIndex.update
        ACCESS_CONTROL_LIST: {acl_entry: 1 for acl_entry in chunk.access.to_acl()},
        DOCUMENT_SETS: {document_set: 1 for document_set in chunk.document_sets},
+        IMAGE_FILE_NAME: chunk.image_file_name,
        BOOST: chunk.boost,
    }

    if multitenant:
        if chunk.tenant_id:
            vespa_document_fields[TENANT_ID] = chunk.tenant_id
-
    vespa_url = f"{DOCUMENT_ID_ENDPOINT.format(index_name=index_name)}/{vespa_chunk_id}"
    logger.debug(f'Indexing to URL "{vespa_url}"')
    res = http_client.post(
--- a/backend/onyx/document_index/vespa_constants.py
+++ b/backend/onyx/document_index/vespa_constants.py
@ -77,6 +77,7 @@ PRIMARY_OWNERS = "primary_owners"
 SECONDARY_OWNERS = "secondary_owners"
 RECENCY_BIAS = "recency_bias"
 HIDDEN = "hidden"
+IMAGE_FILE_NAME = "image_file_name"

 # Specific to Vespa, needed for highlighting matching keywords / section
 CONTENT_SUMMARY = "content_summary"
@ -94,6 +95,7 @@ YQL_BASE = (
    f"{SEMANTIC_IDENTIFIER}, "
    f"{TITLE}, "
    f"{SECTION_CONTINUATION}, "
+    f"{IMAGE_FILE_NAME}, "
    f"{BOOST}, "
    f"{HIDDEN}, "
    f"{DOC_UPDATED_AT}, "