diff --git a/backend/open_webui/retrieval/vector/dbs/chroma.py b/backend/open_webui/retrieval/vector/dbs/chroma.py
index 9675e141e7..1fdb064c51 100755
--- a/backend/open_webui/retrieval/vector/dbs/chroma.py
+++ b/backend/open_webui/retrieval/vector/dbs/chroma.py
@@ -11,7 +11,7 @@ from open_webui.retrieval.vector.main import (
     SearchResult,
     GetResult,
 )
-from open_webui.retrieval.vector.utils import stringify_metadata
+from open_webui.retrieval.vector.utils import process_metadata
 
 from open_webui.config import (
     CHROMA_DATA_PATH,
@@ -146,7 +146,7 @@ class ChromaClient(VectorDBBase):
         ids = [item["id"] for item in items]
         documents = [item["text"] for item in items]
         embeddings = [item["vector"] for item in items]
-        metadatas = [stringify_metadata(item["metadata"]) for item in items]
+        metadatas = [process_metadata(item["metadata"]) for item in items]
 
         for batch in create_batches(
             api=self.client,
@@ -166,7 +166,7 @@ class ChromaClient(VectorDBBase):
         ids = [item["id"] for item in items]
         documents = [item["text"] for item in items]
         embeddings = [item["vector"] for item in items]
-        metadatas = [stringify_metadata(item["metadata"]) for item in items]
+        metadatas = [process_metadata(item["metadata"]) for item in items]
 
         collection.upsert(
             ids=ids, documents=documents, embeddings=embeddings, metadatas=metadatas
diff --git a/backend/open_webui/retrieval/vector/dbs/elasticsearch.py b/backend/open_webui/retrieval/vector/dbs/elasticsearch.py
index 727d831cff..6de0d859f8 100644
--- a/backend/open_webui/retrieval/vector/dbs/elasticsearch.py
+++ b/backend/open_webui/retrieval/vector/dbs/elasticsearch.py
@@ -3,7 +3,7 @@
 from typing import Optional
 import ssl
 from elasticsearch.helpers import bulk, scan
-from open_webui.retrieval.vector.utils import stringify_metadata
+from open_webui.retrieval.vector.utils import process_metadata
 from open_webui.retrieval.vector.main import (
     VectorDBBase,
     VectorItem,
@@ -245,7 +245,7 @@ class ElasticsearchClient(VectorDBBase):
                     "collection": collection_name,
                     "vector": item["vector"],
                     "text": item["text"],
-                    "metadata": stringify_metadata(item["metadata"]),
+                    "metadata": process_metadata(item["metadata"]),
                 },
             }
             for item in batch
@@ -266,7 +266,7 @@ class ElasticsearchClient(VectorDBBase):
                     "collection": collection_name,
                     "vector": item["vector"],
                     "text": item["text"],
-                    "metadata": stringify_metadata(item["metadata"]),
+                    "metadata": process_metadata(item["metadata"]),
                 },
                 "doc_as_upsert": True,
             }
diff --git a/backend/open_webui/retrieval/vector/dbs/milvus.py b/backend/open_webui/retrieval/vector/dbs/milvus.py
index f7bd30cbd7..98f8e335f2 100644
--- a/backend/open_webui/retrieval/vector/dbs/milvus.py
+++ b/backend/open_webui/retrieval/vector/dbs/milvus.py
@@ -6,7 +6,7 @@ import json
 import logging
 from typing import Optional
 
-from open_webui.retrieval.vector.utils import stringify_metadata
+from open_webui.retrieval.vector.utils import process_metadata
 from open_webui.retrieval.vector.main import (
     VectorDBBase,
     VectorItem,
@@ -289,7 +289,7 @@ class MilvusClient(VectorDBBase):
                     "id": item["id"],
                     "vector": item["vector"],
                     "data": {"text": item["text"]},
-                    "metadata": stringify_metadata(item["metadata"]),
+                    "metadata": process_metadata(item["metadata"]),
                 }
                 for item in items
             ],
@@ -325,7 +325,7 @@ class MilvusClient(VectorDBBase):
                     "id": item["id"],
                     "vector": item["vector"],
                     "data": {"text": item["text"]},
-                    "metadata": stringify_metadata(item["metadata"]),
+                    "metadata": process_metadata(item["metadata"]),
                 }
                 for item in items
             ],
diff --git a/backend/open_webui/retrieval/vector/dbs/opensearch.py b/backend/open_webui/retrieval/vector/dbs/opensearch.py
index 510070f97a..2e946710e2 100644
--- a/backend/open_webui/retrieval/vector/dbs/opensearch.py
+++ b/backend/open_webui/retrieval/vector/dbs/opensearch.py
@@ -2,7 +2,7 @@
 from opensearchpy import OpenSearch
 from opensearchpy.helpers import bulk
 from typing import Optional
-from open_webui.retrieval.vector.utils import stringify_metadata
+from open_webui.retrieval.vector.utils import process_metadata
 from open_webui.retrieval.vector.main import (
     VectorDBBase,
     VectorItem,
@@ -201,7 +201,7 @@ class OpenSearchClient(VectorDBBase):
                 "_source": {
                     "vector": item["vector"],
                     "text": item["text"],
-                    "metadata": stringify_metadata(item["metadata"]),
+                    "metadata": process_metadata(item["metadata"]),
                 },
             }
             for item in batch
@@ -223,7 +223,7 @@ class OpenSearchClient(VectorDBBase):
                 "doc": {
                     "vector": item["vector"],
                     "text": item["text"],
-                    "metadata": stringify_metadata(item["metadata"]),
+                    "metadata": process_metadata(item["metadata"]),
                 },
                 "doc_as_upsert": True,
             }
diff --git a/backend/open_webui/retrieval/vector/dbs/pgvector.py b/backend/open_webui/retrieval/vector/dbs/pgvector.py
index 06c1698cdd..312b48944c 100644
--- a/backend/open_webui/retrieval/vector/dbs/pgvector.py
+++ b/backend/open_webui/retrieval/vector/dbs/pgvector.py
@@ -27,7 +27,7 @@
 from sqlalchemy.ext.mutable import MutableDict
 from sqlalchemy.exc import NoSuchTableError
 
-from open_webui.retrieval.vector.utils import stringify_metadata
+from open_webui.retrieval.vector.utils import process_metadata
 from open_webui.retrieval.vector.main import (
     VectorDBBase,
     VectorItem,
@@ -265,7 +265,7 @@ class PgvectorClient(VectorDBBase):
                     vector=vector,
                     collection_name=collection_name,
                     text=item["text"],
-                    vmetadata=stringify_metadata(item["metadata"]),
+                    vmetadata=process_metadata(item["metadata"]),
                 )
                 new_items.append(new_chunk)
             self.session.bulk_save_objects(new_items)
@@ -323,7 +323,7 @@ class PgvectorClient(VectorDBBase):
                 if existing:
                     existing.vector = vector
                     existing.text = item["text"]
-                    existing.vmetadata = stringify_metadata(item["metadata"])
+                    existing.vmetadata = process_metadata(item["metadata"])
                     existing.collection_name = (
                         collection_name  # Update collection_name if necessary
                     )
@@ -333,7 +333,7 @@ class PgvectorClient(VectorDBBase):
                         vector=vector,
                         collection_name=collection_name,
                         text=item["text"],
-                        vmetadata=stringify_metadata(item["metadata"]),
+                        vmetadata=process_metadata(item["metadata"]),
                     )
                     self.session.add(new_chunk)
             self.session.commit()
diff --git a/backend/open_webui/retrieval/vector/dbs/pinecone.py b/backend/open_webui/retrieval/vector/dbs/pinecone.py
index 466b5a6e24..5bef0d9ea7 100644
--- a/backend/open_webui/retrieval/vector/dbs/pinecone.py
+++ b/backend/open_webui/retrieval/vector/dbs/pinecone.py
@@ -32,7 +32,7 @@ from open_webui.config import (
     PINECONE_CLOUD,
 )
 from open_webui.env import SRC_LOG_LEVELS
-from open_webui.retrieval.vector.utils import stringify_metadata
+from open_webui.retrieval.vector.utils import process_metadata
 
 NO_LIMIT = 10000  # Reasonable limit to avoid overwhelming the system
 
@@ -185,7 +185,7 @@ class PineconeClient(VectorDBBase):
             point = {
                 "id": item["id"],
                 "values": item["vector"],
-                "metadata": stringify_metadata(metadata),
+                "metadata": process_metadata(metadata),
             }
             points.append(point)
             return points
diff --git a/backend/open_webui/retrieval/vector/dbs/s3vector.py b/backend/open_webui/retrieval/vector/dbs/s3vector.py
index 2ac6911769..519ee5abad 100644
--- a/backend/open_webui/retrieval/vector/dbs/s3vector.py
+++ b/backend/open_webui/retrieval/vector/dbs/s3vector.py
@@ -1,4 +1,4 @@
-from open_webui.retrieval.vector.utils import stringify_metadata
+from open_webui.retrieval.vector.utils import process_metadata
 from open_webui.retrieval.vector.main import (
     VectorDBBase,
     VectorItem,
@@ -185,7 +185,7 @@ class S3VectorClient(VectorDBBase):
                 metadata["text"] = item["text"]
 
                 # Convert metadata to string format for consistency
-                metadata = stringify_metadata(metadata)
+                metadata = process_metadata(metadata)
 
                 # Filter metadata to comply with S3 Vector API limit of 10 keys
                 metadata = self._filter_metadata(metadata, item["id"])
@@ -256,7 +256,7 @@ class S3VectorClient(VectorDBBase):
                 metadata["text"] = item["text"]
 
                 # Convert metadata to string format for consistency
-                metadata = stringify_metadata(metadata)
+                metadata = process_metadata(metadata)
 
                 # Filter metadata to comply with S3 Vector API limit of 10 keys
                 metadata = self._filter_metadata(metadata, item["id"])
diff --git a/backend/open_webui/retrieval/vector/utils.py b/backend/open_webui/retrieval/vector/utils.py
index 1d9698c6b1..a597390b92 100644
--- a/backend/open_webui/retrieval/vector/utils.py
+++ b/backend/open_webui/retrieval/vector/utils.py
@@ -1,10 +1,26 @@
 from datetime import datetime
 
+KEYS_TO_EXCLUDE = ["content", "pages", "tables", "paragraphs", "sections", "figures"]
 
-def stringify_metadata(
+
+def filter_metadata(metadata: dict[str, any]) -> dict[str, any]:
+    metadata = {
+        key: value for key, value in metadata.items() if key not in KEYS_TO_EXCLUDE
+    }
+    return metadata
+
+
+def process_metadata(
     metadata: dict[str, any],
 ) -> dict[str, any]:
-    for key, value in metadata.items():
+    # Iterate over a snapshot; deleting keys while iterating the live
+    # view would raise "dictionary changed size during iteration"
+    for key, value in list(metadata.items()):
+        # Remove large fields
+        if key in KEYS_TO_EXCLUDE:
+            del metadata[key]
+            continue
+
+        # Convert non-serializable fields to strings
         if (
             isinstance(value, datetime)
             or isinstance(value, list)
diff --git a/backend/open_webui/routers/retrieval.py b/backend/open_webui/routers/retrieval.py
index 3681008c87..d322addfa6 100644
--- a/backend/open_webui/routers/retrieval.py
+++ b/backend/open_webui/routers/retrieval.py
@@ -78,6 +78,7 @@ from open_webui.retrieval.utils import (
     query_doc,
     query_doc_with_hybrid_search,
 )
+from open_webui.retrieval.vector.utils import filter_metadata
 from open_webui.utils.misc import (
     calculate_sha256_string,
 )
@@ -1535,7 +1536,7 @@ def process_file(
                         Document(
                             page_content=doc.page_content,
                             metadata={
-                                **doc.metadata,
+                                **filter_metadata(doc.metadata),
                                 "name": file.filename,
                                 "created_by": file.user_id,
                                 "file_id": file.id,
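
Usage note (illustrative, not part of the patch): the two helpers split the old stringify_metadata job. filter_metadata returns a new dict and leaves its input untouched, which is why process_file can apply it to doc.metadata before merging extra keys in; process_metadata mutates and returns its argument, dropping the large KEYS_TO_EXCLUDE fields and stringifying non-serializable values, which suits the vector-DB write paths where the item dicts are short-lived. The sketch below assumes the rest of process_metadata converts matching values with str(), as stringify_metadata did:

    from datetime import datetime

    from open_webui.retrieval.vector.utils import filter_metadata, process_metadata

    doc_metadata = {
        "source": "report.pdf",
        "page": 3,
        "created": datetime(2024, 1, 1),        # non-serializable: stringified by process_metadata
        "content": "full extracted text ...",   # excluded key: dropped by both helpers
    }

    filtered = filter_metadata(doc_metadata)
    assert "content" not in filtered            # copy without excluded keys
    assert "content" in doc_metadata            # original dict left intact

    processed = process_metadata(dict(doc_metadata))  # pass a copy; mutates in place
    assert "content" not in processed
    assert processed["created"] == str(datetime(2024, 1, 1))  # assumes str() conversion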