enh/fix: filter content metadata

Timothy Jaeryang Baek
2025-09-28 20:17:27 -05:00
parent 1c418a7f83
commit 118549caf3
9 changed files with 38 additions and 23 deletions

backend/open_webui/retrieval/vector/dbs/chroma.py

@@ -11,7 +11,7 @@ from open_webui.retrieval.vector.main import (
     SearchResult,
     GetResult,
 )
-from open_webui.retrieval.vector.utils import stringify_metadata
+from open_webui.retrieval.vector.utils import process_metadata

 from open_webui.config import (
     CHROMA_DATA_PATH,
@@ -146,7 +146,7 @@ class ChromaClient(VectorDBBase):
         ids = [item["id"] for item in items]
         documents = [item["text"] for item in items]
         embeddings = [item["vector"] for item in items]
-        metadatas = [stringify_metadata(item["metadata"]) for item in items]
+        metadatas = [process_metadata(item["metadata"]) for item in items]

         for batch in create_batches(
             api=self.client,
@@ -166,7 +166,7 @@ class ChromaClient(VectorDBBase):
         ids = [item["id"] for item in items]
         documents = [item["text"] for item in items]
         embeddings = [item["vector"] for item in items]
-        metadatas = [stringify_metadata(item["metadata"]) for item in items]
+        metadatas = [process_metadata(item["metadata"]) for item in items]

         collection.upsert(
             ids=ids, documents=documents, embeddings=embeddings, metadatas=metadatas
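
Chroma only accepts scalar metadata values (str, int, float, bool), so each item's metadata is passed through process_metadata before the upsert. A minimal sketch of the effect, with hypothetical items (the ids, vectors, and sample metadata are illustrative, not from this commit):

    from datetime import datetime

    from open_webui.retrieval.vector.utils import process_metadata

    # Hypothetical VectorItem-style dicts mirroring the ones used above.
    items = [
        {
            "id": "doc-1",
            "text": "hello world",
            "vector": [0.1, 0.2, 0.3],
            "metadata": {
                "source": "a.pdf",                    # scalar: passes through
                "created_at": datetime(2025, 9, 28),  # non-scalar: stringified
                "pages": [1, 2],                      # in KEYS_TO_EXCLUDE: dropped
            },
        }
    ]

    metadatas = [process_metadata(item["metadata"]) for item in items]
    # Expected shape: [{"source": "a.pdf", "created_at": "2025-09-28 00:00:00"}]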

backend/open_webui/retrieval/vector/dbs/elasticsearch.py

@@ -3,7 +3,7 @@ from typing import Optional
 import ssl
 from elasticsearch.helpers import bulk, scan

-from open_webui.retrieval.vector.utils import stringify_metadata
+from open_webui.retrieval.vector.utils import process_metadata
 from open_webui.retrieval.vector.main import (
     VectorDBBase,
     VectorItem,
@@ -245,7 +245,7 @@ class ElasticsearchClient(VectorDBBase):
                     "collection": collection_name,
                     "vector": item["vector"],
                     "text": item["text"],
-                    "metadata": stringify_metadata(item["metadata"]),
+                    "metadata": process_metadata(item["metadata"]),
                 },
             }
             for item in batch
@@ -266,7 +266,7 @@ class ElasticsearchClient(VectorDBBase):
                     "collection": collection_name,
                     "vector": item["vector"],
                     "text": item["text"],
-                    "metadata": stringify_metadata(item["metadata"]),
+                    "metadata": process_metadata(item["metadata"]),
                 },
                 "doc_as_upsert": True,
             }
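
These dicts are standard bulk update actions for the elasticsearch.helpers.bulk helper; with doc_as_upsert, Elasticsearch creates the document when the id does not exist yet. A sketch of how such a batch is flushed (the connection URL, index name, and one-item batch are illustrative):

    from elasticsearch import Elasticsearch
    from elasticsearch.helpers import bulk

    from open_webui.retrieval.vector.utils import process_metadata

    client = Elasticsearch("http://localhost:9200")  # illustrative connection
    collection_name = "kb-123"                       # hypothetical collection
    batch = [
        {"id": "doc-1", "vector": [0.1, 0.2], "text": "hello", "metadata": {"source": "a.pdf"}}
    ]

    actions = [
        {
            "_op_type": "update",            # bulk helper operation type
            "_index": "open_webui_vectors",  # hypothetical index name
            "_id": item["id"],
            "doc": {
                "collection": collection_name,
                "vector": item["vector"],
                "text": item["text"],
                "metadata": process_metadata(item["metadata"]),
            },
            "doc_as_upsert": True,  # insert when the id is missing, update otherwise
        }
        for item in batch
    ]
    success, errors = bulk(client, actions)  # returns (succeeded_count, error_list)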

backend/open_webui/retrieval/vector/dbs/milvus.py

@@ -6,7 +6,7 @@ import json
 import logging
 from typing import Optional

-from open_webui.retrieval.vector.utils import stringify_metadata
+from open_webui.retrieval.vector.utils import process_metadata
 from open_webui.retrieval.vector.main import (
     VectorDBBase,
     VectorItem,
@@ -289,7 +289,7 @@ class MilvusClient(VectorDBBase):
                     "id": item["id"],
                     "vector": item["vector"],
                     "data": {"text": item["text"]},
-                    "metadata": stringify_metadata(item["metadata"]),
+                    "metadata": process_metadata(item["metadata"]),
                 }
                 for item in items
             ],
@@ -325,7 +325,7 @@ class MilvusClient(VectorDBBase):
                     "id": item["id"],
                     "vector": item["vector"],
                     "data": {"text": item["text"]},
-                    "metadata": stringify_metadata(item["metadata"]),
+                    "metadata": process_metadata(item["metadata"]),
                 }
                 for item in items
             ],

backend/open_webui/retrieval/vector/dbs/opensearch.py

@@ -2,7 +2,7 @@ from opensearchpy import OpenSearch
 from opensearchpy.helpers import bulk
 from typing import Optional

-from open_webui.retrieval.vector.utils import stringify_metadata
+from open_webui.retrieval.vector.utils import process_metadata
 from open_webui.retrieval.vector.main import (
     VectorDBBase,
     VectorItem,
@@ -201,7 +201,7 @@ class OpenSearchClient(VectorDBBase):
                 "_source": {
                     "vector": item["vector"],
                     "text": item["text"],
-                    "metadata": stringify_metadata(item["metadata"]),
+                    "metadata": process_metadata(item["metadata"]),
                 },
             }
             for item in batch
@@ -223,7 +223,7 @@ class OpenSearchClient(VectorDBBase):
                 "doc": {
                     "vector": item["vector"],
                     "text": item["text"],
-                    "metadata": stringify_metadata(item["metadata"]),
+                    "metadata": process_metadata(item["metadata"]),
                 },
                 "doc_as_upsert": True,
             }

backend/open_webui/retrieval/vector/dbs/pgvector.py

@@ -27,7 +27,7 @@ from sqlalchemy.ext.mutable import MutableDict
 from sqlalchemy.exc import NoSuchTableError

-from open_webui.retrieval.vector.utils import stringify_metadata
+from open_webui.retrieval.vector.utils import process_metadata
 from open_webui.retrieval.vector.main import (
     VectorDBBase,
     VectorItem,
@@ -265,7 +265,7 @@ class PgvectorClient(VectorDBBase):
                         vector=vector,
                         collection_name=collection_name,
                         text=item["text"],
-                        vmetadata=stringify_metadata(item["metadata"]),
+                        vmetadata=process_metadata(item["metadata"]),
                     )
                     new_items.append(new_chunk)
             self.session.bulk_save_objects(new_items)
@@ -323,7 +323,7 @@ class PgvectorClient(VectorDBBase):
                 if existing:
                     existing.vector = vector
                     existing.text = item["text"]
-                    existing.vmetadata = stringify_metadata(item["metadata"])
+                    existing.vmetadata = process_metadata(item["metadata"])
                     existing.collection_name = (
                         collection_name  # Update collection_name if necessary
                     )
@@ -333,7 +333,7 @@ class PgvectorClient(VectorDBBase):
                         vector=vector,
                         collection_name=collection_name,
                         text=item["text"],
-                        vmetadata=stringify_metadata(item["metadata"]),
+                        vmetadata=process_metadata(item["metadata"]),
                     )
                     self.session.add(new_chunk)
             self.session.commit()

backend/open_webui/retrieval/vector/dbs/pinecone.py

@@ -32,7 +32,7 @@ from open_webui.config import (
     PINECONE_CLOUD,
 )
 from open_webui.env import SRC_LOG_LEVELS
-from open_webui.retrieval.vector.utils import stringify_metadata
+from open_webui.retrieval.vector.utils import process_metadata

 NO_LIMIT = 10000  # Reasonable limit to avoid overwhelming the system
@@ -185,7 +185,7 @@ class PineconeClient(VectorDBBase):
             point = {
                 "id": item["id"],
                 "values": item["vector"],
-                "metadata": stringify_metadata(metadata),
+                "metadata": process_metadata(metadata),
             }
             points.append(point)
         return points

backend/open_webui/retrieval/vector/dbs/s3vector.py

@@ -1,4 +1,4 @@
-from open_webui.retrieval.vector.utils import stringify_metadata
+from open_webui.retrieval.vector.utils import process_metadata
 from open_webui.retrieval.vector.main import (
     VectorDBBase,
     VectorItem,
@@ -185,7 +185,7 @@ class S3VectorClient(VectorDBBase):
                 metadata["text"] = item["text"]

                 # Convert metadata to string format for consistency
-                metadata = stringify_metadata(metadata)
+                metadata = process_metadata(metadata)

                 # Filter metadata to comply with S3 Vector API limit of 10 keys
                 metadata = self._filter_metadata(metadata, item["id"])
@@ -256,7 +256,7 @@ class S3VectorClient(VectorDBBase):
                 metadata["text"] = item["text"]

                 # Convert metadata to string format for consistency
-                metadata = stringify_metadata(metadata)
+                metadata = process_metadata(metadata)

                 # Filter metadata to comply with S3 Vector API limit of 10 keys
                 metadata = self._filter_metadata(metadata, item["id"])
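
The commit leaves _filter_metadata itself untouched; only the stringify step in front of it is renamed. For orientation, a plausible sketch of such a cap, assuming the helper keeps the first ten keys and logs the rest (the actual selection policy is not visible in this diff):

    import logging

    log = logging.getLogger(__name__)

    S3_VECTOR_MAX_METADATA_KEYS = 10  # hypothetical constant for the API limit


    def _filter_metadata(metadata: dict, item_id: str) -> dict:
        # Keep at most ten keys, as required by the S3 Vector API.
        if len(metadata) <= S3_VECTOR_MAX_METADATA_KEYS:
            return metadata
        kept = dict(list(metadata.items())[:S3_VECTOR_MAX_METADATA_KEYS])
        log.warning(
            f"Dropping metadata keys {set(metadata) - set(kept)} for item {item_id}"
        )
        return kept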

backend/open_webui/retrieval/vector/utils.py

@@ -1,10 +1,24 @@
 from datetime import datetime

+KEYS_TO_EXCLUDE = ["content", "pages", "tables", "paragraphs", "sections", "figures"]
+
+
-def stringify_metadata(
+def filter_metadata(metadata: dict[str, any]) -> dict[str, any]:
+    metadata = {
+        key: value for key, value in metadata.items() if key not in KEYS_TO_EXCLUDE
+    }
+    return metadata
+
+
+def process_metadata(
     metadata: dict[str, any],
 ) -> dict[str, any]:
     for key, value in metadata.items():
+        # Remove large fields
+        if key in KEYS_TO_EXCLUDE:
+            del metadata[key]
+
+        # Convert non-serializable fields to strings
         if (
             isinstance(value, datetime)
             or isinstance(value, list)
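
The viewer truncates the hunk at the isinstance chain. For reference, a sketch of the full helpers as they plausibly read, assuming the chain also covers dict and ends by stringifying the value and returning the dict, matching the old stringify_metadata behavior. One caveat worth flagging: deleting keys while iterating metadata.items() directly raises a RuntimeError in Python 3, so this sketch iterates over a snapshot instead:

    from datetime import datetime

    KEYS_TO_EXCLUDE = ["content", "pages", "tables", "paragraphs", "sections", "figures"]


    def filter_metadata(metadata: dict[str, any]) -> dict[str, any]:
        # Drop bulky extraction artifacts (raw content, page/table dumps, ...).
        return {
            key: value for key, value in metadata.items() if key not in KEYS_TO_EXCLUDE
        }


    def process_metadata(metadata: dict[str, any]) -> dict[str, any]:
        # list(...) takes a snapshot so the deletion below is safe (editorial
        # assumption; the committed code iterates metadata.items() directly).
        for key, value in list(metadata.items()):
            # Remove large fields
            if key in KEYS_TO_EXCLUDE:
                del metadata[key]
                continue
            # Convert non-serializable fields to strings
            if isinstance(value, (datetime, list, dict)):
                metadata[key] = str(value)
        return metadata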

backend/open_webui/routers/retrieval.py

@@ -78,6 +78,7 @@ from open_webui.retrieval.utils import (
     query_doc,
     query_doc_with_hybrid_search,
 )
+from open_webui.retrieval.vector.utils import filter_metadata
 from open_webui.utils.misc import (
     calculate_sha256_string,
 )
@@ -1535,7 +1536,7 @@ def process_file(
                     Document(
                         page_content=doc.page_content,
                         metadata={
-                            **doc.metadata,
+                            **filter_metadata(doc.metadata),
                             "name": file.filename,
                             "created_by": file.user_id,
                             "file_id": file.id,