Add image support for search (#4090)

* add support for image search

* quick fix up

* k

* k

* k

* k

* nit

* quick fix for connector tests
This commit is contained in:
pablonyx
2025-03-05 09:44:18 -08:00
committed by GitHub
parent f731beca1f
commit 20f2b9b2bb
36 changed files with 1857 additions and 589 deletions

View File

@@ -9,15 +9,17 @@ from email.parser import Parser as EmailParser
from io import BytesIO
from pathlib import Path
from typing import Any
from typing import Dict
from typing import IO
from typing import List
from typing import Tuple
import chardet
import docx # type: ignore
import openpyxl # type: ignore
import pptx # type: ignore
from docx import Document
from docx import Document as DocxDocument
from fastapi import UploadFile
from PIL import Image
from pypdf import PdfReader
from pypdf.errors import PdfStreamError
@@ -31,10 +33,8 @@ from onyx.utils.logger import setup_logger
logger = setup_logger()
TEXT_SECTION_SEPARATOR = "\n\n"
PLAIN_TEXT_FILE_EXTENSIONS = [
".txt",
".md",
@@ -49,7 +49,6 @@ PLAIN_TEXT_FILE_EXTENSIONS = [
".yaml",
]
VALID_FILE_EXTENSIONS = PLAIN_TEXT_FILE_EXTENSIONS + [
".pdf",
".docx",
@@ -58,6 +57,16 @@ VALID_FILE_EXTENSIONS = PLAIN_TEXT_FILE_EXTENSIONS + [
".eml",
".epub",
".html",
".png",
".jpg",
".jpeg",
".webp",
]
IMAGE_MEDIA_TYPES = [
"image/png",
"image/jpeg",
"image/webp",
]
@@ -67,11 +76,13 @@ def is_text_file_extension(file_name: str) -> bool:
def get_file_ext(file_path_or_name: str | Path) -> str:
_, extension = os.path.splitext(file_path_or_name)
# standardize all extensions to be lowercase so that checks against
# VALID_FILE_EXTENSIONS and similar will work as intended
return extension.lower()
def is_valid_media_type(media_type: str) -> bool:
return media_type in IMAGE_MEDIA_TYPES
def is_valid_file_ext(ext: str) -> bool:
return ext in VALID_FILE_EXTENSIONS
@@ -79,17 +90,18 @@ def is_valid_file_ext(ext: str) -> bool:
def is_text_file(file: IO[bytes]) -> bool:
"""
checks if the first 1024 bytes only contain printable or whitespace characters
if it does, then we say its a plaintext file
if it does, then we say it's a plaintext file
"""
raw_data = file.read(1024)
file.seek(0)
text_chars = bytearray({7, 8, 9, 10, 12, 13, 27} | set(range(0x20, 0x100)) - {0x7F})
return all(c in text_chars for c in raw_data)
def detect_encoding(file: IO[bytes]) -> str:
raw_data = file.read(50000)
encoding = chardet.detect(raw_data)["encoding"] or "utf-8"
file.seek(0)
encoding = chardet.detect(raw_data)["encoding"] or "utf-8"
return encoding
@@ -99,14 +111,14 @@ def is_macos_resource_fork_file(file_name: str) -> bool:
)
# To include additional metadata in the search index, add a .onyx_metadata.json file
# to the zip file. This file should contain a list of objects with the following format:
# [{ "filename": "file1.txt", "link": "https://example.com/file1.txt" }]
def load_files_from_zip(
zip_file_io: IO,
ignore_macos_resource_fork_files: bool = True,
ignore_dirs: bool = True,
) -> Iterator[tuple[zipfile.ZipInfo, IO[Any], dict[str, Any]]]:
"""
If there's a .onyx_metadata.json in the zip, attach those metadata to each subfile.
"""
with zipfile.ZipFile(zip_file_io, "r") as zip_file:
zip_metadata = {}
try:
@@ -118,24 +130,31 @@ def load_files_from_zip(
# convert list of dicts to dict of dicts
zip_metadata = {d["filename"]: d for d in zip_metadata}
except json.JSONDecodeError:
logger.warn(f"Unable to load {DANSWER_METADATA_FILENAME}")
logger.warning(f"Unable to load {DANSWER_METADATA_FILENAME}")
except KeyError:
logger.info(f"No {DANSWER_METADATA_FILENAME} file")
for file_info in zip_file.infolist():
with zip_file.open(file_info.filename, "r") as file:
if ignore_dirs and file_info.is_dir():
continue
if ignore_dirs and file_info.is_dir():
continue
if (
ignore_macos_resource_fork_files
and is_macos_resource_fork_file(file_info.filename)
) or file_info.filename == DANSWER_METADATA_FILENAME:
continue
yield file_info, file, zip_metadata.get(file_info.filename, {})
if (
ignore_macos_resource_fork_files
and is_macos_resource_fork_file(file_info.filename)
) or file_info.filename == DANSWER_METADATA_FILENAME:
continue
with zip_file.open(file_info.filename, "r") as subfile:
yield file_info, subfile, zip_metadata.get(file_info.filename, {})
def _extract_onyx_metadata(line: str) -> dict | None:
"""
Example: first line has:
<!-- DANSWER_METADATA={"title": "..."} -->
or
#DANSWER_METADATA={"title":"..."}
"""
html_comment_pattern = r"<!--\s*DANSWER_METADATA=\{(.*?)\}\s*-->"
hashtag_pattern = r"#DANSWER_METADATA=\{(.*?)\}"
@@ -161,9 +180,13 @@ def read_text_file(
errors: str = "replace",
ignore_onyx_metadata: bool = True,
) -> tuple[str, dict]:
"""
For plain text files. Optionally extracts Onyx metadata from the first line.
"""
metadata = {}
file_content_raw = ""
for ind, line in enumerate(file):
# decode
try:
line = line.decode(encoding) if isinstance(line, bytes) else line
except UnicodeDecodeError:
@@ -173,131 +196,132 @@ def read_text_file(
else line
)
if ind == 0:
metadata_or_none = (
None if ignore_onyx_metadata else _extract_onyx_metadata(line)
)
if metadata_or_none is not None:
metadata = metadata_or_none
else:
file_content_raw += line
else:
file_content_raw += line
# optionally parse metadata in the first line
if ind == 0 and not ignore_onyx_metadata:
potential_meta = _extract_onyx_metadata(line)
if potential_meta is not None:
metadata = potential_meta
continue
file_content_raw += line
return file_content_raw, metadata
def pdf_to_text(file: IO[Any], pdf_pass: str | None = None) -> str:
"""Extract text from a PDF file."""
# Return only the extracted text from read_pdf_file
text, _ = read_pdf_file(file, pdf_pass)
"""
Extract text from a PDF. For embedded images, a more complex approach is needed.
This is a minimal approach returning text only.
"""
text, _, _ = read_pdf_file(file, pdf_pass)
return text
def read_pdf_file(
file: IO[Any],
pdf_pass: str | None = None,
) -> tuple[str, dict]:
metadata: Dict[str, Any] = {}
file: IO[Any], pdf_pass: str | None = None, extract_images: bool = False
) -> tuple[str, dict, list[tuple[bytes, str]]]:
"""
Returns the text, basic PDF metadata, and optionally extracted images.
"""
metadata: dict[str, Any] = {}
extracted_images: list[tuple[bytes, str]] = []
try:
pdf_reader = PdfReader(file)
# If marked as encrypted and a password is provided, try to decrypt
if pdf_reader.is_encrypted and pdf_pass is not None:
decrypt_success = False
if pdf_pass is not None:
try:
decrypt_success = pdf_reader.decrypt(pdf_pass) != 0
except Exception:
logger.error("Unable to decrypt pdf")
try:
decrypt_success = pdf_reader.decrypt(pdf_pass) != 0
except Exception:
logger.error("Unable to decrypt pdf")
if not decrypt_success:
# By user request, keep files that are unreadable just so they
# can be discoverable by title.
return "", metadata
return "", metadata, []
elif pdf_reader.is_encrypted:
logger.warning("No Password available to decrypt pdf, returning empty")
return "", metadata
logger.warning("No Password for an encrypted PDF, returning empty text.")
return "", metadata, []
# Extract metadata from the PDF, removing leading '/' from keys if present
# This standardizes the metadata keys for consistency
metadata = {}
# Basic PDF metadata
if pdf_reader.metadata is not None:
for key, value in pdf_reader.metadata.items():
clean_key = key.lstrip("/")
if isinstance(value, str) and value.strip():
metadata[clean_key] = value
elif isinstance(value, list) and all(
isinstance(item, str) for item in value
):
metadata[clean_key] = ", ".join(value)
return (
TEXT_SECTION_SEPARATOR.join(
page.extract_text() for page in pdf_reader.pages
),
metadata,
text = TEXT_SECTION_SEPARATOR.join(
page.extract_text() for page in pdf_reader.pages
)
if extract_images:
for page_num, page in enumerate(pdf_reader.pages):
for image_file_object in page.images:
image = Image.open(io.BytesIO(image_file_object.data))
img_byte_arr = io.BytesIO()
image.save(img_byte_arr, format=image.format)
img_bytes = img_byte_arr.getvalue()
image_name = (
f"page_{page_num + 1}_image_{image_file_object.name}."
f"{image.format.lower() if image.format else 'png'}"
)
extracted_images.append((img_bytes, image_name))
return text, metadata, extracted_images
except PdfStreamError:
logger.exception("PDF file is not a valid PDF")
logger.exception("Invalid PDF file")
except Exception:
logger.exception("Failed to read PDF")
# File is still discoverable by title
# but the contents are not included as they cannot be parsed
return "", metadata
return "", metadata, []
def docx_to_text(file: IO[Any]) -> str:
def is_simple_table(table: docx.table.Table) -> bool:
for row in table.rows:
# No omitted cells
if row.grid_cols_before > 0 or row.grid_cols_after > 0:
return False
# No nested tables
if any(cell.tables for cell in row.cells):
return False
return True
def extract_cell_text(cell: docx.table._Cell) -> str:
cell_paragraphs = [para.text.strip() for para in cell.paragraphs]
return " ".join(p for p in cell_paragraphs if p) or "N/A"
def docx_to_text_and_images(
file: IO[Any],
) -> Tuple[str, List[Tuple[bytes, str]]]:
"""
Extract text from a docx. If embed_images=True, also extract inline images.
Return (text_content, list_of_images).
"""
paragraphs = []
embedded_images: List[Tuple[bytes, str]] = []
doc = docx.Document(file)
for item in doc.iter_inner_content():
if isinstance(item, docx.text.paragraph.Paragraph):
paragraphs.append(item.text)
elif isinstance(item, docx.table.Table):
if not item.rows or not is_simple_table(item):
continue
# Grab text from paragraphs
for paragraph in doc.paragraphs:
paragraphs.append(paragraph.text)
# Every row is a new line, joined with a single newline
table_content = "\n".join(
[
",\t".join(extract_cell_text(cell) for cell in row.cells)
for row in item.rows
]
)
paragraphs.append(table_content)
# Reset position so we can re-load the doc (python-docx has read the stream)
# Note: if python-docx has fully consumed the stream, you may need to open it again from memory.
# For large docs, a more robust approach is needed.
# This is a simplified example.
# Docx already has good spacing between paragraphs
return "\n".join(paragraphs)
for rel_id, rel in doc.part.rels.items():
if "image" in rel.reltype:
# image is typically in rel.target_part.blob
image_bytes = rel.target_part.blob
image_name = rel.target_part.partname
# store
embedded_images.append((image_bytes, os.path.basename(str(image_name))))
text_content = "\n".join(paragraphs)
return text_content, embedded_images
def pptx_to_text(file: IO[Any]) -> str:
presentation = pptx.Presentation(file)
text_content = []
for slide_number, slide in enumerate(presentation.slides, start=1):
extracted_text = f"\nSlide {slide_number}:\n"
slide_text = f"\nSlide {slide_number}:\n"
for shape in slide.shapes:
if hasattr(shape, "text"):
extracted_text += shape.text + "\n"
text_content.append(extracted_text)
slide_text += shape.text + "\n"
text_content.append(slide_text)
return TEXT_SECTION_SEPARATOR.join(text_content)
@@ -305,18 +329,21 @@ def xlsx_to_text(file: IO[Any]) -> str:
workbook = openpyxl.load_workbook(file, read_only=True)
text_content = []
for sheet in workbook.worksheets:
sheet_string = "\n".join(
",".join(map(str, row))
for row in sheet.iter_rows(min_row=1, values_only=True)
)
text_content.append(sheet_string)
rows = []
for row in sheet.iter_rows(min_row=1, values_only=True):
row_str = ",".join(str(cell) if cell is not None else "" for cell in row)
rows.append(row_str)
sheet_str = "\n".join(rows)
text_content.append(sheet_str)
return TEXT_SECTION_SEPARATOR.join(text_content)
def eml_to_text(file: IO[Any]) -> str:
text_file = io.TextIOWrapper(file, encoding=detect_encoding(file))
encoding = detect_encoding(file)
text_file = io.TextIOWrapper(file, encoding=encoding)
parser = EmailParser()
message = parser.parse(text_file)
text_content = []
for part in message.walk():
if part.get_content_type().startswith("text/plain"):
@@ -342,8 +369,8 @@ def epub_to_text(file: IO[Any]) -> str:
def file_io_to_text(file: IO[Any]) -> str:
encoding = detect_encoding(file)
file_content_raw, _ = read_text_file(file, encoding=encoding)
return file_content_raw
file_content, _ = read_text_file(file, encoding=encoding)
return file_content
def extract_file_text(
@@ -352,9 +379,13 @@ def extract_file_text(
break_on_unprocessable: bool = True,
extension: str | None = None,
) -> str:
"""
Legacy function that returns *only text*, ignoring embedded images.
For backward-compatibility in code that only wants text.
"""
extension_to_function: dict[str, Callable[[IO[Any]], str]] = {
".pdf": pdf_to_text,
".docx": docx_to_text,
".docx": lambda f: docx_to_text_and_images(f)[0], # no images
".pptx": pptx_to_text,
".xlsx": xlsx_to_text,
".eml": eml_to_text,
@@ -368,24 +399,23 @@ def extract_file_text(
return unstructured_to_text(file, file_name)
except Exception as unstructured_error:
logger.error(
f"Failed to process with Unstructured: {str(unstructured_error)}. Falling back to normal processing."
f"Failed to process with Unstructured: {str(unstructured_error)}. "
"Falling back to normal processing."
)
# Fall through to normal processing
final_extension: str
if file_name or extension:
if extension is not None:
final_extension = extension
elif file_name is not None:
final_extension = get_file_ext(file_name)
if extension is None:
extension = get_file_ext(file_name)
if is_valid_file_ext(final_extension):
return extension_to_function.get(final_extension, file_io_to_text)(file)
if is_valid_file_ext(extension):
func = extension_to_function.get(extension, file_io_to_text)
file.seek(0)
return func(file)
# Either the file somehow has no name or the extension is not one that we recognize
# If unknown extension, maybe it's a text file
file.seek(0)
if is_text_file(file):
return file_io_to_text(file)
raise ValueError("Unknown file extension and unknown text encoding")
raise ValueError("Unknown file extension or not recognized as text data")
except Exception as e:
if break_on_unprocessable:
@@ -396,20 +426,93 @@ def extract_file_text(
return ""
def extract_text_and_images(
file: IO[Any],
file_name: str,
pdf_pass: str | None = None,
) -> Tuple[str, List[Tuple[bytes, str]]]:
"""
Primary new function for the updated connector.
Returns (text_content, [(embedded_img_bytes, embedded_img_name), ...]).
"""
try:
# Attempt unstructured if env var is set
if get_unstructured_api_key():
# If the user doesn't want embedded images, unstructured is fine
file.seek(0)
text_content = unstructured_to_text(file, file_name)
return (text_content, [])
extension = get_file_ext(file_name)
# docx example for embedded images
if extension == ".docx":
file.seek(0)
text_content, images = docx_to_text_and_images(file)
return (text_content, images)
# PDF example: we do not show complicated PDF image extraction here
# so we simply extract text for now and skip images.
if extension == ".pdf":
file.seek(0)
text_content, _, images = read_pdf_file(file, pdf_pass, extract_images=True)
return (text_content, images)
# For PPTX, XLSX, EML, etc., we do not show embedded image logic here.
# You can do something similar to docx if needed.
if extension == ".pptx":
file.seek(0)
return (pptx_to_text(file), [])
if extension == ".xlsx":
file.seek(0)
return (xlsx_to_text(file), [])
if extension == ".eml":
file.seek(0)
return (eml_to_text(file), [])
if extension == ".epub":
file.seek(0)
return (epub_to_text(file), [])
if extension == ".html":
file.seek(0)
return (parse_html_page_basic(file), [])
# If we reach here and it's a recognized text extension
if is_text_file_extension(file_name):
file.seek(0)
encoding = detect_encoding(file)
text_content_raw, _ = read_text_file(
file, encoding=encoding, ignore_onyx_metadata=False
)
return (text_content_raw, [])
# If it's an image file or something else, we do not parse embedded images from them
# just return empty text
file.seek(0)
return ("", [])
except Exception as e:
logger.exception(f"Failed to extract text/images from {file_name}: {e}")
return ("", [])
def convert_docx_to_txt(
file: UploadFile, file_store: FileStore, file_path: str
) -> None:
"""
Helper to convert docx to a .txt file in the same filestore.
"""
file.file.seek(0)
docx_content = file.file.read()
doc = Document(BytesIO(docx_content))
doc = DocxDocument(BytesIO(docx_content))
# Extract text from the document
full_text = []
for para in doc.paragraphs:
full_text.append(para.text)
# Join the extracted text
text_content = "\n".join(full_text)
all_paras = [p.text for p in doc.paragraphs]
text_content = "\n".join(all_paras)
txt_file_path = docx_to_txt_filename(file_path)
file_store.save_file(
@@ -422,7 +525,4 @@ def convert_docx_to_txt(
def docx_to_txt_filename(file_path: str) -> str:
"""
Convert a .docx file path to its corresponding .txt file path.
"""
return file_path.rsplit(".", 1)[0] + ".txt"

View File

@@ -0,0 +1,46 @@
"""
Centralized file type validation utilities.
"""
# Standard image MIME types supported by most vision LLMs
IMAGE_MIME_TYPES = [
"image/png",
"image/jpeg",
"image/jpg",
"image/webp",
]
# Image types that should be excluded from processing
EXCLUDED_IMAGE_TYPES = [
"image/bmp",
"image/tiff",
"image/gif",
"image/svg+xml",
]
def is_valid_image_type(mime_type: str) -> bool:
"""
Check if mime_type is a valid image type.
Args:
mime_type: The MIME type to check
Returns:
True if the MIME type is a valid image type, False otherwise
"""
if not mime_type:
return False
return mime_type.startswith("image/") and mime_type not in EXCLUDED_IMAGE_TYPES
def is_supported_by_vision_llm(mime_type: str) -> bool:
"""
Check if this image type can be processed by vision LLMs.
Args:
mime_type: The MIME type to check
Returns:
True if the MIME type is supported by vision LLMs, False otherwise
"""
return mime_type in IMAGE_MIME_TYPES

View File

@@ -0,0 +1,129 @@
import base64
from io import BytesIO
from langchain_core.messages import BaseMessage
from langchain_core.messages import HumanMessage
from langchain_core.messages import SystemMessage
from PIL import Image
from onyx.llm.interfaces import LLM
from onyx.llm.utils import message_to_string
from onyx.prompts.image_analysis import IMAGE_SUMMARIZATION_SYSTEM_PROMPT
from onyx.prompts.image_analysis import IMAGE_SUMMARIZATION_USER_PROMPT
from onyx.utils.logger import setup_logger
logger = setup_logger()
def prepare_image_bytes(image_data: bytes) -> str:
"""Prepare image bytes for summarization.
Resizes image if it's larger than 20MB. Encodes image as a base64 string."""
image_data = _resize_image_if_needed(image_data)
# encode image (base64)
encoded_image = _encode_image_for_llm_prompt(image_data)
return encoded_image
def summarize_image_pipeline(
llm: LLM,
image_data: bytes,
query: str | None = None,
system_prompt: str | None = None,
) -> str:
"""Pipeline to generate a summary of an image.
Resizes images if it is bigger than 20MB. Encodes image as a base64 string.
And finally uses the Default LLM to generate a textual summary of the image."""
# resize image if it's bigger than 20MB
encoded_image = prepare_image_bytes(image_data)
summary = _summarize_image(
encoded_image,
llm,
query,
system_prompt,
)
return summary
def summarize_image_with_error_handling(
llm: LLM | None,
image_data: bytes,
context_name: str,
system_prompt: str = IMAGE_SUMMARIZATION_SYSTEM_PROMPT,
user_prompt_template: str = IMAGE_SUMMARIZATION_USER_PROMPT,
) -> str | None:
"""Wrapper function that handles error cases and configuration consistently.
Args:
llm: The LLM with vision capabilities to use for summarization
image_data: The raw image bytes
context_name: Name or title of the image for context
system_prompt: System prompt to use for the LLM
user_prompt_template: Template for the user prompt, should contain {title} placeholder
Returns:
The image summary text, or None if summarization failed or is disabled
"""
if llm is None:
return None
user_prompt = user_prompt_template.format(title=context_name)
return summarize_image_pipeline(llm, image_data, user_prompt, system_prompt)
def _summarize_image(
encoded_image: str,
llm: LLM,
query: str | None = None,
system_prompt: str | None = None,
) -> str:
"""Use default LLM (if it is multimodal) to generate a summary of an image."""
messages: list[BaseMessage] = []
if system_prompt:
messages.append(SystemMessage(content=system_prompt))
messages.append(
HumanMessage(
content=[
{"type": "text", "text": query},
{"type": "image_url", "image_url": {"url": encoded_image}},
],
),
)
try:
return message_to_string(llm.invoke(messages))
except Exception as e:
raise ValueError(f"Summarization failed. Messages: {messages}") from e
def _encode_image_for_llm_prompt(image_data: bytes) -> str:
"""Getting the base64 string."""
base64_encoded_data = base64.b64encode(image_data).decode("utf-8")
return f"data:image/jpeg;base64,{base64_encoded_data}"
def _resize_image_if_needed(image_data: bytes, max_size_mb: int = 20) -> bytes:
"""Resize image if it's larger than the specified max size in MB."""
max_size_bytes = max_size_mb * 1024 * 1024
if len(image_data) > max_size_bytes:
with Image.open(BytesIO(image_data)) as img:
# Reduce dimensions for better size reduction
img.thumbnail((1024, 1024), Image.Resampling.LANCZOS)
output = BytesIO()
# Save with lower quality for compression
img.save(output, format="JPEG", quality=85)
resized_data = output.getvalue()
return resized_data
return image_data

View File

@@ -0,0 +1,70 @@
from typing import Tuple
from sqlalchemy.orm import Session
from onyx.configs.app_configs import CONTINUE_ON_CONNECTOR_FAILURE
from onyx.configs.constants import FileOrigin
from onyx.connectors.models import Section
from onyx.db.pg_file_store import save_bytes_to_pgfilestore
from onyx.file_processing.image_summarization import summarize_image_with_error_handling
from onyx.llm.interfaces import LLM
from onyx.utils.logger import setup_logger
logger = setup_logger()
def store_image_and_create_section(
db_session: Session,
image_data: bytes,
file_name: str,
display_name: str,
media_type: str = "image/unknown",
llm: LLM | None = None,
file_origin: FileOrigin = FileOrigin.OTHER,
) -> Tuple[Section, str | None]:
"""
Stores an image in PGFileStore and creates a Section object with optional summarization.
Args:
db_session: Database session
image_data: Raw image bytes
file_name: Base identifier for the file
display_name: Human-readable name for the image
media_type: MIME type of the image
llm: Optional LLM with vision capabilities for summarization
file_origin: Origin of the file (e.g., CONFLUENCE, GOOGLE_DRIVE, etc.)
Returns:
Tuple containing:
- Section object with image reference and optional summary text
- The file_name in PGFileStore or None if storage failed
"""
# Storage logic
stored_file_name = None
try:
pgfilestore = save_bytes_to_pgfilestore(
db_session=db_session,
raw_bytes=image_data,
media_type=media_type,
identifier=file_name,
display_name=display_name,
file_origin=file_origin,
)
stored_file_name = pgfilestore.file_name
except Exception as e:
logger.error(f"Failed to store image: {e}")
if not CONTINUE_ON_CONNECTOR_FAILURE:
raise
return Section(text=""), None
# Summarization logic
summary_text = ""
if llm:
summary_text = (
summarize_image_with_error_handling(llm, image_data, display_name) or ""
)
return (
Section(text=summary_text, image_file_name=stored_file_name),
stored_file_name,
)