Separate out indexing-time image analysis into new phase (#4228)

* Separate out indexing-time image analysis into new phase

* looking good

* k

* k
pablonyx 2025-03-12 15:26:05 -07:00 committed by GitHub
parent 5883336d5e
commit f87e559cc4
66 changed files with 777 additions and 657 deletions
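
At the core of this commit is splitting the old Section model into TextSection and ImageSection, so connectors emit typed sections and image analysis can run as its own indexing phase instead of inline at extraction time. A minimal sketch of the new shapes, assuming Pydantic models (the real definitions in onyx/connectors/models.py carry more fields and validation):

    from pydantic import BaseModel


    class TextSection(BaseModel):
        # Carries extracted text; indexed directly
        text: str | None = None
        link: str | None = None


    class ImageSection(BaseModel):
        # References an image stored in PGFileStore; its text/summary is
        # produced later by the dedicated image-analysis phase
        image_file_name: str
        link: str | None = None

Document.sections accordingly becomes list[TextSection | ImageSection], which is what drives the isinstance checks and casts that appear throughout the diff below.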

View File

@ -28,6 +28,7 @@ from onyx.connectors.models import ConnectorCheckpoint
from onyx.connectors.models import ConnectorFailure
from onyx.connectors.models import Document
from onyx.connectors.models import IndexAttemptMetadata
from onyx.connectors.models import TextSection
from onyx.db.connector_credential_pair import get_connector_credential_pair_from_id
from onyx.db.connector_credential_pair import get_last_successful_attempt_time
from onyx.db.connector_credential_pair import update_connector_credential_pair
@ -154,14 +155,12 @@ def strip_null_characters(doc_batch: list[Document]) -> list[Document]:
)
for section in cleaned_doc.sections:
if section.link and "\x00" in section.link:
logger.warning(
f"NUL characters found in document link for document: {cleaned_doc.id}"
)
if section.link is not None:
section.link = section.link.replace("\x00", "")
# since text can be longer, just replace to avoid double scan
section.text = section.text.replace("\x00", "")
if isinstance(section, TextSection) and section.text is not None:
section.text = section.text.replace("\x00", "")
cleaned_batch.append(cleaned_doc)
@ -479,7 +478,11 @@ def _run_indexing(
doc_size = 0
for section in doc.sections:
doc_size += len(section.text)
if (
isinstance(section, TextSection)
and section.text is not None
):
doc_size += len(section.text)
if doc_size > INDEXING_SIZE_WARNING_THRESHOLD:
logger.warning(
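
Because an ImageSection carries no text, the size accounting above now guards on type. A compact equivalent of that loop, with a hypothetical helper name:

    from onyx.connectors.models import Document
    from onyx.connectors.models import TextSection


    def _document_text_size(doc: Document) -> int:
        # Only TextSections count toward the size warning; ImageSections
        # have no text until the analysis phase runs
        return sum(
            len(section.text)
            for section in doc.sections
            if isinstance(section, TextSection) and section.text is not None
        )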

View File

@ -4,6 +4,7 @@ from concurrent.futures import Future
from concurrent.futures import ThreadPoolExecutor
from io import BytesIO
from typing import Any
from typing import cast
import requests
from pyairtable import Api as AirtableApi
@ -16,7 +17,8 @@ from onyx.configs.constants import DocumentSource
from onyx.connectors.interfaces import GenerateDocumentsOutput
from onyx.connectors.interfaces import LoadConnector
from onyx.connectors.models import Document
from onyx.connectors.models import Section
from onyx.connectors.models import ImageSection
from onyx.connectors.models import TextSection
from onyx.file_processing.extract_file_text import extract_file_text
from onyx.file_processing.extract_file_text import get_file_ext
from onyx.utils.logger import setup_logger
@ -267,7 +269,7 @@ class AirtableConnector(LoadConnector):
table_id: str,
view_id: str | None,
record_id: str,
) -> tuple[list[Section], dict[str, str | list[str]]]:
) -> tuple[list[TextSection], dict[str, str | list[str]]]:
"""
Process a single Airtable field and return sections or metadata.
@ -305,7 +307,7 @@ class AirtableConnector(LoadConnector):
# Otherwise, create relevant sections
sections = [
Section(
TextSection(
link=link,
text=(
f"{field_name}:\n"
@ -340,7 +342,7 @@ class AirtableConnector(LoadConnector):
table_name = table_schema.name
record_id = record["id"]
fields = record["fields"]
sections: list[Section] = []
sections: list[TextSection] = []
metadata: dict[str, str | list[str]] = {}
# Get primary field value if it exists
@ -384,7 +386,7 @@ class AirtableConnector(LoadConnector):
return Document(
id=f"airtable__{record_id}",
sections=sections,
sections=(cast(list[TextSection | ImageSection], sections)),
source=DocumentSource.AIRTABLE,
semantic_identifier=semantic_id,
metadata=metadata,
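
The cast is needed because list is invariant under mypy: a list[TextSection] is not accepted where list[TextSection | ImageSection] is expected, even though every element is valid. The same idiom recurs in the Discord, Discourse, Fireflies, and Gmail connectors below. A sketch:

    from typing import cast

    from onyx.connectors.models import ImageSection
    from onyx.connectors.models import TextSection

    sections: list[TextSection] = [TextSection(link="https://example.com", text="hello")]

    # Tell the checker the elements satisfy the union element type
    typed_sections = cast(list[TextSection | ImageSection], sections)

The Confluence connector below takes the other idiomatic route, annotating its list as list[TextSection | ImageSection] at creation so no cast is needed.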

View File

@ -10,7 +10,7 @@ from onyx.connectors.interfaces import LoadConnector
from onyx.connectors.interfaces import PollConnector
from onyx.connectors.interfaces import SecondsSinceUnixEpoch
from onyx.connectors.models import Document
from onyx.connectors.models import Section
from onyx.connectors.models import TextSection
from onyx.utils.logger import setup_logger
logger = setup_logger()
@ -82,7 +82,7 @@ class AsanaConnector(LoadConnector, PollConnector):
logger.debug(f"Converting Asana task {task.id} to Document")
return Document(
id=task.id,
sections=[Section(link=task.link, text=task.text)],
sections=[TextSection(link=task.link, text=task.text)],
doc_updated_at=task.last_modified,
source=DocumentSource.ASANA,
semantic_identifier=task.title,

View File

@ -20,7 +20,7 @@ from onyx.connectors.interfaces import PollConnector
from onyx.connectors.interfaces import SecondsSinceUnixEpoch
from onyx.connectors.models import ConnectorMissingCredentialError
from onyx.connectors.models import Document
from onyx.connectors.models import Section
from onyx.connectors.models import TextSection
from onyx.file_processing.html_utils import parse_html_page_basic
from onyx.utils.logger import setup_logger
from onyx.utils.retry_wrapper import retry_builder
@ -221,7 +221,7 @@ def _get_forums(
def _translate_forum_to_doc(af: AxeroForum) -> Document:
doc = Document(
id=af.doc_id,
sections=[Section(link=af.link, text=reply) for reply in af.responses],
sections=[TextSection(link=af.link, text=reply) for reply in af.responses],
source=DocumentSource.AXERO,
semantic_identifier=af.title,
doc_updated_at=af.last_update,
@ -244,7 +244,7 @@ def _translate_content_to_doc(content: dict) -> Document:
doc = Document(
id="AXERO_" + str(content["ContentID"]),
sections=[Section(link=content["ContentURL"], text=page_text)],
sections=[TextSection(link=content["ContentURL"], text=page_text)],
source=DocumentSource.AXERO,
semantic_identifier=content["ContentTitle"],
doc_updated_at=time_str_to_utc(content["DateUpdated"]),

View File

@ -25,7 +25,7 @@ from onyx.connectors.interfaces import PollConnector
from onyx.connectors.interfaces import SecondsSinceUnixEpoch
from onyx.connectors.models import ConnectorMissingCredentialError
from onyx.connectors.models import Document
from onyx.connectors.models import Section
from onyx.connectors.models import TextSection
from onyx.file_processing.extract_file_text import extract_file_text
from onyx.utils.logger import setup_logger
@ -208,7 +208,7 @@ class BlobStorageConnector(LoadConnector, PollConnector):
batch.append(
Document(
id=f"{self.bucket_type}:{self.bucket_name}:{obj['Key']}",
sections=[Section(link=link, text=text)],
sections=[TextSection(link=link, text=text)],
source=DocumentSource(self.bucket_type.value),
semantic_identifier=name,
doc_updated_at=last_modified,
@ -341,7 +341,14 @@ if __name__ == "__main__":
print("Sections:")
for section in doc.sections:
print(f" - Link: {section.link}")
print(f" - Text: {section.text[:100]}...")
if isinstance(section, TextSection) and section.text is not None:
print(f" - Text: {section.text[:100]}...")
elif (
hasattr(section, "image_file_name") and section.image_file_name
):
print(f" - Image: {section.image_file_name}")
else:
print("Error: Unknown section type")
print("---")
break
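
The hasattr probe works, but since ImageSection is now a concrete model, isinstance expresses the same dispatch more directly. A drop-in alternative for the loop above (doc comes from the surrounding smoke test):

    from onyx.connectors.models import ImageSection
    from onyx.connectors.models import TextSection

    for section in doc.sections:
        if isinstance(section, TextSection) and section.text is not None:
            print(f" - Text: {section.text[:100]}...")
        elif isinstance(section, ImageSection):
            print(f" - Image: {section.image_file_name}")
        else:
            print("Error: Unknown section type")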

View File

@ -18,7 +18,7 @@ from onyx.connectors.interfaces import PollConnector
from onyx.connectors.interfaces import SecondsSinceUnixEpoch
from onyx.connectors.models import ConnectorMissingCredentialError
from onyx.connectors.models import Document
from onyx.connectors.models import Section
from onyx.connectors.models import TextSection
from onyx.file_processing.html_utils import parse_html_page_basic
@ -81,7 +81,7 @@ class BookstackConnector(LoadConnector, PollConnector):
)
return Document(
id="book__" + str(book.get("id")),
sections=[Section(link=url, text=text)],
sections=[TextSection(link=url, text=text)],
source=DocumentSource.BOOKSTACK,
semantic_identifier="Book: " + title,
title=title,
@ -110,7 +110,7 @@ class BookstackConnector(LoadConnector, PollConnector):
)
return Document(
id="chapter__" + str(chapter.get("id")),
sections=[Section(link=url, text=text)],
sections=[TextSection(link=url, text=text)],
source=DocumentSource.BOOKSTACK,
semantic_identifier="Chapter: " + title,
title=title,
@ -134,7 +134,7 @@ class BookstackConnector(LoadConnector, PollConnector):
)
return Document(
id="shelf:" + str(shelf.get("id")),
sections=[Section(link=url, text=text)],
sections=[TextSection(link=url, text=text)],
source=DocumentSource.BOOKSTACK,
semantic_identifier="Shelf: " + title,
title=title,
@ -167,7 +167,7 @@ class BookstackConnector(LoadConnector, PollConnector):
time.sleep(0.1)
return Document(
id="page:" + page_id,
sections=[Section(link=url, text=text)],
sections=[TextSection(link=url, text=text)],
source=DocumentSource.BOOKSTACK,
semantic_identifier="Page: " + str(title),
title=str(title),

View File

@ -17,7 +17,7 @@ from onyx.connectors.interfaces import SecondsSinceUnixEpoch
from onyx.connectors.models import BasicExpertInfo
from onyx.connectors.models import ConnectorMissingCredentialError
from onyx.connectors.models import Document
from onyx.connectors.models import Section
from onyx.connectors.models import TextSection
from onyx.utils.retry_wrapper import retry_builder
@ -62,11 +62,11 @@ class ClickupConnector(LoadConnector, PollConnector):
return response.json()
def _get_task_comments(self, task_id: str) -> list[Section]:
def _get_task_comments(self, task_id: str) -> list[TextSection]:
url_endpoint = f"/task/{task_id}/comment"
response = self._make_request(url_endpoint)
comments = [
Section(
TextSection(
link=f'https://app.clickup.com/t/{task_id}?comment={comment_dict["id"]}',
text=comment_dict["comment_text"],
)
@ -133,7 +133,7 @@ class ClickupConnector(LoadConnector, PollConnector):
],
title=task["name"],
sections=[
Section(
TextSection(
link=task["url"],
text=(
task["markdown_description"]

View File

@ -33,9 +33,9 @@ from onyx.connectors.interfaces import SlimConnector
from onyx.connectors.models import BasicExpertInfo
from onyx.connectors.models import ConnectorMissingCredentialError
from onyx.connectors.models import Document
from onyx.connectors.models import Section
from onyx.connectors.models import ImageSection
from onyx.connectors.models import SlimDocument
from onyx.connectors.vision_enabled_connector import VisionEnabledConnector
from onyx.connectors.models import TextSection
from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface
from onyx.utils.logger import setup_logger
@ -85,7 +85,6 @@ class ConfluenceConnector(
PollConnector,
SlimConnector,
CredentialsConnector,
VisionEnabledConnector,
):
def __init__(
self,
@ -116,9 +115,6 @@ class ConfluenceConnector(
self._confluence_client: OnyxConfluence | None = None
self._fetched_titles: set[str] = set()
# Initialize vision LLM using the mixin
self.initialize_vision_llm()
# Remove trailing slash from wiki_base if present
self.wiki_base = wiki_base.rstrip("/")
"""
@ -245,12 +241,16 @@ class ConfluenceConnector(
)
# Create the main section for the page content
sections = [Section(text=page_content, link=page_url)]
sections: list[TextSection | ImageSection] = [
TextSection(text=page_content, link=page_url)
]
# Process comments if available
comment_text = self._get_comment_string_for_page_id(page_id)
if comment_text:
sections.append(Section(text=comment_text, link=f"{page_url}#comments"))
sections.append(
TextSection(text=comment_text, link=f"{page_url}#comments")
)
# Process attachments
if "children" in page and "attachment" in page["children"]:
@ -264,21 +264,26 @@ class ConfluenceConnector(
self.confluence_client,
attachment,
page_id,
page_title,
self.image_analysis_llm,
)
if result.text:
if result and result.text:
# Create a section for the attachment text
attachment_section = Section(
attachment_section = TextSection(
text=result.text,
link=f"{page_url}#attachment-{attachment['id']}",
)
sections.append(attachment_section)
elif result and result.file_name:
# Create an ImageSection for image attachments
image_section = ImageSection(
link=f"{page_url}#attachment-{attachment['id']}",
image_file_name=result.file_name,
)
sections.append(attachment_section)
elif result.error:
sections.append(image_section)
else:
logger.warning(
f"Error processing attachment '{attachment.get('title')}': {result.error}"
f"Error processing attachment '{attachment.get('title')}':",
f"{result.error if result else 'Unknown error'}",
)
# Extract metadata
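
One subtlety in the new branching: for image attachments, process_attachment (see the utils changes below) returns text="" together with the stored file_name, so the falsy text deliberately routes image results past the TextSection branch into the ImageSection one. Illustratively, with hypothetical values:

    text, file_name = "", "attachment_12345"  # what a stored image comes back as

    if text:             # "" is falsy, so the TextSection branch is skipped
        kind = "TextSection"
    elif file_name:      # taken for stored images
        kind = "ImageSection"
    else:
        kind = "error"

    print(kind)  # ImageSection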
@ -349,7 +354,7 @@ class ConfluenceConnector(
# Now get attachments for that page:
attachment_query = self._construct_attachment_query(page["id"])
# We'll use the page's XML to provide context if we summarize an image
confluence_xml = page.get("body", {}).get("storage", {}).get("value", "")
page.get("body", {}).get("storage", {}).get("value", "")
for attachment in self.confluence_client.paginated_cql_retrieval(
cql=attachment_query,
@ -357,7 +362,7 @@ class ConfluenceConnector(
):
attachment["metadata"].get("mediaType", "")
if not validate_attachment_filetype(
attachment, self.image_analysis_llm
attachment,
):
continue
@ -368,23 +373,25 @@ class ConfluenceConnector(
confluence_client=self.confluence_client,
attachment=attachment,
page_id=page["id"],
page_context=confluence_xml,
llm=self.image_analysis_llm,
)
if response is None:
continue
content_text, file_storage_name = response
object_url = build_confluence_document_id(
self.wiki_base, attachment["_links"]["webui"], self.is_cloud
)
if content_text:
doc.sections.append(
Section(
TextSection(
text=content_text,
link=object_url,
)
)
elif file_storage_name:
doc.sections.append(
ImageSection(
link=object_url,
image_file_name=file_storage_name,
)
)
@ -464,7 +471,7 @@ class ConfluenceConnector(
# If you skip images, you'll skip them in the permission sync
attachment["metadata"].get("mediaType", "")
if not validate_attachment_filetype(
attachment, self.image_analysis_llm
attachment,
):
continue

View File

@ -36,7 +36,6 @@ from onyx.db.pg_file_store import upsert_pgfilestore
from onyx.file_processing.extract_file_text import extract_file_text
from onyx.file_processing.file_validation import is_valid_image_type
from onyx.file_processing.image_utils import store_image_and_create_section
from onyx.llm.interfaces import LLM
from onyx.utils.logger import setup_logger
logger = setup_logger()
@ -54,17 +53,16 @@ class TokenResponse(BaseModel):
def validate_attachment_filetype(
attachment: dict[str, Any], llm: LLM | None = None
attachment: dict[str, Any],
) -> bool:
"""
Validates if the attachment is a supported file type.
If LLM is provided, also checks if it's an image that can be processed.
"""
attachment.get("metadata", {})
media_type = attachment.get("metadata", {}).get("mediaType", "")
if media_type.startswith("image/"):
return llm is not None and is_valid_image_type(media_type)
return is_valid_image_type(media_type)
# For non-image files, check if we support the extension
title = attachment.get("title", "")
@ -114,19 +112,16 @@ def process_attachment(
confluence_client: "OnyxConfluence",
attachment: dict[str, Any],
parent_content_id: str | None,
page_context: str,
llm: LLM | None,
) -> AttachmentProcessingResult:
"""
Processes a Confluence attachment. If it's a document, extracts text,
or if it's an image and an LLM is available, summarizes it. Returns a structured result.
or if it's an image, stores it for later analysis. Returns a structured result.
"""
try:
# Get the media type from the attachment metadata
media_type = attachment.get("metadata", {}).get("mediaType", "")
# Validate the attachment type
if not validate_attachment_filetype(attachment, llm):
if not validate_attachment_filetype(attachment):
return AttachmentProcessingResult(
text=None,
file_name=None,
@ -143,7 +138,7 @@ def process_attachment(
attachment_size = attachment["extensions"]["fileSize"]
if not media_type.startswith("image/") or not llm:
if not media_type.startswith("image/"):
if attachment_size > CONFLUENCE_CONNECTOR_ATTACHMENT_SIZE_THRESHOLD:
logger.warning(
f"Skipping {attachment_link} due to size. "
@ -181,10 +176,10 @@ def process_attachment(
text=None, file_name=None, error="attachment.content is None"
)
# Process image attachments with LLM if available
if media_type.startswith("image/") and llm:
# Process image attachments
if media_type.startswith("image/"):
return _process_image_attachment(
confluence_client, attachment, page_context, llm, raw_bytes, media_type
confluence_client, attachment, raw_bytes, media_type
)
# Process document attachments
@ -217,12 +212,10 @@ def process_attachment(
def _process_image_attachment(
confluence_client: "OnyxConfluence",
attachment: dict[str, Any],
page_context: str,
llm: LLM,
raw_bytes: bytes,
media_type: str,
) -> AttachmentProcessingResult:
"""Process an image attachment by saving it and generating a summary."""
"""Process an image attachment by saving it without generating a summary."""
try:
# Use the standardized image storage and section creation
with get_session_with_current_tenant() as db_session:
@ -232,15 +225,14 @@ def _process_image_attachment(
file_name=Path(attachment["id"]).name,
display_name=attachment["title"],
media_type=media_type,
llm=llm,
file_origin=FileOrigin.CONNECTOR,
)
logger.info(f"Stored image attachment with file name: {file_name}")
return AttachmentProcessingResult(
text=section.text, file_name=file_name, error=None
)
# Return empty text but include the file_name for later processing
return AttachmentProcessingResult(text="", file_name=file_name, error=None)
except Exception as e:
msg = f"Image summarization failed for {attachment['title']}: {e}"
msg = f"Image storage failed for {attachment['title']}: {e}"
logger.error(msg, exc_info=e)
return AttachmentProcessingResult(text=None, file_name=None, error=msg)
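
AttachmentProcessingResult now encodes three outcomes: extracted document text, a stored image awaiting the analysis phase, or an error. A sketch of the contract, assuming a Pydantic model (the actual definition lives elsewhere in this module):

    from pydantic import BaseModel


    class AttachmentProcessingResult(BaseModel):
        # Extracted text for document attachments; "" for stored images
        text: str | None
        # PGFileStore name of a stored image, consumed by the later analysis phase
        file_name: str | None
        # Set on failure; text and file_name are both None in that case
        error: str | None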
@ -302,13 +294,11 @@ def convert_attachment_to_content(
confluence_client: "OnyxConfluence",
attachment: dict[str, Any],
page_id: str,
page_context: str,
llm: LLM | None,
) -> tuple[str | None, str | None] | None:
"""
Facade function which:
1. Validates attachment type
2. Extracts or summarizes content
2. Extracts content or stores image for later processing
3. Returns (content_text, stored_file_name) or None if we should skip it
"""
media_type = attachment["metadata"]["mediaType"]
@ -319,9 +309,7 @@ def convert_attachment_to_content(
)
return None
result = process_attachment(
confluence_client, attachment, page_id, page_context, llm
)
result = process_attachment(confluence_client, attachment, page_id)
if result.error is not None:
logger.warning(
f"Attachment {attachment['title']} encountered error: {result.error}"

View File

@ -4,6 +4,7 @@ from collections.abc import Iterable
from datetime import datetime
from datetime import timezone
from typing import Any
from typing import cast
from discord import Client
from discord.channel import TextChannel
@ -20,7 +21,8 @@ from onyx.connectors.interfaces import PollConnector
from onyx.connectors.interfaces import SecondsSinceUnixEpoch
from onyx.connectors.models import ConnectorMissingCredentialError
from onyx.connectors.models import Document
from onyx.connectors.models import Section
from onyx.connectors.models import ImageSection
from onyx.connectors.models import TextSection
from onyx.utils.logger import setup_logger
logger = setup_logger()
@ -32,7 +34,7 @@ _SNIPPET_LENGTH = 30
def _convert_message_to_document(
message: DiscordMessage,
sections: list[Section],
sections: list[TextSection],
) -> Document:
"""
Convert a discord message to a document
@ -78,7 +80,7 @@ def _convert_message_to_document(
semantic_identifier=semantic_identifier,
doc_updated_at=message.edited_at,
title=title,
sections=sections,
sections=(cast(list[TextSection | ImageSection], sections)),
metadata=metadata,
)
@ -123,8 +125,8 @@ async def _fetch_documents_from_channel(
if channel_message.type != MessageType.default:
continue
sections: list[Section] = [
Section(
sections: list[TextSection] = [
TextSection(
text=channel_message.content,
link=channel_message.jump_url,
)
@ -142,7 +144,7 @@ async def _fetch_documents_from_channel(
continue
sections = [
Section(
TextSection(
text=thread_message.content,
link=thread_message.jump_url,
)
@ -160,7 +162,7 @@ async def _fetch_documents_from_channel(
continue
sections = [
Section(
TextSection(
text=thread_message.content,
link=thread_message.jump_url,
)

View File

@ -3,6 +3,7 @@ import urllib.parse
from datetime import datetime
from datetime import timezone
from typing import Any
from typing import cast
import requests
from pydantic import BaseModel
@ -20,7 +21,8 @@ from onyx.connectors.interfaces import SecondsSinceUnixEpoch
from onyx.connectors.models import BasicExpertInfo
from onyx.connectors.models import ConnectorMissingCredentialError
from onyx.connectors.models import Document
from onyx.connectors.models import Section
from onyx.connectors.models import ImageSection
from onyx.connectors.models import TextSection
from onyx.file_processing.html_utils import parse_html_page_basic
from onyx.utils.logger import setup_logger
from onyx.utils.retry_wrapper import retry_builder
@ -112,7 +114,7 @@ class DiscourseConnector(PollConnector):
responders.append(BasicExpertInfo(display_name=responder_name))
sections.append(
Section(link=topic_url, text=parse_html_page_basic(post["cooked"]))
TextSection(link=topic_url, text=parse_html_page_basic(post["cooked"]))
)
category_name = self.category_id_map.get(topic["category_id"])
@ -129,7 +131,7 @@ class DiscourseConnector(PollConnector):
doc = Document(
id="_".join([DocumentSource.DISCOURSE.value, str(topic["id"])]),
sections=sections,
sections=cast(list[TextSection | ImageSection], sections),
source=DocumentSource.DISCOURSE,
semantic_identifier=topic["title"],
doc_updated_at=time_str_to_utc(topic["last_posted_at"]),

View File

@ -19,7 +19,7 @@ from onyx.connectors.interfaces import SecondsSinceUnixEpoch
from onyx.connectors.models import BasicExpertInfo
from onyx.connectors.models import ConnectorMissingCredentialError
from onyx.connectors.models import Document
from onyx.connectors.models import Section
from onyx.connectors.models import TextSection
from onyx.file_processing.html_utils import parse_html_page_basic
from onyx.utils.retry_wrapper import retry_builder
@ -158,7 +158,7 @@ class Document360Connector(LoadConnector, PollConnector):
document = Document(
id=article_details["id"],
sections=[Section(link=doc_link, text=doc_text)],
sections=[TextSection(link=doc_link, text=doc_text)],
source=DocumentSource.DOCUMENT360,
semantic_identifier=article_details["title"],
doc_updated_at=updated_at,

View File

@ -19,7 +19,7 @@ from onyx.connectors.interfaces import PollConnector
from onyx.connectors.interfaces import SecondsSinceUnixEpoch
from onyx.connectors.models import ConnectorMissingCredentialError
from onyx.connectors.models import Document
from onyx.connectors.models import Section
from onyx.connectors.models import TextSection
from onyx.file_processing.extract_file_text import extract_file_text
from onyx.utils.logger import setup_logger
@ -108,7 +108,7 @@ class DropboxConnector(LoadConnector, PollConnector):
batch.append(
Document(
id=f"doc:{entry.id}",
sections=[Section(link=link, text=text)],
sections=[TextSection(link=link, text=text)],
source=DocumentSource.DROPBOX,
semantic_identifier=entry.name,
doc_updated_at=modified_time,

View File

@ -24,7 +24,7 @@ from onyx.connectors.interfaces import SecondsSinceUnixEpoch
from onyx.connectors.models import BasicExpertInfo
from onyx.connectors.models import ConnectorMissingCredentialError
from onyx.connectors.models import Document
from onyx.connectors.models import Section
from onyx.connectors.models import TextSection
from onyx.file_processing.extract_file_text import detect_encoding
from onyx.file_processing.extract_file_text import extract_file_text
from onyx.file_processing.extract_file_text import get_file_ext
@ -111,7 +111,7 @@ def _process_egnyte_file(
# Create the document
return Document(
id=f"egnyte-{file_metadata['entry_id']}",
sections=[Section(text=file_content_raw.strip(), link=web_url)],
sections=[TextSection(text=file_content_raw.strip(), link=web_url)],
source=DocumentSource.EGNYTE,
semantic_identifier=file_name,
metadata=metadata,

View File

@ -16,8 +16,8 @@ from onyx.connectors.interfaces import GenerateDocumentsOutput
from onyx.connectors.interfaces import LoadConnector
from onyx.connectors.models import BasicExpertInfo
from onyx.connectors.models import Document
from onyx.connectors.models import Section
from onyx.connectors.vision_enabled_connector import VisionEnabledConnector
from onyx.connectors.models import ImageSection
from onyx.connectors.models import TextSection
from onyx.db.engine import get_session_with_current_tenant
from onyx.db.pg_file_store import get_pgfilestore_by_file_name
from onyx.file_processing.extract_file_text import extract_text_and_images
@ -26,7 +26,6 @@ from onyx.file_processing.extract_file_text import is_valid_file_ext
from onyx.file_processing.extract_file_text import load_files_from_zip
from onyx.file_processing.image_utils import store_image_and_create_section
from onyx.file_store.file_store import get_default_file_store
from onyx.llm.interfaces import LLM
from onyx.utils.logger import setup_logger
logger = setup_logger()
@ -59,32 +58,44 @@ def _read_files_and_metadata(
def _create_image_section(
llm: LLM | None,
image_data: bytes,
db_session: Session,
parent_file_name: str,
display_name: str,
link: str | None = None,
idx: int = 0,
) -> tuple[Section, str | None]:
) -> tuple[ImageSection, str | None]:
"""
Create a Section object for a single image and store the image in PGFileStore.
If summarization is enabled and we have an LLM, summarize the image.
Creates an ImageSection for an image file or embedded image.
Stores the image in PGFileStore but does not generate a summary.
Args:
image_data: Raw image bytes
db_session: Database session
parent_file_name: Name of the parent file (for embedded images)
display_name: Display name for the image
idx: Index for embedded images
Returns:
tuple: (Section object, file_name in PGFileStore or None if storage failed)
Tuple of (ImageSection, stored_file_name or None)
"""
# Create a unique file name for the embedded image
file_name = f"{parent_file_name}_embedded_{idx}"
# Create a unique identifier for the image
file_name = f"{parent_file_name}_embedded_{idx}" if idx > 0 else parent_file_name
# Use the standardized utility to store the image and create a section
return store_image_and_create_section(
db_session=db_session,
image_data=image_data,
file_name=file_name,
display_name=display_name,
llm=llm,
file_origin=FileOrigin.OTHER,
)
# Store the image and create a section
try:
section, stored_file_name = store_image_and_create_section(
db_session=db_session,
image_data=image_data,
file_name=file_name,
display_name=display_name,
link=link,
file_origin=FileOrigin.CONNECTOR,
)
return section, stored_file_name
except Exception as e:
logger.error(f"Failed to store image {display_name}: {e}")
raise e
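
With the llm parameter gone, store_image_and_create_section only persists the bytes and returns an ImageSection plus the stored file name. A hedged usage sketch of the new helper, with hypothetical file names:

    from onyx.db.engine import get_session_with_current_tenant

    with open("diagram.png", "rb") as f:
        image_bytes = f.read()

    with get_session_with_current_tenant() as db_session:
        # Stores the image in PGFileStore; no summarization happens here anymore
        section, stored_name = _create_image_section(
            image_data=image_bytes,
            db_session=db_session,
            parent_file_name="report.docx",
            display_name="report.docx - image 1",
            idx=1,
        )

    print(section.image_file_name, stored_name)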
def _process_file(
@ -93,12 +104,16 @@ def _process_file(
metadata: dict[str, Any] | None,
pdf_pass: str | None,
db_session: Session,
llm: LLM | None,
) -> list[Document]:
"""
Processes a single file, returning a list of Documents (typically one).
Also handles embedded images if 'EMBEDDED_IMAGE_EXTRACTION_ENABLED' is true.
Process a file and return a list of Documents.
For images, creates ImageSection objects without summarization.
For documents with embedded images, extracts and stores the images.
"""
if metadata is None:
metadata = {}
# Get file extension and determine file type
extension = get_file_ext(file_name)
# Fetch the DB record so we know the ID for internal URL
@ -114,8 +129,6 @@ def _process_file(
return []
# Prepare doc metadata
if metadata is None:
metadata = {}
file_display_name = metadata.get("file_display_name") or os.path.basename(file_name)
# Timestamps
@ -158,6 +171,7 @@ def _process_file(
"title",
"connector_type",
"pdf_password",
"mime_type",
]
}
@ -170,33 +184,45 @@ def _process_file(
title = metadata.get("title") or file_display_name
# 1) If the file itself is an image, handle that scenario quickly
IMAGE_EXTENSIONS = {".jpg", ".jpeg", ".png", ".webp"}
if extension in IMAGE_EXTENSIONS:
# Summarize or produce empty doc
if extension in LoadConnector.IMAGE_EXTENSIONS:
# Read the image data
image_data = file.read()
image_section, _ = _create_image_section(
llm, image_data, db_session, pg_record.file_name, title
)
return [
Document(
id=doc_id,
sections=[image_section],
source=source_type,
semantic_identifier=file_display_name,
title=title,
doc_updated_at=final_time_updated,
primary_owners=p_owners,
secondary_owners=s_owners,
metadata=metadata_tags,
)
]
if not image_data:
logger.warning(f"Empty image file: {file_name}")
return []
# 2) Otherwise: text-based approach. Possibly with embedded images if enabled.
# (For example .docx with inline images).
# Create an ImageSection for the image
try:
section, _ = _create_image_section(
image_data=image_data,
db_session=db_session,
parent_file_name=pg_record.file_name,
display_name=title,
)
return [
Document(
id=doc_id,
sections=[section],
source=source_type,
semantic_identifier=file_display_name,
title=title,
doc_updated_at=final_time_updated,
primary_owners=p_owners,
secondary_owners=s_owners,
metadata=metadata_tags,
)
]
except Exception as e:
logger.error(f"Failed to process image file {file_name}: {e}")
return []
# 2) Otherwise: text-based approach. Possibly with embedded images.
file.seek(0)
text_content = ""
embedded_images: list[tuple[bytes, str]] = []
# Extract text and images from the file
text_content, embedded_images = extract_text_and_images(
file=file,
file_name=file_name,
@ -204,24 +230,29 @@ def _process_file(
)
# Build sections: first the text as a single Section
sections = []
sections: list[TextSection | ImageSection] = []
link_in_meta = metadata.get("link")
if text_content.strip():
sections.append(Section(link=link_in_meta, text=text_content.strip()))
sections.append(TextSection(link=link_in_meta, text=text_content.strip()))
# Then any extracted images from docx, etc.
for idx, (img_data, img_name) in enumerate(embedded_images, start=1):
# Store each embedded image as a separate file in PGFileStore
# and create a section with the image summary
image_section, _ = _create_image_section(
llm,
img_data,
db_session,
pg_record.file_name,
f"{title} - image {idx}",
idx,
)
sections.append(image_section)
# and create a section with the image reference
try:
image_section, _ = _create_image_section(
image_data=img_data,
db_session=db_session,
parent_file_name=pg_record.file_name,
display_name=f"{title} - image {idx}",
idx=idx,
)
sections.append(image_section)
except Exception as e:
logger.warning(
f"Failed to process embedded image {idx} in {file_name}: {e}"
)
return [
Document(
id=doc_id,
@ -237,10 +268,10 @@ def _process_file(
]
class LocalFileConnector(LoadConnector, VisionEnabledConnector):
class LocalFileConnector(LoadConnector):
"""
Connector that reads files from Postgres and yields Documents, including
optional embedded image extraction.
embedded image extraction without summarization.
"""
def __init__(
@ -252,9 +283,6 @@ class LocalFileConnector(LoadConnector, VisionEnabledConnector):
self.batch_size = batch_size
self.pdf_pass: str | None = None
# Initialize vision LLM using the mixin
self.initialize_vision_llm()
def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
self.pdf_pass = credentials.get("pdf_password")
@ -286,7 +314,6 @@ class LocalFileConnector(LoadConnector, VisionEnabledConnector):
metadata=metadata,
pdf_pass=self.pdf_pass,
db_session=db_session,
llm=self.image_analysis_llm,
)
documents.extend(new_docs)

View File

@ -1,6 +1,7 @@
from collections.abc import Iterator
from datetime import datetime
from datetime import timezone
from typing import cast
from typing import List
import requests
@ -14,7 +15,8 @@ from onyx.connectors.interfaces import SecondsSinceUnixEpoch
from onyx.connectors.models import BasicExpertInfo
from onyx.connectors.models import ConnectorMissingCredentialError
from onyx.connectors.models import Document
from onyx.connectors.models import Section
from onyx.connectors.models import ImageSection
from onyx.connectors.models import TextSection
from onyx.utils.logger import setup_logger
logger = setup_logger()
@ -45,7 +47,7 @@ _FIREFLIES_API_QUERY = """
def _create_doc_from_transcript(transcript: dict) -> Document | None:
sections: List[Section] = []
sections: List[TextSection] = []
current_speaker_name = None
current_link = ""
current_text = ""
@ -57,7 +59,7 @@ def _create_doc_from_transcript(transcript: dict) -> Document | None:
if sentence["speaker_name"] != current_speaker_name:
if current_speaker_name is not None:
sections.append(
Section(
TextSection(
link=current_link,
text=current_text.strip(),
)
@ -71,7 +73,7 @@ def _create_doc_from_transcript(transcript: dict) -> Document | None:
# Sometimes these links (links with a timestamp) do not work; it is a bug with Fireflies.

sections.append(
Section(
TextSection(
link=current_link,
text=current_text.strip(),
)
@ -94,7 +96,7 @@ def _create_doc_from_transcript(transcript: dict) -> Document | None:
return Document(
id=fireflies_id,
sections=sections,
sections=cast(list[TextSection | ImageSection], sections),
source=DocumentSource.FIREFLIES,
semantic_identifier=meeting_title,
metadata={},

View File

@ -14,7 +14,7 @@ from onyx.connectors.interfaces import PollConnector
from onyx.connectors.interfaces import SecondsSinceUnixEpoch
from onyx.connectors.models import ConnectorMissingCredentialError
from onyx.connectors.models import Document
from onyx.connectors.models import Section
from onyx.connectors.models import TextSection
from onyx.file_processing.html_utils import parse_html_page_basic
from onyx.utils.logger import setup_logger
@ -133,7 +133,7 @@ def _create_doc_from_ticket(ticket: dict, domain: str) -> Document:
return Document(
id=_FRESHDESK_ID_PREFIX + link,
sections=[
Section(
TextSection(
link=link,
text=text,
)

View File

@ -13,7 +13,7 @@ from onyx.connectors.interfaces import PollConnector
from onyx.connectors.interfaces import SecondsSinceUnixEpoch
from onyx.connectors.models import ConnectorMissingCredentialError
from onyx.connectors.models import Document
from onyx.connectors.models import Section
from onyx.connectors.models import TextSection
from onyx.utils.logger import setup_logger
@ -183,7 +183,7 @@ def _convert_page_to_document(
return Document(
id=f"gitbook-{space_id}-{page_id}",
sections=[
Section(
TextSection(
link=page.get("urls", {}).get("app", ""),
text=_extract_text_from_document(page_content),
)

View File

@ -27,7 +27,7 @@ from onyx.connectors.interfaces import PollConnector
from onyx.connectors.interfaces import SecondsSinceUnixEpoch
from onyx.connectors.models import ConnectorMissingCredentialError
from onyx.connectors.models import Document
from onyx.connectors.models import Section
from onyx.connectors.models import TextSection
from onyx.utils.batching import batch_generator
from onyx.utils.logger import setup_logger
@ -87,7 +87,9 @@ def _batch_github_objects(
def _convert_pr_to_document(pull_request: PullRequest) -> Document:
return Document(
id=pull_request.html_url,
sections=[Section(link=pull_request.html_url, text=pull_request.body or "")],
sections=[
TextSection(link=pull_request.html_url, text=pull_request.body or "")
],
source=DocumentSource.GITHUB,
semantic_identifier=pull_request.title,
# updated_at is UTC time but is timezone unaware, explicitly add UTC
@ -109,7 +111,7 @@ def _fetch_issue_comments(issue: Issue) -> str:
def _convert_issue_to_document(issue: Issue) -> Document:
return Document(
id=issue.html_url,
sections=[Section(link=issue.html_url, text=issue.body or "")],
sections=[TextSection(link=issue.html_url, text=issue.body or "")],
source=DocumentSource.GITHUB,
semantic_identifier=issue.title,
# updated_at is UTC time but is timezone unaware

View File

@ -21,7 +21,7 @@ from onyx.connectors.interfaces import SecondsSinceUnixEpoch
from onyx.connectors.models import BasicExpertInfo
from onyx.connectors.models import ConnectorMissingCredentialError
from onyx.connectors.models import Document
from onyx.connectors.models import Section
from onyx.connectors.models import TextSection
from onyx.utils.logger import setup_logger
@ -56,7 +56,7 @@ def get_author(author: Any) -> BasicExpertInfo:
def _convert_merge_request_to_document(mr: Any) -> Document:
doc = Document(
id=mr.web_url,
sections=[Section(link=mr.web_url, text=mr.description or "")],
sections=[TextSection(link=mr.web_url, text=mr.description or "")],
source=DocumentSource.GITLAB,
semantic_identifier=mr.title,
# updated_at is UTC time but is timezone unaware, explicitly add UTC
@ -72,7 +72,7 @@ def _convert_merge_request_to_document(mr: Any) -> Document:
def _convert_issue_to_document(issue: Any) -> Document:
doc = Document(
id=issue.web_url,
sections=[Section(link=issue.web_url, text=issue.description or "")],
sections=[TextSection(link=issue.web_url, text=issue.description or "")],
source=DocumentSource.GITLAB,
semantic_identifier=issue.title,
# updated_at is UTC time but is timezone unaware, explicitly add UTC
@ -99,7 +99,7 @@ def _convert_code_to_document(
file_url = f"{url}/{projectOwner}/{projectName}/-/blob/master/{file['path']}" # Construct the file URL
doc = Document(
id=file["id"],
sections=[Section(link=file_url, text=file_content)],
sections=[TextSection(link=file_url, text=file_content)],
source=DocumentSource.GITLAB,
semantic_identifier=file["name"],
doc_updated_at=datetime.now().replace(

View File

@ -1,5 +1,6 @@
from base64 import urlsafe_b64decode
from typing import Any
from typing import cast
from typing import Dict
from google.oauth2.credentials import Credentials as OAuthCredentials # type: ignore
@ -28,8 +29,9 @@ from onyx.connectors.interfaces import SecondsSinceUnixEpoch
from onyx.connectors.interfaces import SlimConnector
from onyx.connectors.models import BasicExpertInfo
from onyx.connectors.models import Document
from onyx.connectors.models import Section
from onyx.connectors.models import ImageSection
from onyx.connectors.models import SlimDocument
from onyx.connectors.models import TextSection
from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface
from onyx.utils.logger import setup_logger
from onyx.utils.retry_wrapper import retry_builder
@ -115,7 +117,7 @@ def _get_message_body(payload: dict[str, Any]) -> str:
return message_body
def message_to_section(message: Dict[str, Any]) -> tuple[Section, dict[str, str]]:
def message_to_section(message: Dict[str, Any]) -> tuple[TextSection, dict[str, str]]:
link = f"https://mail.google.com/mail/u/0/#inbox/{message['id']}"
payload = message.get("payload", {})
@ -142,7 +144,7 @@ def message_to_section(message: Dict[str, Any]) -> tuple[Section, dict[str, str]
message_body_text: str = _get_message_body(payload)
return Section(link=link, text=message_body_text + message_data), metadata
return TextSection(link=link, text=message_body_text + message_data), metadata
def thread_to_document(full_thread: Dict[str, Any]) -> Document | None:
@ -192,7 +194,7 @@ def thread_to_document(full_thread: Dict[str, Any]) -> Document | None:
return Document(
id=id,
semantic_identifier=semantic_identifier,
sections=sections,
sections=cast(list[TextSection | ImageSection], sections),
source=DocumentSource.GMAIL,
# This is used to perform permission sync
primary_owners=primary_owners,

View File

@ -18,7 +18,7 @@ from onyx.connectors.interfaces import PollConnector
from onyx.connectors.interfaces import SecondsSinceUnixEpoch
from onyx.connectors.models import ConnectorMissingCredentialError
from onyx.connectors.models import Document
from onyx.connectors.models import Section
from onyx.connectors.models import TextSection
from onyx.utils.logger import setup_logger
@ -243,7 +243,7 @@ class GongConnector(LoadConnector, PollConnector):
Document(
id=call_id,
sections=[
Section(link=call_metadata["url"], text=transcript_text)
TextSection(link=call_metadata["url"], text=transcript_text)
],
source=DocumentSource.GONG,
# Should not ever be Untitled as a call cannot be made without a Title

View File

@ -43,9 +43,7 @@ from onyx.connectors.interfaces import PollConnector
from onyx.connectors.interfaces import SecondsSinceUnixEpoch
from onyx.connectors.interfaces import SlimConnector
from onyx.connectors.models import ConnectorMissingCredentialError
from onyx.connectors.vision_enabled_connector import VisionEnabledConnector
from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface
from onyx.llm.interfaces import LLM
from onyx.utils.logger import setup_logger
from onyx.utils.retry_wrapper import retry_builder
@ -68,7 +66,6 @@ def _convert_single_file(
creds: Any,
primary_admin_email: str,
file: dict[str, Any],
image_analysis_llm: LLM | None,
) -> Any:
user_email = file.get("owners", [{}])[0].get("emailAddress") or primary_admin_email
user_drive_service = get_drive_service(creds, user_email=user_email)
@ -77,7 +74,6 @@ def _convert_single_file(
file=file,
drive_service=user_drive_service,
docs_service=docs_service,
image_analysis_llm=image_analysis_llm, # pass the LLM so doc_conversion can summarize images
)
@ -116,9 +112,7 @@ def _clean_requested_drive_ids(
return valid_requested_drive_ids, filtered_folder_ids
class GoogleDriveConnector(
LoadConnector, PollConnector, SlimConnector, VisionEnabledConnector
):
class GoogleDriveConnector(LoadConnector, PollConnector, SlimConnector):
def __init__(
self,
include_shared_drives: bool = False,
@ -151,9 +145,6 @@ class GoogleDriveConnector(
if continue_on_failure is not None:
logger.warning("The 'continue_on_failure' parameter is deprecated.")
# Initialize vision LLM using the mixin
self.initialize_vision_llm()
if (
not include_shared_drives
and not include_my_drives
@ -539,7 +530,6 @@ class GoogleDriveConnector(
_convert_single_file,
self.creds,
self.primary_admin_email,
image_analysis_llm=self.image_analysis_llm, # Use the mixin's LLM
)
# Fetch files in batches
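
_convert_single_file is bound with functools.partial before being mapped over the fetched batches; with the vision LLM argument removed, the partial closes over only the credentials and the admin email. The call shape, using the names from this diff (file_batch hypothetical):

    from functools import partial

    convert = partial(_convert_single_file, self.creds, self.primary_admin_email)

    # Each fetched file is then converted independently
    docs = [convert(file=f) for f in file_batch]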

View File

@ -1,40 +1,50 @@
import io
from datetime import datetime
from datetime import timezone
from tempfile import NamedTemporaryFile
from typing import cast
import openpyxl # type: ignore
from googleapiclient.discovery import build # type: ignore
from googleapiclient.errors import HttpError # type: ignore
from googleapiclient.http import MediaIoBaseDownload # type: ignore
from onyx.configs.app_configs import CONTINUE_ON_CONNECTOR_FAILURE
from onyx.configs.constants import DocumentSource
from onyx.configs.constants import FileOrigin
from onyx.connectors.google_drive.constants import DRIVE_FOLDER_TYPE
from onyx.connectors.google_drive.constants import DRIVE_SHORTCUT_TYPE
from onyx.connectors.google_drive.constants import UNSUPPORTED_FILE_TYPE_CONTENT
from onyx.connectors.google_drive.models import GDriveMimeType
from onyx.connectors.google_drive.models import GoogleDriveFileType
from onyx.connectors.google_drive.section_extraction import get_document_sections
from onyx.connectors.google_utils.resources import GoogleDocsService
from onyx.connectors.google_utils.resources import GoogleDriveService
from onyx.connectors.models import Document
from onyx.connectors.models import Section
from onyx.connectors.models import ImageSection
from onyx.connectors.models import SlimDocument
from onyx.connectors.models import TextSection
from onyx.db.engine import get_session_with_current_tenant
from onyx.file_processing.extract_file_text import docx_to_text_and_images
from onyx.file_processing.extract_file_text import extract_file_text
from onyx.file_processing.extract_file_text import pptx_to_text
from onyx.file_processing.extract_file_text import read_pdf_file
from onyx.file_processing.extract_file_text import xlsx_to_text
from onyx.file_processing.file_validation import is_valid_image_type
from onyx.file_processing.image_summarization import summarize_image_with_error_handling
from onyx.file_processing.image_utils import store_image_and_create_section
from onyx.file_processing.unstructured import get_unstructured_api_key
from onyx.file_processing.unstructured import unstructured_to_text
from onyx.llm.interfaces import LLM
from onyx.utils.logger import setup_logger
logger = setup_logger()
# Mapping of Google Drive mime types to export formats
GOOGLE_MIME_TYPES_TO_EXPORT = {
GDriveMimeType.DOC.value: "text/plain",
GDriveMimeType.SPREADSHEET.value: "text/csv",
GDriveMimeType.PPT.value: "text/plain",
}
# Define Google MIME types mapping
GOOGLE_MIME_TYPES = {
GDriveMimeType.DOC.value: "text/plain",
GDriveMimeType.SPREADSHEET.value: "text/csv",
GDriveMimeType.PPT.value: "text/plain",
}
def _summarize_drive_image(
image_data: bytes, image_name: str, image_analysis_llm: LLM | None
@ -66,259 +76,137 @@ def is_gdrive_image_mime_type(mime_type: str) -> bool:
def _extract_sections_basic(
file: dict[str, str],
service: GoogleDriveService,
image_analysis_llm: LLM | None = None,
) -> list[Section]:
"""
Extends the existing logic to handle either a docx with embedded images
or standalone images (PNG, JPG, etc).
"""
) -> list[TextSection | ImageSection]:
"""Extract text and images from a Google Drive file."""
file_id = file["id"]
file_name = file["name"]
mime_type = file["mimeType"]
link = file["webViewLink"]
file_name = file.get("name", file["id"])
supported_file_types = set(item.value for item in GDriveMimeType)
# 1) If the file is an image, retrieve the raw bytes, optionally summarize
if is_gdrive_image_mime_type(mime_type):
try:
response = service.files().get_media(fileId=file["id"]).execute()
with get_session_with_current_tenant() as db_session:
section, _ = store_image_and_create_section(
db_session=db_session,
image_data=response,
file_name=file["id"],
display_name=file_name,
media_type=mime_type,
llm=image_analysis_llm,
file_origin=FileOrigin.CONNECTOR,
)
return [section]
except Exception as e:
logger.warning(f"Failed to fetch or summarize image: {e}")
return [
Section(
link=link,
text="",
image_file_name=link,
)
]
if mime_type not in supported_file_types:
# Unsupported file types can still have a title, finding this way is still useful
return [Section(link=link, text=UNSUPPORTED_FILE_TYPE_CONTENT)]
link = file.get("webViewLink", "")
try:
# ---------------------------
# Google Sheets extraction
if mime_type == GDriveMimeType.SPREADSHEET.value:
# For Google Docs, Sheets, and Slides, export as plain text
if mime_type in GOOGLE_MIME_TYPES_TO_EXPORT:
export_mime_type = GOOGLE_MIME_TYPES_TO_EXPORT[mime_type]
# Use the correct API call for exporting files
request = service.files().export_media(
fileId=file_id, mimeType=export_mime_type
)
response_bytes = io.BytesIO()
downloader = MediaIoBaseDownload(response_bytes, request)
done = False
while not done:
_, done = downloader.next_chunk()
response = response_bytes.getvalue()
if not response:
logger.warning(f"Failed to export {file_name} as {export_mime_type}")
return []
text = response.decode("utf-8")
return [TextSection(link=link, text=text)]
# For other file types, download the file
# Use the correct API call for downloading files
request = service.files().get_media(fileId=file_id)
response_bytes = io.BytesIO()
downloader = MediaIoBaseDownload(response_bytes, request)
done = False
while not done:
_, done = downloader.next_chunk()
response = response_bytes.getvalue()
if not response:
logger.warning(f"Failed to download {file_name}")
return []
# Process based on mime type
if mime_type == "text/plain":
text = response.decode("utf-8")
return [TextSection(link=link, text=text)]
elif (
mime_type
== "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
):
text, _ = docx_to_text_and_images(io.BytesIO(response))
return [TextSection(link=link, text=text)]
elif (
mime_type
== "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
):
text = xlsx_to_text(io.BytesIO(response))
return [TextSection(link=link, text=text)]
elif (
mime_type
== "application/vnd.openxmlformats-officedocument.presentationml.presentation"
):
text = pptx_to_text(io.BytesIO(response))
return [TextSection(link=link, text=text)]
elif is_gdrive_image_mime_type(mime_type):
# For images, store them for later processing
sections: list[TextSection | ImageSection] = []
try:
sheets_service = build(
"sheets", "v4", credentials=service._http.credentials
)
spreadsheet = (
sheets_service.spreadsheets()
.get(spreadsheetId=file["id"])
.execute()
)
sections = []
for sheet in spreadsheet["sheets"]:
sheet_name = sheet["properties"]["title"]
sheet_id = sheet["properties"]["sheetId"]
# Get sheet dimensions
grid_properties = sheet["properties"].get("gridProperties", {})
row_count = grid_properties.get("rowCount", 1000)
column_count = grid_properties.get("columnCount", 26)
# Convert column count to letter (e.g., 26 -> Z, 27 -> AA)
end_column = ""
while column_count:
column_count, remainder = divmod(column_count - 1, 26)
end_column = chr(65 + remainder) + end_column
range_name = f"'{sheet_name}'!A1:{end_column}{row_count}"
try:
result = (
sheets_service.spreadsheets()
.values()
.get(spreadsheetId=file["id"], range=range_name)
.execute()
)
values = result.get("values", [])
if values:
text = f"Sheet: {sheet_name}\n"
for row in values:
text += "\t".join(str(cell) for cell in row) + "\n"
sections.append(
Section(
link=f"{link}#gid={sheet_id}",
text=text,
)
)
except HttpError as e:
logger.warning(
f"Error fetching data for sheet '{sheet_name}': {e}"
)
continue
return sections
except Exception as e:
logger.warning(
f"Ran into exception '{e}' when pulling data from Google Sheet '{file['name']}'."
" Falling back to basic extraction."
)
# ---------------------------
# Microsoft Excel (.xlsx or .xls) extraction branch
elif mime_type in [
GDriveMimeType.SPREADSHEET_OPEN_FORMAT.value,
GDriveMimeType.SPREADSHEET_MS_EXCEL.value,
]:
try:
response = service.files().get_media(fileId=file["id"]).execute()
with NamedTemporaryFile(suffix=".xlsx", delete=True) as tmp:
tmp.write(response)
tmp_path = tmp.name
section_separator = "\n\n"
workbook = openpyxl.load_workbook(tmp_path, read_only=True)
# Work similarly to the xlsx_to_text function used for file connector
# but returns Sections instead of a string
sections = [
Section(
link=link,
text=(
f"Sheet: {sheet.title}\n\n"
+ section_separator.join(
",".join(map(str, row))
for row in sheet.iter_rows(
min_row=1, values_only=True
)
if row
)
),
)
for sheet in workbook.worksheets
]
return sections
except Exception as e:
logger.warning(
f"Error extracting data from Excel file '{file['name']}': {e}"
)
return [
Section(link=link, text="Error extracting data from Excel file")
]
# ---------------------------
# Export for Google Docs, PPT, and fallback for spreadsheets
if mime_type in [
GDriveMimeType.DOC.value,
GDriveMimeType.PPT.value,
GDriveMimeType.SPREADSHEET.value,
]:
export_mime_type = (
"text/plain"
if mime_type != GDriveMimeType.SPREADSHEET.value
else "text/csv"
)
text = (
service.files()
.export(fileId=file["id"], mimeType=export_mime_type)
.execute()
.decode("utf-8")
)
return [Section(link=link, text=text)]
# ---------------------------
# Plain text and Markdown files
elif mime_type in [
GDriveMimeType.PLAIN_TEXT.value,
GDriveMimeType.MARKDOWN.value,
]:
text_data = (
service.files().get_media(fileId=file["id"]).execute().decode("utf-8")
)
return [Section(link=link, text=text_data)]
# ---------------------------
# Word, PowerPoint, PDF files
elif mime_type in [
GDriveMimeType.WORD_DOC.value,
GDriveMimeType.POWERPOINT.value,
GDriveMimeType.PDF.value,
]:
response_bytes = service.files().get_media(fileId=file["id"]).execute()
# Optionally use Unstructured
if get_unstructured_api_key():
text = unstructured_to_text(
file=io.BytesIO(response_bytes),
file_name=file_name,
)
return [Section(link=link, text=text)]
if mime_type == GDriveMimeType.WORD_DOC.value:
# Use docx_to_text_and_images to get text plus embedded images
text, embedded_images = docx_to_text_and_images(
file=io.BytesIO(response_bytes),
)
sections = []
if text.strip():
sections.append(Section(link=link, text=text.strip()))
# Process each embedded image using the standardized function
with get_session_with_current_tenant() as db_session:
for idx, (img_data, img_name) in enumerate(
embedded_images, start=1
):
# Create a unique identifier for the embedded image
embedded_id = f"{file['id']}_embedded_{idx}"
section, embedded_id = store_image_and_create_section(
db_session=db_session,
image_data=response,
file_name=file_id,
display_name=file_name,
media_type=mime_type,
file_origin=FileOrigin.CONNECTOR,
link=link,
)
sections.append(section)
except Exception as e:
logger.error(f"Failed to process image {file_name}: {e}")
return sections
section, _ = store_image_and_create_section(
elif mime_type == "application/pdf":
text, _pdf_meta, images = read_pdf_file(io.BytesIO(response))
pdf_sections: list[TextSection | ImageSection] = [
TextSection(link=link, text=text)
]
# Process embedded images in the PDF
try:
with get_session_with_current_tenant() as db_session:
for idx, (img_data, img_name) in enumerate(images):
section, embedded_id = store_image_and_create_section(
db_session=db_session,
image_data=img_data,
file_name=embedded_id,
file_name=f"{file_id}_img_{idx}",
display_name=img_name or f"{file_name} - image {idx}",
llm=image_analysis_llm,
file_origin=FileOrigin.CONNECTOR,
)
sections.append(section)
return sections
pdf_sections.append(section)
except Exception as e:
logger.error(f"Failed to process PDF images in {file_name}: {e}")
return pdf_sections
elif mime_type == GDriveMimeType.PDF.value:
text, _pdf_meta, images = read_pdf_file(io.BytesIO(response_bytes))
return [Section(link=link, text=text)]
elif mime_type == GDriveMimeType.POWERPOINT.value:
text_data = pptx_to_text(io.BytesIO(response_bytes))
return [Section(link=link, text=text_data)]
# Catch-all case, should not happen since there should be specific handling
# for each of the supported file types
error_message = f"Unsupported file type: {mime_type}"
logger.error(error_message)
raise ValueError(error_message)
else:
# For unsupported file types, try to extract text
try:
text = extract_file_text(io.BytesIO(response), file_name)
return [TextSection(link=link, text=text)]
except Exception as e:
logger.warning(f"Failed to extract text from {file_name}: {e}")
return []
except Exception as e:
logger.exception(f"Error extracting sections from file: {e}")
return [Section(link=link, text=UNSUPPORTED_FILE_TYPE_CONTENT)]
logger.error(f"Error processing file {file_name}: {e}")
return []
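
The export/download loop now appears twice in _extract_sections_basic; factored out, it amounts to the following (helper name hypothetical):

    import io

    from googleapiclient.http import MediaIoBaseDownload


    def _download_drive_bytes(
        service, file_id: str, export_mime_type: str | None = None
    ) -> bytes:
        # Google-native files must be exported; other files download as-is
        if export_mime_type is not None:
            request = service.files().export_media(
                fileId=file_id, mimeType=export_mime_type
            )
        else:
            request = service.files().get_media(fileId=file_id)

        buf = io.BytesIO()
        downloader = MediaIoBaseDownload(buf, request)
        done = False
        while not done:
            _, done = downloader.next_chunk()
        return buf.getvalue()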
def convert_drive_item_to_document(
file: GoogleDriveFileType,
drive_service: GoogleDriveService,
docs_service: GoogleDocsService,
image_analysis_llm: LLM | None,
) -> Document | None:
"""
Main entry point for converting a Google Drive file => Document object.
Now we accept an optional `llm` to pass to `_extract_sections_basic`.
"""
try:
# skip shortcuts or folders
@ -327,44 +215,50 @@ def convert_drive_item_to_document(
return None
# If it's a Google Doc, we might do advanced parsing
sections: list[Section] = []
sections: list[TextSection | ImageSection] = []
# Try to get sections using the advanced method first
if file.get("mimeType") == GDriveMimeType.DOC.value:
try:
# get_document_sections is the advanced approach for Google Docs
sections = get_document_sections(docs_service, file["id"])
doc_sections = get_document_sections(
docs_service=docs_service, doc_id=file.get("id", "")
)
if doc_sections:
sections = cast(list[TextSection | ImageSection], doc_sections)
except Exception as e:
logger.warning(
f"Failed to pull google doc sections from '{file['name']}': {e}. "
"Falling back to basic extraction."
f"Error in advanced parsing: {e}. Falling back to basic extraction."
)
# If not a doc, or if we failed above, do our 'basic' approach
# If we don't have sections yet, use the basic extraction method
if not sections:
sections = _extract_sections_basic(file, drive_service, image_analysis_llm)
sections = _extract_sections_basic(file, drive_service)
# If we still don't have any sections, skip this file
if not sections:
logger.warning(f"No content extracted from {file.get('name')}. Skipping.")
return None
doc_id = file["webViewLink"]
updated_time = datetime.fromisoformat(file["modifiedTime"]).astimezone(
timezone.utc
)
# Create the document
return Document(
id=doc_id,
sections=sections,
source=DocumentSource.GOOGLE_DRIVE,
semantic_identifier=file["name"],
doc_updated_at=updated_time,
metadata={}, # or any metadata from 'file'
additional_info=file.get("id"),
semantic_identifier=file.get("name", ""),
metadata={
"owner_names": ", ".join(
owner.get("displayName", "") for owner in file.get("owners", [])
),
},
doc_updated_at=datetime.fromisoformat(
file.get("modifiedTime", "").replace("Z", "+00:00")
),
)
except Exception as e:
logger.exception(f"Error converting file '{file.get('name')}' to Document: {e}")
if not CONTINUE_ON_CONNECTOR_FAILURE:
raise
return None
logger.error(f"Error converting file {file.get('name')}: {e}")
return None
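
The modifiedTime handling works around datetime.fromisoformat, which only accepts a literal "Z" suffix from Python 3.11 onward; replacing it with an explicit UTC offset parses on older versions too:

    from datetime import datetime

    ts = "2025-03-12T22:26:05.000Z"
    # Pre-3.11 fromisoformat rejects the trailing "Z", so normalize it first
    dt = datetime.fromisoformat(ts.replace("Z", "+00:00"))
    print(dt)  # 2025-03-12 22:26:05+00:00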
def build_slim_document(file: GoogleDriveFileType) -> SlimDocument | None:

View File

@ -3,7 +3,7 @@ from typing import Any
from pydantic import BaseModel
from onyx.connectors.google_utils.resources import GoogleDocsService
from onyx.connectors.models import Section
from onyx.connectors.models import TextSection
class CurrentHeading(BaseModel):
@ -37,7 +37,7 @@ def _extract_text_from_paragraph(paragraph: dict[str, Any]) -> str:
def get_document_sections(
docs_service: GoogleDocsService,
doc_id: str,
) -> list[Section]:
) -> list[TextSection]:
"""Extracts sections from a Google Doc, including their headings and content"""
# Fetch the document structure
doc = docs_service.documents().get(documentId=doc_id).execute()
@ -45,7 +45,7 @@ def get_document_sections(
# Get the content
content = doc.get("body", {}).get("content", [])
sections: list[Section] = []
sections: list[TextSection] = []
current_section: list[str] = []
current_heading: CurrentHeading | None = None
@ -70,7 +70,7 @@ def get_document_sections(
heading_text = current_heading.text
section_text = f"{heading_text}\n" + "\n".join(current_section)
sections.append(
Section(
TextSection(
text=section_text.strip(),
link=_build_gdoc_section_link(doc_id, current_heading.id),
)
@ -96,7 +96,7 @@ def get_document_sections(
if current_heading is not None and current_section:
section_text = f"{current_heading.text}\n" + "\n".join(current_section)
sections.append(
Section(
TextSection(
text=section_text.strip(),
link=_build_gdoc_section_link(doc_id, current_heading.id),
)

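For reference, a hedged sketch of how the refactored helper is consumed (the doc_id value is hypothetical):

doc_sections = get_document_sections(docs_service=docs_service, doc_id="<doc-id>")
for section in doc_sections:
    # TextSection.text is a required str, so downstream code needs no None-checks
    print(section.link, len(section.text))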
View File

@ -12,7 +12,7 @@ from onyx.configs.constants import DocumentSource
from onyx.connectors.interfaces import GenerateDocumentsOutput
from onyx.connectors.interfaces import LoadConnector
from onyx.connectors.models import Document
from onyx.connectors.models import Section
from onyx.connectors.models import TextSection
from onyx.db.engine import get_sqlalchemy_engine
from onyx.file_processing.extract_file_text import load_files_from_zip
from onyx.file_processing.extract_file_text import read_text_file
@ -118,7 +118,7 @@ class GoogleSitesConnector(LoadConnector):
source=DocumentSource.GOOGLE_SITES,
semantic_identifier=title,
sections=[
Section(
TextSection(
link=(self.base_url.rstrip("/") + "/" + path.lstrip("/"))
if path
else "",

View File

@ -15,7 +15,7 @@ from onyx.connectors.interfaces import SecondsSinceUnixEpoch
from onyx.connectors.models import BasicExpertInfo
from onyx.connectors.models import ConnectorMissingCredentialError
from onyx.connectors.models import Document
from onyx.connectors.models import Section
from onyx.connectors.models import TextSection
from onyx.file_processing.html_utils import parse_html_page_basic
from onyx.utils.logger import setup_logger
@ -120,7 +120,7 @@ class GuruConnector(LoadConnector, PollConnector):
doc_batch.append(
Document(
id=card["id"],
sections=[Section(link=link, text=content_text)],
sections=[TextSection(link=link, text=content_text)],
source=DocumentSource.GURU,
semantic_identifier=title,
doc_updated_at=latest_time,

View File

@ -13,7 +13,7 @@ from onyx.connectors.interfaces import PollConnector
from onyx.connectors.interfaces import SecondsSinceUnixEpoch
from onyx.connectors.models import ConnectorMissingCredentialError
from onyx.connectors.models import Document
from onyx.connectors.models import Section
from onyx.connectors.models import TextSection
from onyx.utils.logger import setup_logger
HUBSPOT_BASE_URL = "https://app.hubspot.com/contacts/"
@ -108,7 +108,7 @@ class HubSpotConnector(LoadConnector, PollConnector):
doc_batch.append(
Document(
id=ticket.id,
sections=[Section(link=link, text=content_text)],
sections=[TextSection(link=link, text=content_text)],
source=DocumentSource.HUBSPOT,
semantic_identifier=title,
# Is already in tzutc, just replacing the timezone format

View File

@ -24,6 +24,8 @@ CheckpointOutput = Generator[Document | ConnectorFailure, None, ConnectorCheckpo
class BaseConnector(abc.ABC):
REDIS_KEY_PREFIX = "da_connector_data:"
# Common image file extensions supported across connectors
IMAGE_EXTENSIONS = {".jpg", ".jpeg", ".png", ".webp", ".gif"}
@abc.abstractmethod
def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:

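The shared set gives every connector one place to answer "is this file an image?". A minimal sketch of a helper built on it (the helper itself is hypothetical, not part of the commit):

from pathlib import Path

def is_image_file(file_name: str) -> bool:
    # Compare the lowercased extension against the shared class-level set
    return Path(file_name).suffix.lower() in BaseConnector.IMAGE_EXTENSIONS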
View File

@ -21,7 +21,8 @@ from onyx.connectors.interfaces import PollConnector
from onyx.connectors.interfaces import SecondsSinceUnixEpoch
from onyx.connectors.models import ConnectorMissingCredentialError
from onyx.connectors.models import Document
from onyx.connectors.models import Section
from onyx.connectors.models import ImageSection
from onyx.connectors.models import TextSection
from onyx.utils.logger import setup_logger
from onyx.utils.retry_wrapper import request_with_retries
@ -237,22 +238,30 @@ class LinearConnector(LoadConnector, PollConnector, OAuthConnector):
documents: list[Document] = []
for edge in edges:
node = edge["node"]
# Create sections for description and comments
sections = [
TextSection(
link=node["url"],
text=node["description"] or "",
)
]
# Add comment sections
for comment in node["comments"]["nodes"]:
sections.append(
TextSection(
link=node["url"],
text=comment["body"] or "",
)
)
# Cast the sections list to the expected type
typed_sections = cast(list[TextSection | ImageSection], sections)
documents.append(
Document(
id=node["id"],
sections=[
Section(
link=node["url"],
text=node["description"] or "",
)
]
+ [
Section(
link=node["url"],
text=comment["body"] or "",
)
for comment in node["comments"]["nodes"]
],
sections=typed_sections,
source=DocumentSource.LINEAR,
semantic_identifier=f"[{node['identifier']}] {node['title']}",
title=node["title"],

View File
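The `cast` above is purely for the type checker: `list` is invariant, so `list[TextSection]` is not assignable where `list[TextSection | ImageSection]` is expected, even though every element qualifies. A minimal illustration (example values):

from typing import cast

sections: list[TextSection] = [TextSection(link="https://example.test", text="body")]
# mypy rejects passing `sections` directly as list[TextSection | ImageSection]
typed_sections = cast(list[TextSection | ImageSection], sections)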

@ -17,7 +17,7 @@ from onyx.connectors.interfaces import SecondsSinceUnixEpoch
from onyx.connectors.models import BasicExpertInfo
from onyx.connectors.models import ConnectorMissingCredentialError
from onyx.connectors.models import Document
from onyx.connectors.models import Section
from onyx.connectors.models import TextSection
from onyx.file_processing.html_utils import parse_html_page_basic
from onyx.file_processing.html_utils import strip_excessive_newlines_and_spaces
from onyx.utils.logger import setup_logger
@ -162,7 +162,7 @@ class LoopioConnector(LoadConnector, PollConnector):
doc_batch.append(
Document(
id=str(entry["id"]),
sections=[Section(link=link, text=content_text)],
sections=[TextSection(link=link, text=content_text)],
source=DocumentSource.LOOPIO,
semantic_identifier=questions[0],
doc_updated_at=latest_time,

View File

@ -6,6 +6,7 @@ import tempfile
from collections.abc import Generator
from collections.abc import Iterator
from typing import Any
from typing import cast
from typing import ClassVar
import pywikibot.time # type: ignore[import-untyped]
@ -20,7 +21,8 @@ from onyx.connectors.interfaces import PollConnector
from onyx.connectors.interfaces import SecondsSinceUnixEpoch
from onyx.connectors.mediawiki.family import family_class_dispatch
from onyx.connectors.models import Document
from onyx.connectors.models import Section
from onyx.connectors.models import ImageSection
from onyx.connectors.models import TextSection
from onyx.utils.logger import setup_logger
@ -60,14 +62,14 @@ def get_doc_from_page(
sections_extracted: textlib.Content = textlib.extract_sections(page_text, site)
sections = [
Section(
TextSection(
link=f"{page.full_url()}#" + section.heading.replace(" ", "_"),
text=section.title + section.content,
)
for section in sections_extracted.sections
]
sections.append(
Section(
TextSection(
link=page.full_url(),
text=sections_extracted.header,
)
@ -79,7 +81,7 @@ def get_doc_from_page(
doc_updated_at=pywikibot_timestamp_to_utc_datetime(
page.latest_revision.timestamp
),
sections=sections,
sections=cast(list[TextSection | ImageSection], sections),
semantic_identifier=page.title(),
metadata={"categories": [category.title() for category in page.categories()]},
id=f"MEDIAWIKI_{page.pageid}_{page.full_url()}",

View File

@ -28,9 +28,25 @@ class ConnectorMissingCredentialError(PermissionError):
class Section(BaseModel):
"""Base section class with common attributes"""
link: str | None = None
text: str | None = None
image_file_name: str | None = None
class TextSection(Section):
"""Section containing text content"""
text: str
link: str | None = None
image_file_name: str | None = None
class ImageSection(Section):
"""Section containing an image reference"""
image_file_name: str
link: str | None = None
class BasicExpertInfo(BaseModel):
@ -100,7 +116,7 @@ class DocumentBase(BaseModel):
"""Used for Onyx ingestion api, the ID is inferred before use if not provided"""
id: str | None = None
sections: list[Section]
sections: list[TextSection | ImageSection]
source: DocumentSource | None = None
semantic_identifier: str # displayed in the UI as the main identifier for the doc
metadata: dict[str, str | list[str]]
@ -150,18 +166,10 @@ class DocumentBase(BaseModel):
class Document(DocumentBase):
id: str # This must be unique or during indexing/reindexing, chunks will be overwritten
source: DocumentSource
"""Used for Onyx ingestion api, the ID is required"""
def get_total_char_length(self) -> int:
"""Calculate the total character length of the document including sections, metadata, and identifiers."""
section_length = sum(len(section.text) for section in self.sections)
identifier_length = len(self.semantic_identifier) + len(self.title or "")
metadata_length = sum(
len(k) + len(v) if isinstance(v, str) else len(k) + sum(len(x) for x in v)
for k, v in self.metadata.items()
)
return section_length + identifier_length + metadata_length
id: str
source: DocumentSource
def to_short_descriptor(self) -> str:
"""Used when logging the identity of a document"""
@ -185,6 +193,32 @@ class Document(DocumentBase):
)
class IndexingDocument(Document):
"""Document with processed sections for indexing"""
processed_sections: list[Section] = []
def get_total_char_length(self) -> int:
"""Get the total character length of the document including processed sections"""
title_len = len(self.title or self.semantic_identifier)
# Use processed_sections if available, otherwise fall back to original sections
if self.processed_sections:
section_len = sum(
len(section.text) if section.text is not None else 0
for section in self.processed_sections
)
else:
section_len = sum(
len(section.text)
if isinstance(section, TextSection) and section.text is not None
else 0
for section in self.sections
)
return title_len + section_len
class SlimDocument(BaseModel):
id: str
perm_sync_data: Any | None = None

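Putting the new models together, a minimal sketch (all field values hypothetical) of a document that mixes both section types:

doc = Document(
    id="example__doc-1",
    source=DocumentSource.WEB,
    semantic_identifier="Example document",
    metadata={},
    sections=[
        TextSection(text="Intro paragraph.", link="https://example.test#intro"),
        # text stays unset here; process_image_sections fills in a summary later
        ImageSection(image_file_name="example__doc-1_img_0"),
    ],
)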
View File

@ -25,7 +25,7 @@ from onyx.connectors.interfaces import PollConnector
from onyx.connectors.interfaces import SecondsSinceUnixEpoch
from onyx.connectors.models import ConnectorMissingCredentialError
from onyx.connectors.models import Document
from onyx.connectors.models import Section
from onyx.connectors.models import TextSection
from onyx.utils.batching import batch_generator
from onyx.utils.logger import setup_logger
@ -475,7 +475,7 @@ class NotionConnector(LoadConnector, PollConnector):
Document(
id=page.id,
sections=[
Section(
TextSection(
link=f"{page.url}#{block.id.replace('-', '')}",
text=block.prefix + block.text,
)

View File

@ -23,8 +23,8 @@ from onyx.connectors.interfaces import SecondsSinceUnixEpoch
from onyx.connectors.interfaces import SlimConnector
from onyx.connectors.models import ConnectorMissingCredentialError
from onyx.connectors.models import Document
from onyx.connectors.models import Section
from onyx.connectors.models import SlimDocument
from onyx.connectors.models import TextSection
from onyx.connectors.onyx_jira.utils import best_effort_basic_expert_info
from onyx.connectors.onyx_jira.utils import best_effort_get_field_from_issue
from onyx.connectors.onyx_jira.utils import build_jira_client
@ -145,7 +145,7 @@ def fetch_jira_issues_batch(
yield Document(
id=page_url,
sections=[Section(link=page_url, text=ticket_content)],
sections=[TextSection(link=page_url, text=ticket_content)],
source=DocumentSource.JIRA,
semantic_identifier=f"{issue.key}: {issue.fields.summary}",
title=f"{issue.key} {issue.fields.summary}",

View File

@ -16,7 +16,7 @@ from onyx.connectors.interfaces import PollConnector
from onyx.connectors.interfaces import SecondsSinceUnixEpoch
from onyx.connectors.models import BasicExpertInfo
from onyx.connectors.models import Document
from onyx.connectors.models import Section
from onyx.connectors.models import TextSection
from onyx.utils.logger import setup_logger
@ -110,7 +110,7 @@ class ProductboardConnector(PollConnector):
yield Document(
id=feature["id"],
sections=[
Section(
TextSection(
link=feature["links"]["html"],
text=self._parse_description_html(feature["description"]),
)
@ -133,7 +133,7 @@ class ProductboardConnector(PollConnector):
yield Document(
id=component["id"],
sections=[
Section(
TextSection(
link=component["links"]["html"],
text=self._parse_description_html(component["description"]),
)
@ -159,7 +159,7 @@ class ProductboardConnector(PollConnector):
yield Document(
id=product["id"],
sections=[
Section(
TextSection(
link=product["links"]["html"],
text=self._parse_description_html(product["description"]),
)
@ -189,7 +189,7 @@ class ProductboardConnector(PollConnector):
yield Document(
id=objective["id"],
sections=[
Section(
TextSection(
link=objective["links"]["html"],
text=self._parse_description_html(objective["description"]),
)

View File

@ -13,6 +13,7 @@ from onyx.connectors.interfaces import SlimConnector
from onyx.connectors.models import ConnectorMissingCredentialError
from onyx.connectors.models import Document
from onyx.connectors.models import SlimDocument
from onyx.connectors.models import TextSection
from onyx.connectors.salesforce.doc_conversion import convert_sf_object_to_doc
from onyx.connectors.salesforce.doc_conversion import ID_PREFIX
from onyx.connectors.salesforce.salesforce_calls import fetch_all_csvs_in_parallel
@ -218,7 +219,8 @@ if __name__ == "__main__":
for doc in doc_batch:
section_count += len(doc.sections)
for section in doc.sections:
text_count += len(section.text)
if isinstance(section, TextSection) and section.text is not None:
text_count += len(section.text)
end_time = time.time()
print(f"Doc count: {doc_count}")

View File

@ -1,10 +1,12 @@
import re
from typing import cast
from onyx.configs.constants import DocumentSource
from onyx.connectors.cross_connector_utils.miscellaneous_utils import time_str_to_utc
from onyx.connectors.models import BasicExpertInfo
from onyx.connectors.models import Document
from onyx.connectors.models import Section
from onyx.connectors.models import ImageSection
from onyx.connectors.models import TextSection
from onyx.connectors.salesforce.sqlite_functions import get_child_ids
from onyx.connectors.salesforce.sqlite_functions import get_record
from onyx.connectors.salesforce.utils import SalesforceObject
@ -114,8 +116,8 @@ def _extract_dict_text(raw_dict: dict) -> str:
return natural_language_for_dict
def _extract_section(salesforce_object: SalesforceObject, base_url: str) -> Section:
return Section(
def _extract_section(salesforce_object: SalesforceObject, base_url: str) -> TextSection:
return TextSection(
text=_extract_dict_text(salesforce_object.data),
link=f"{base_url}/{salesforce_object.id}",
)
@ -175,7 +177,7 @@ def convert_sf_object_to_doc(
doc = Document(
id=onyx_salesforce_id,
sections=sections,
sections=cast(list[TextSection | ImageSection], sections),
source=DocumentSource.SALESFORCE,
semantic_identifier=extracted_semantic_identifier,
doc_updated_at=extracted_doc_updated_at,

View File

@ -19,7 +19,7 @@ from onyx.connectors.interfaces import SecondsSinceUnixEpoch
from onyx.connectors.models import BasicExpertInfo
from onyx.connectors.models import ConnectorMissingCredentialError
from onyx.connectors.models import Document
from onyx.connectors.models import Section
from onyx.connectors.models import TextSection
from onyx.file_processing.extract_file_text import extract_file_text
from onyx.utils.logger import setup_logger
@ -55,7 +55,7 @@ def _convert_driveitem_to_document(
doc = Document(
id=driveitem.id,
sections=[Section(link=driveitem.web_url, text=file_text)],
sections=[TextSection(link=driveitem.web_url, text=file_text)],
source=DocumentSource.SHAREPOINT,
semantic_identifier=driveitem.name,
doc_updated_at=driveitem.last_modified_datetime.replace(tzinfo=timezone.utc),

View File

@ -19,8 +19,8 @@ from onyx.connectors.interfaces import SecondsSinceUnixEpoch
from onyx.connectors.interfaces import SlimConnector
from onyx.connectors.models import ConnectorMissingCredentialError
from onyx.connectors.models import Document
from onyx.connectors.models import Section
from onyx.connectors.models import SlimDocument
from onyx.connectors.models import TextSection
from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface
from onyx.utils.logger import setup_logger
@ -212,7 +212,7 @@ class SlabConnector(LoadConnector, PollConnector, SlimConnector):
doc_batch.append(
Document(
id=post_id, # can't be url as this changes with the post title
sections=[Section(link=page_url, text=content_text)],
sections=[TextSection(link=page_url, text=content_text)],
source=DocumentSource.SLAB,
semantic_identifier=post["title"],
metadata={},

View File

@ -34,8 +34,8 @@ from onyx.connectors.models import ConnectorMissingCredentialError
from onyx.connectors.models import Document
from onyx.connectors.models import DocumentFailure
from onyx.connectors.models import EntityFailure
from onyx.connectors.models import Section
from onyx.connectors.models import SlimDocument
from onyx.connectors.models import TextSection
from onyx.connectors.slack.utils import expert_info_from_slack_id
from onyx.connectors.slack.utils import get_message_link
from onyx.connectors.slack.utils import make_paginated_slack_api_call_w_retries
@ -211,7 +211,7 @@ def thread_to_doc(
return Document(
id=_build_doc_id(channel_id=channel_id, thread_ts=thread[0]["ts"]),
sections=[
Section(
TextSection(
link=get_message_link(event=m, client=client, channel_id=channel_id),
text=slack_cleaner.index_clean(cast(str, m["text"])),
)

View File

@ -24,7 +24,7 @@ from onyx.connectors.interfaces import SecondsSinceUnixEpoch
from onyx.connectors.models import BasicExpertInfo
from onyx.connectors.models import ConnectorMissingCredentialError
from onyx.connectors.models import Document
from onyx.connectors.models import Section
from onyx.connectors.models import TextSection
from onyx.file_processing.html_utils import parse_html_page_basic
from onyx.utils.logger import setup_logger
@ -165,7 +165,7 @@ def _convert_thread_to_document(
doc = Document(
id=post_id,
sections=[Section(link=web_url, text=thread_text)],
sections=[TextSection(link=web_url, text=thread_text)],
source=DocumentSource.TEAMS,
semantic_identifier=semantic_string,
title="", # teams threads don't really have a "title"

View File

@ -1,46 +0,0 @@
"""
Mixin for connectors that need vision capabilities.
"""
from onyx.configs.llm_configs import get_image_extraction_and_analysis_enabled
from onyx.llm.factory import get_default_llm_with_vision
from onyx.llm.interfaces import LLM
from onyx.utils.logger import setup_logger
logger = setup_logger()
class VisionEnabledConnector:
"""
Mixin for connectors that need vision capabilities.
This mixin provides a standard way to initialize a vision-capable LLM
for image analysis during indexing.
Usage:
class MyConnector(LoadConnector, VisionEnabledConnector):
def __init__(self, ...):
super().__init__(...)
self.initialize_vision_llm()
"""
def initialize_vision_llm(self) -> None:
"""
Initialize a vision-capable LLM if enabled by configuration.
Sets self.image_analysis_llm to the LLM instance or None if disabled.
"""
self.image_analysis_llm: LLM | None = None
if get_image_extraction_and_analysis_enabled():
try:
self.image_analysis_llm = get_default_llm_with_vision()
if self.image_analysis_llm is None:
logger.warning(
"No LLM with vision found; image summarization will be disabled"
)
except Exception as e:
logger.warning(
f"Failed to initialize vision LLM due to an error: {str(e)}. "
"Image summarization will be disabled."
)
self.image_analysis_llm = None

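With the mixin deleted, connectors no longer construct an LLM at all; the vision model is resolved exactly once inside the indexing pipeline. The replacement pattern, as it appears in `process_image_sections` further down:

# Now lives in the indexing pipeline rather than in each connector
if not get_image_extraction_and_analysis_enabled():
    llm = None
else:
    llm = get_default_llm_with_vision()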
View File

@ -32,7 +32,7 @@ from onyx.connectors.exceptions import UnexpectedValidationError
from onyx.connectors.interfaces import GenerateDocumentsOutput
from onyx.connectors.interfaces import LoadConnector
from onyx.connectors.models import Document
from onyx.connectors.models import Section
from onyx.connectors.models import TextSection
from onyx.file_processing.extract_file_text import read_pdf_file
from onyx.file_processing.html_utils import web_html_cleanup
from onyx.utils.logger import setup_logger
@ -341,7 +341,7 @@ class WebConnector(LoadConnector):
doc_batch.append(
Document(
id=initial_url,
sections=[Section(link=initial_url, text=page_text)],
sections=[TextSection(link=initial_url, text=page_text)],
source=DocumentSource.WEB,
semantic_identifier=initial_url.split("/")[-1],
metadata=metadata,
@ -443,7 +443,7 @@ class WebConnector(LoadConnector):
Document(
id=initial_url,
sections=[
Section(link=initial_url, text=parsed_html.cleaned_text)
TextSection(link=initial_url, text=parsed_html.cleaned_text)
],
source=DocumentSource.WEB,
semantic_identifier=parsed_html.title or initial_url,

View File

@ -28,7 +28,7 @@ from onyx.connectors.interfaces import GenerateDocumentsOutput
from onyx.connectors.interfaces import LoadConnector
from onyx.connectors.models import BasicExpertInfo
from onyx.connectors.models import Document
from onyx.connectors.models import Section
from onyx.connectors.models import TextSection
from onyx.utils.logger import setup_logger
logger = setup_logger()
@ -104,7 +104,7 @@ def scrape_page_posts(
# id. We may want to de-dupe this stuff inside the indexing service.
document = Document(
id=f"{DocumentSource.XENFORO.value}_{title}_{page_index}_{formatted_time}",
sections=[Section(link=url, text=post_text)],
sections=[TextSection(link=url, text=post_text)],
title=title,
source=DocumentSource.XENFORO,
semantic_identifier=title,

View File

@ -1,5 +1,6 @@
from collections.abc import Iterator
from typing import Any
from typing import cast
import requests
@ -17,8 +18,8 @@ from onyx.connectors.interfaces import SecondsSinceUnixEpoch
from onyx.connectors.interfaces import SlimConnector
from onyx.connectors.models import BasicExpertInfo
from onyx.connectors.models import Document
from onyx.connectors.models import Section
from onyx.connectors.models import SlimDocument
from onyx.connectors.models import TextSection
from onyx.file_processing.html_utils import parse_html_page_basic
from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface
from onyx.utils.retry_wrapper import retry_builder
@ -168,8 +169,8 @@ def _article_to_document(
return new_author_mapping, Document(
id=f"article:{article['id']}",
sections=[
Section(
link=article.get("html_url"),
TextSection(
link=cast(str, article.get("html_url")),
text=parse_html_page_basic(article["body"]),
)
],
@ -268,7 +269,7 @@ def _ticket_to_document(
return new_author_mapping, Document(
id=f"zendesk_ticket_{ticket['id']}",
sections=[Section(link=ticket_display_url, text=full_text)],
sections=[TextSection(link=ticket_display_url, text=full_text)],
source=DocumentSource.ZENDESK,
semantic_identifier=f"Ticket #{ticket['id']}: {subject or 'No Subject'}",
doc_updated_at=update_time,

View File

@ -20,7 +20,7 @@ from onyx.connectors.interfaces import PollConnector
from onyx.connectors.interfaces import SecondsSinceUnixEpoch
from onyx.connectors.models import ConnectorMissingCredentialError
from onyx.connectors.models import Document
from onyx.connectors.models import Section
from onyx.connectors.models import TextSection
from onyx.connectors.zulip.schemas import GetMessagesResponse
from onyx.connectors.zulip.schemas import Message
from onyx.connectors.zulip.utils import build_search_narrow
@ -161,7 +161,7 @@ class ZulipConnector(LoadConnector, PollConnector):
return Document(
id=f"{message.stream_id}__{message.id}",
sections=[
Section(
TextSection(
link=self._message_to_narrow_link(message),
text=text,
)

View File

@ -67,6 +67,9 @@ def read_lobj(
use_tempfile: bool = False,
) -> IO:
pg_conn = get_pg_conn_from_session(db_session)
# Ensure we're using binary mode by default for large objects
if mode is None:
mode = "rb"
large_object = (
pg_conn.lobject(lobj_oid, mode=mode) if mode else pg_conn.lobject(lobj_oid)
)
@ -81,6 +84,7 @@ def read_lobj(
temp_file.seek(0)
return temp_file
else:
# Ensure we're getting raw bytes without text decoding
return BytesIO(large_object.read())

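A hedged sketch of how the indexing pipeline reads stored image bytes back out (the file name is hypothetical; the calls mirror `process_image_sections` below):

with get_session_with_current_tenant() as db_session:
    pgfilestore = get_pgfilestore_by_file_name(
        file_name="example__doc-1_img_0", db_session=db_session
    )
    if pgfilestore is not None:
        # mode="rb" keeps the payload as raw bytes; no text decoding
        image_bytes = read_lobj(pgfilestore.lobj_oid, db_session, mode="rb").read()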
View File

@ -2,12 +2,9 @@ from typing import Tuple
from sqlalchemy.orm import Session
from onyx.configs.app_configs import CONTINUE_ON_CONNECTOR_FAILURE
from onyx.configs.constants import FileOrigin
from onyx.connectors.models import Section
from onyx.connectors.models import ImageSection
from onyx.db.pg_file_store import save_bytes_to_pgfilestore
from onyx.file_processing.image_summarization import summarize_image_with_error_handling
from onyx.llm.interfaces import LLM
from onyx.utils.logger import setup_logger
logger = setup_logger()
@ -18,12 +15,12 @@ def store_image_and_create_section(
image_data: bytes,
file_name: str,
display_name: str,
media_type: str = "image/unknown",
llm: LLM | None = None,
link: str | None = None,
media_type: str = "application/octet-stream",
file_origin: FileOrigin = FileOrigin.OTHER,
) -> Tuple[Section, str | None]:
) -> Tuple[ImageSection, str | None]:
"""
Stores an image in PGFileStore and creates a Section object with optional summarization.
Stores an image in PGFileStore and creates an ImageSection object without summarization.
Args:
db_session: Database session
@ -31,12 +28,11 @@ def store_image_and_create_section(
file_name: Base identifier for the file
display_name: Human-readable name for the image
media_type: MIME type of the image
llm: Optional LLM with vision capabilities for summarization
file_origin: Origin of the file (e.g., CONFLUENCE, GOOGLE_DRIVE, etc.)
Returns:
Tuple containing:
- Section object with image reference and optional summary text
- ImageSection object with image reference
- The file_name in PGFileStore or None if storage failed
"""
# Storage logic
@ -53,18 +49,10 @@ def store_image_and_create_section(
stored_file_name = pgfilestore.file_name
except Exception as e:
logger.error(f"Failed to store image: {e}")
if not CONTINUE_ON_CONNECTOR_FAILURE:
raise
return Section(text=""), None
# Summarization logic
summary_text = ""
if llm:
summary_text = (
summarize_image_with_error_handling(llm, image_data, display_name) or ""
)
raise e
# Create an ImageSection with empty text (will be filled by LLM later in the pipeline)
return (
Section(text=summary_text, image_file_name=stored_file_name),
ImageSection(image_file_name=stored_file_name, link=link),
stored_file_name,
)

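An illustrative call under the new contract (image_bytes and db_session are assumed to exist in the surrounding connector code):

section, stored_file_name = store_image_and_create_section(
    db_session=db_session,
    image_data=image_bytes,
    file_name="example__doc-1_img_0",
    display_name="architecture-diagram.png",
    media_type="image/png",
    file_origin=FileOrigin.OTHER,
)
# section.text is still None at this point; the summary is added during indexing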
View File

@ -9,7 +9,8 @@ from onyx.configs.model_configs import DOC_EMBEDDING_CONTEXT_SIZE
from onyx.connectors.cross_connector_utils.miscellaneous_utils import (
get_metadata_keys_to_ignore,
)
from onyx.connectors.models import Document
from onyx.connectors.models import IndexingDocument
from onyx.connectors.models import Section
from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface
from onyx.indexing.models import DocAwareChunk
from onyx.natural_language_processing.utils import BaseTokenizer
@ -64,7 +65,7 @@ def _get_metadata_suffix_for_document_index(
def _combine_chunks(chunks: list[DocAwareChunk], large_chunk_id: int) -> DocAwareChunk:
"""
Combines multiple DocAwareChunks into one large chunk (for multipass mode),
Combines multiple DocAwareChunks into one large chunk (for "multipass" mode),
appending the content and adjusting source_links accordingly.
"""
merged_chunk = DocAwareChunk(
@ -98,7 +99,7 @@ def _combine_chunks(chunks: list[DocAwareChunk], large_chunk_id: int) -> DocAwar
def generate_large_chunks(chunks: list[DocAwareChunk]) -> list[DocAwareChunk]:
"""
Generates larger grouped chunks by combining sets of smaller chunks.
Generates larger "grouped" chunks by combining sets of smaller chunks.
"""
large_chunks = []
for idx, i in enumerate(range(0, len(chunks), LARGE_CHUNK_RATIO)):
@ -185,7 +186,7 @@ class Chunker:
def _get_mini_chunk_texts(self, chunk_text: str) -> list[str] | None:
"""
For multipass mode: additional sub-chunks (mini-chunks) for use in certain embeddings.
For "multipass" mode: additional sub-chunks (mini-chunks) for use in certain embeddings.
"""
if self.mini_chunk_splitter and chunk_text.strip():
return self.mini_chunk_splitter.split_text(chunk_text)
@ -194,7 +195,7 @@ class Chunker:
# ADDED: extra param image_url to store in the chunk
def _create_chunk(
self,
document: Document,
document: IndexingDocument,
chunks_list: list[DocAwareChunk],
text: str,
links: dict[int, str],
@ -225,7 +226,29 @@ class Chunker:
def _chunk_document(
self,
document: Document,
document: IndexingDocument,
title_prefix: str,
metadata_suffix_semantic: str,
metadata_suffix_keyword: str,
content_token_limit: int,
) -> list[DocAwareChunk]:
"""
Legacy method for backward compatibility.
Calls _chunk_document_with_sections with document.processed_sections.
"""
return self._chunk_document_with_sections(
document,
document.processed_sections,
title_prefix,
metadata_suffix_semantic,
metadata_suffix_keyword,
content_token_limit,
)
def _chunk_document_with_sections(
self,
document: IndexingDocument,
sections: list[Section],
title_prefix: str,
metadata_suffix_semantic: str,
metadata_suffix_keyword: str,
@ -233,17 +256,16 @@ class Chunker:
) -> list[DocAwareChunk]:
"""
Loops through sections of the document, converting them into one or more chunks.
If a section has an image_link, we treat it as a dedicated chunk.
Works with processed sections that are base Section objects.
"""
chunks: list[DocAwareChunk] = []
link_offsets: dict[int, str] = {}
chunk_text = ""
for section_idx, section in enumerate(document.sections):
section_text = clean_text(section.text)
for section_idx, section in enumerate(sections):
# Get section text and other attributes
section_text = clean_text(section.text or "")
section_link_text = section.link or ""
# ADDED: if the Section has an image link
image_url = section.image_file_name
# If there is no useful content, skip
@ -254,7 +276,7 @@ class Chunker:
)
continue
# CASE 1: If this is an image section, force a separate chunk
# CASE 1: If this section has an image, force a separate chunk
if image_url:
# First, if we have any partially built text chunk, finalize it
if chunk_text.strip():
@ -271,15 +293,13 @@ class Chunker:
chunk_text = ""
link_offsets = {}
# Create a chunk specifically for this image
# (If the section has text describing the image, use that as content)
# Create a chunk specifically for this image section
# (Using the text summary that was generated during processing)
self._create_chunk(
document,
chunks,
section_text,
links={0: section_link_text}
if section_link_text
else {}, # No text offsets needed for images
links={0: section_link_text} if section_link_text else {},
image_file_name=image_url,
title_prefix=title_prefix,
metadata_suffix_semantic=metadata_suffix_semantic,
@ -384,7 +404,9 @@ class Chunker:
)
return chunks
def _handle_single_document(self, document: Document) -> list[DocAwareChunk]:
def _handle_single_document(
self, document: IndexingDocument
) -> list[DocAwareChunk]:
# Specifically for reproducing an issue with gmail
if document.source == DocumentSource.GMAIL:
logger.debug(f"Chunking {document.semantic_identifier}")
@ -420,26 +442,31 @@ class Chunker:
title_prefix = ""
metadata_suffix_semantic = ""
# Chunk the document
normal_chunks = self._chunk_document(
# Use processed_sections if available (IndexingDocument), otherwise use original sections
sections_to_chunk = document.processed_sections
normal_chunks = self._chunk_document_with_sections(
document,
sections_to_chunk,
title_prefix,
metadata_suffix_semantic,
metadata_suffix_keyword,
content_token_limit,
)
# Optional “multipass” large chunk creation
# Optional "multipass" large chunk creation
if self.enable_multipass and self.enable_large_chunks:
large_chunks = generate_large_chunks(normal_chunks)
normal_chunks.extend(large_chunks)
return normal_chunks
def chunk(self, documents: list[Document]) -> list[DocAwareChunk]:
def chunk(self, documents: list[IndexingDocument]) -> list[DocAwareChunk]:
"""
Takes in a list of documents and chunks them into smaller chunks for indexing
while persisting the document metadata.
Works with both standard Document objects and IndexingDocument objects with processed_sections.
"""
final_chunks: list[DocAwareChunk] = []
for document in documents:

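End to end, the chunker now expects documents that have already been through image processing. A hedged sketch of the calling order (assuming an embedder is in scope; the unit tests further down follow the same pattern):

chunker = Chunker(
    tokenizer=embedder.embedding_model.tokenizer,
    enable_multipass=False,
)
indexing_documents = process_image_sections(documents)  # image summaries filled in
chunks = chunker.chunk(indexing_documents)  # consumes processed_sections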
View File

@ -10,13 +10,18 @@ from onyx.access.access import get_access_for_documents
from onyx.access.models import DocumentAccess
from onyx.configs.app_configs import MAX_DOCUMENT_CHARS
from onyx.configs.constants import DEFAULT_BOOST
from onyx.configs.llm_configs import get_image_extraction_and_analysis_enabled
from onyx.connectors.cross_connector_utils.miscellaneous_utils import (
get_experts_stores_representations,
)
from onyx.connectors.models import ConnectorFailure
from onyx.connectors.models import Document
from onyx.connectors.models import DocumentFailure
from onyx.connectors.models import ImageSection
from onyx.connectors.models import IndexAttemptMetadata
from onyx.connectors.models import IndexingDocument
from onyx.connectors.models import Section
from onyx.connectors.models import TextSection
from onyx.db.document import fetch_chunk_counts_for_documents
from onyx.db.document import get_documents_by_ids
from onyx.db.document import mark_document_as_indexed_for_cc_pair__no_commit
@ -27,7 +32,10 @@ from onyx.db.document import update_docs_updated_at__no_commit
from onyx.db.document import upsert_document_by_connector_credential_pair
from onyx.db.document import upsert_documents
from onyx.db.document_set import fetch_document_sets_for_documents
from onyx.db.engine import get_session_with_current_tenant
from onyx.db.models import Document as DBDocument
from onyx.db.pg_file_store import get_pgfilestore_by_file_name
from onyx.db.pg_file_store import read_lobj
from onyx.db.search_settings import get_current_search_settings
from onyx.db.tag import create_or_add_document_tag
from onyx.db.tag import create_or_add_document_tag_list
@ -37,6 +45,7 @@ from onyx.document_index.document_index_utils import (
from onyx.document_index.interfaces import DocumentIndex
from onyx.document_index.interfaces import DocumentMetadata
from onyx.document_index.interfaces import IndexBatchParams
from onyx.file_processing.image_summarization import summarize_image_with_error_handling
from onyx.indexing.chunker import Chunker
from onyx.indexing.embedder import embed_chunks_with_failure_handling
from onyx.indexing.embedder import IndexingEmbedder
@ -44,6 +53,7 @@ from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface
from onyx.indexing.models import DocAwareChunk
from onyx.indexing.models import DocMetadataAwareIndexChunk
from onyx.indexing.vector_db_insertion import write_chunks_to_vector_db_with_backoff
from onyx.llm.factory import get_default_llm_with_vision
from onyx.utils.logger import setup_logger
from onyx.utils.timing import log_function_time
@ -53,6 +63,7 @@ logger = setup_logger()
class DocumentBatchPrepareContext(BaseModel):
updatable_docs: list[Document]
id_to_db_doc_map: dict[str, DBDocument]
indexable_docs: list[IndexingDocument] = []
model_config = ConfigDict(arbitrary_types_allowed=True)
@ -265,7 +276,12 @@ def index_doc_batch_prepare(
def filter_documents(document_batch: list[Document]) -> list[Document]:
documents: list[Document] = []
for document in document_batch:
empty_contents = not any(section.text.strip() for section in document.sections)
empty_contents = not any(
isinstance(section, TextSection)
and section.text is not None
and section.text.strip()
for section in document.sections
)
if (
(not document.title or not document.title.strip())
and not document.semantic_identifier.strip()
@ -288,7 +304,12 @@ def filter_documents(document_batch: list[Document]) -> list[Document]:
)
continue
section_chars = sum(len(section.text) for section in document.sections)
section_chars = sum(
len(section.text)
if isinstance(section, TextSection) and section.text is not None
else 0
for section in document.sections
)
if (
MAX_DOCUMENT_CHARS
and len(document.title or document.semantic_identifier) + section_chars
@ -308,6 +329,121 @@ def filter_documents(document_batch: list[Document]) -> list[Document]:
return documents
def process_image_sections(documents: list[Document]) -> list[IndexingDocument]:
"""
Process all sections in documents by:
1. Converting both TextSection and ImageSection objects to base Section objects
2. Processing ImageSections to generate text summaries using a vision-capable LLM
3. Returning IndexingDocument objects with both original and processed sections
Args:
documents: List of documents with TextSection | ImageSection objects
Returns:
List of IndexingDocument objects with processed_sections as list[Section]
"""
# Check if image extraction and analysis is enabled before trying to get a vision LLM
if not get_image_extraction_and_analysis_enabled():
llm = None
else:
# Only get the vision LLM if image processing is enabled
llm = get_default_llm_with_vision()
if not llm:
logger.warning(
"No vision-capable LLM available. Image sections will not be processed."
)
# Even without LLM, we still convert to IndexingDocument with base Sections
return [
IndexingDocument(
**document.dict(),
processed_sections=[
Section(
text=section.text if isinstance(section, TextSection) else None,
link=section.link,
image_file_name=section.image_file_name
if isinstance(section, ImageSection)
else None,
)
for section in document.sections
],
)
for document in documents
]
indexed_documents: list[IndexingDocument] = []
for document in documents:
processed_sections: list[Section] = []
for section in document.sections:
# For ImageSection, process and create base Section with both text and image_file_name
if isinstance(section, ImageSection):
# Default section with image path preserved
processed_section = Section(
link=section.link,
image_file_name=section.image_file_name,
text=None, # Will be populated if summarization succeeds
)
# Try to get image summary
try:
with get_session_with_current_tenant() as db_session:
pgfilestore = get_pgfilestore_by_file_name(
file_name=section.image_file_name, db_session=db_session
)
if not pgfilestore:
logger.warning(
f"Image file {section.image_file_name} not found in PGFileStore"
)
processed_section.text = "[Image could not be processed]"
else:
# Get the image data
image_data_io = read_lobj(
pgfilestore.lobj_oid, db_session, mode="rb"
)
pgfilestore_data = image_data_io.read()
summary = summarize_image_with_error_handling(
llm=llm,
image_data=pgfilestore_data,
context_name=pgfilestore.display_name or "Image",
)
if summary:
processed_section.text = summary
else:
processed_section.text = (
"[Image could not be summarized]"
)
except Exception as e:
logger.error(f"Error processing image section: {e}")
processed_section.text = "[Error processing image]"
processed_sections.append(processed_section)
# For TextSection, create a base Section with text and link
elif isinstance(section, TextSection):
processed_section = Section(
text=section.text, link=section.link, image_file_name=None
)
processed_sections.append(processed_section)
# If it's already a base Section (unlikely), just append it
else:
processed_sections.append(section)
# Create IndexingDocument with original sections and processed_sections
indexed_document = IndexingDocument(
**document.dict(), processed_sections=processed_sections
)
indexed_documents.append(indexed_document)
return indexed_documents
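A hedged sketch of what the function yields for a mixed document (values illustrative):

indexing_doc = process_image_sections([doc])[0]
for section in indexing_doc.processed_sections:
    # Every processed section is a plain base Section; image-backed ones keep
    # their image_file_name and now carry summary text (or a bracketed placeholder)
    print(section.image_file_name, section.text)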
@log_function_time(debug_only=True)
def index_doc_batch(
*,
@ -362,19 +498,23 @@ def index_doc_batch(
failures=[],
)
# Convert documents to IndexingDocument objects with processed sections
# logger.debug("Processing image sections")
ctx.indexable_docs = process_image_sections(ctx.updatable_docs)
doc_descriptors = [
{
"doc_id": doc.id,
"doc_length": doc.get_total_char_length(),
}
for doc in ctx.updatable_docs
for doc in ctx.indexable_docs
]
logger.debug(f"Starting indexing process for documents: {doc_descriptors}")
logger.debug("Starting chunking")
# NOTE: no special handling for failures here, since the chunker is not
# a common source of failure for the indexing pipeline
chunks: list[DocAwareChunk] = chunker.chunk(ctx.updatable_docs)
chunks: list[DocAwareChunk] = chunker.chunk(ctx.indexable_docs)
logger.debug("Starting embedding")
chunks_with_embeddings, embedding_failures = (

View File

@ -15,7 +15,7 @@ from onyx.configs.model_configs import DEFAULT_DOCUMENT_ENCODER_MODEL
from onyx.connectors.models import Document
from onyx.connectors.models import IndexAttemptMetadata
from onyx.connectors.models import InputType
from onyx.connectors.models import Section
from onyx.connectors.models import TextSection
from onyx.db.connector import check_connectors_exist
from onyx.db.connector import create_connector
from onyx.db.connector_credential_pair import add_credential_to_connector
@ -55,7 +55,7 @@ def _create_indexable_chunks(
# The section is not really used past this point since we have already done the other processing
# for the chunking and embedding.
sections=[
Section(
TextSection(
text=preprocessed_doc["content"],
link=preprocessed_doc["url"],
image_file_name=None,

View File

@ -1,4 +1,5 @@
import os
from typing import cast
from unittest.mock import MagicMock
import pytest
@ -7,7 +8,8 @@ from pydantic import BaseModel
from onyx.configs.constants import DocumentSource
from onyx.connectors.airtable.airtable_connector import AirtableConnector
from onyx.connectors.models import Document
from onyx.connectors.models import Section
from onyx.connectors.models import ImageSection
from onyx.connectors.models import TextSection
BASE_VIEW_ID = "viwVUEJjWPd8XYjh8"
@ -76,11 +78,11 @@ def create_test_document(
if not all_fields_as_metadata:
sections.extend(
[
Section(
TextSection(
text=f"Title:\n------------------------\n{title}\n------------------------",
link=f"{link_base}/{id}",
),
Section(
TextSection(
text=f"Description:\n------------------------\n{description}\n------------------------",
link=f"{link_base}/{id}",
),
@ -90,7 +92,7 @@ def create_test_document(
if attachments:
for attachment_text, attachment_link in attachments:
sections.append(
Section(
TextSection(
text=f"Attachment:\n------------------------\n{attachment_text}\n------------------------",
link=attachment_link,
),
@ -122,7 +124,7 @@ def create_test_document(
return Document(
id=f"airtable__{id}",
sections=sections,
sections=cast(list[TextSection | ImageSection], sections),
source=DocumentSource.AIRTABLE,
semantic_identifier=f"{os.environ.get('AIRTABLE_TEST_TABLE_NAME', '')}: {title}",
metadata=metadata,

View File

@ -49,6 +49,7 @@ def test_gitbook_connector_basic(gitbook_connector: GitbookConnector) -> None:
# Content specific checks
content = section.text
assert content is not None, "Section text should not be None"
# Check for specific content elements
assert "* Fruit Shopping List:" in content
@ -82,6 +83,7 @@ def test_gitbook_connector_basic(gitbook_connector: GitbookConnector) -> None:
assert nested1.semantic_identifier == "Nested1"
assert len(nested1.sections) == 1
# extra newlines at the end, remove them to make test easier
assert nested1.sections[0].text is not None
assert nested1.sections[0].text.strip() == "nested1"
assert nested1.source == DocumentSource.GITBOOK
@ -89,6 +91,7 @@ def test_gitbook_connector_basic(gitbook_connector: GitbookConnector) -> None:
assert nested2.id.startswith("gitbook-")
assert nested2.semantic_identifier == "Nested2"
assert len(nested2.sections) == 1
assert nested2.sections[0].text is not None
assert nested2.sections[0].text.strip() == "nested2"
assert nested2.source == DocumentSource.GITBOOK

View File

@ -1,6 +1,7 @@
from collections.abc import Sequence
from onyx.connectors.models import Document
from onyx.connectors.models import TextSection
ALL_FILES = list(range(0, 60))
SHARED_DRIVE_FILES = list(range(20, 25))
@ -177,7 +178,13 @@ def assert_retrieved_docs_match_expected(
)
valid_retrieved_texts = set(
[
" - ".join([section.text for section in doc.sections])
" - ".join(
[
section.text
for section in doc.sections
if isinstance(section, TextSection) and section.text is not None
]
)
for doc in valid_retrieved_docs
]
)

View File

@ -97,6 +97,7 @@ def test_notion_connector_basic(notion_connector: NotionConnector) -> None:
assert child2_section.link.startswith("https://www.notion.so/")
# Database section checks for child2
assert child2_db_section.text is not None
assert child2_db_section.text.strip() != "" # Should contain some database content
assert child2_db_section.link is not None
assert child2_db_section.link.startswith("https://www.notion.so/")

View File

@ -60,6 +60,7 @@ def verify_document_content(doc: Document, expected: ExpectedDocument) -> None:
"""Verify a document matches its expected content."""
assert doc.semantic_identifier == expected.semantic_identifier
assert len(doc.sections) == 1
assert doc.sections[0].text is not None
assert expected.content in doc.sections[0].text
verify_document_metadata(doc)

View File

@ -63,6 +63,7 @@ def test_slab_connector_basic(slab_connector: SlabConnector) -> None:
assert len(target_test_doc.sections) == 1
section = target_test_doc.sections[0]
# Need to replace the weird apostrophe with a normal one
assert section.text is not None
assert section.text.replace("\u2019", "'") == desired_test_data["section_text"]
assert section.link == desired_test_data["link"]

View File

@ -32,6 +32,7 @@ def test_web_connector_scroll(web_connector: WebConnector) -> None:
assert len(all_docs) == 1
doc = all_docs[0]
assert doc.sections[0].text is not None
assert EXPECTED_QUOTE in doc.sections[0].text
@ -45,4 +46,5 @@ def test_web_connector_no_scroll(web_connector: WebConnector) -> None:
assert len(all_docs) == 1
doc = all_docs[0]
assert doc.sections[0].text is not None
assert EXPECTED_QUOTE not in doc.sections[0].text

View File

@ -6,7 +6,7 @@ from onyx.configs.constants import DocumentSource
from onyx.connectors.models import ConnectorFailure
from onyx.connectors.models import Document
from onyx.connectors.models import DocumentFailure
from onyx.connectors.models import Section
from onyx.connectors.models import TextSection
def create_test_document(
@ -28,7 +28,7 @@ def create_test_document(
doc_id = doc_id or f"test-doc-{uuid.uuid4()}"
return Document(
id=doc_id,
sections=[Section(text=text, link=link)],
sections=[TextSection(text=text, link=link)],
source=source,
semantic_identifier=doc_id,
doc_updated_at=datetime.now(timezone.utc),

View File

@ -83,6 +83,7 @@ def test_fetch_jira_issues_batch_small_ticket(
assert len(docs) == 1
assert docs[0].id.endswith("/SMALL-1")
assert docs[0].sections[0].text is not None
assert "Small description" in docs[0].sections[0].text
assert "Small comment 1" in docs[0].sections[0].text
assert "Small comment 2" in docs[0].sections[0].text

View File

@ -99,7 +99,8 @@ def test_get_doc_from_page(site: pywikibot.Site) -> None:
doc.sections, test_page._sections_helper + [test_page.header]
):
assert (
section.text.strip() == expected_section.strip()
section.text is not None
and section.text.strip() == expected_section.strip()
) # Extra whitespace before/after is okay
assert section.link and section.link.startswith(test_page.full_url())
assert doc.semantic_identifier == test_page.title()

View File

@ -2,9 +2,10 @@ import pytest
from onyx.configs.constants import DocumentSource
from onyx.connectors.models import Document
from onyx.connectors.models import Section
from onyx.connectors.models import TextSection
from onyx.indexing.chunker import Chunker
from onyx.indexing.embedder import DefaultIndexingEmbedder
from onyx.indexing.indexing_pipeline import process_image_sections
from tests.unit.onyx.indexing.conftest import MockHeartbeat
@ -35,19 +36,20 @@ def test_chunk_document(embedder: DefaultIndexingEmbedder) -> None:
metadata={"tags": ["tag1", "tag2"]},
doc_updated_at=None,
sections=[
Section(text=short_section_1, link="link1"),
Section(text=short_section_2, link="link2"),
Section(text=long_section, link="link3"),
Section(text=short_section_3, link="link4"),
Section(text=short_section_4, link="link5"),
TextSection(text=short_section_1, link="link1"),
TextSection(text=short_section_2, link="link2"),
TextSection(text=long_section, link="link3"),
TextSection(text=short_section_3, link="link4"),
TextSection(text=short_section_4, link="link5"),
],
)
indexing_documents = process_image_sections([document])
chunker = Chunker(
tokenizer=embedder.embedding_model.tokenizer,
enable_multipass=False,
)
chunks = chunker.chunk([document])
chunks = chunker.chunk(indexing_documents)
assert len(chunks) == 5
assert short_section_1 in chunks[0].content
@ -67,9 +69,10 @@ def test_chunker_heartbeat(
metadata={"tags": ["tag1", "tag2"]},
doc_updated_at=None,
sections=[
Section(text="This is a short section.", link="link1"),
TextSection(text="This is a short section.", link="link1"),
],
)
indexing_documents = process_image_sections([document])
chunker = Chunker(
tokenizer=embedder.embedding_model.tokenizer,
@ -77,7 +80,7 @@ def test_chunker_heartbeat(
callback=mock_heartbeat,
)
chunks = chunker.chunk([document])
chunks = chunker.chunk(indexing_documents)
assert mock_heartbeat.call_count == 1
assert len(chunks) > 0

View File

@ -6,7 +6,7 @@ import pytest
from onyx.configs.constants import DocumentSource
from onyx.connectors.models import Document
from onyx.connectors.models import Section
from onyx.connectors.models import TextSection
from onyx.indexing.embedder import DefaultIndexingEmbedder
from onyx.indexing.models import ChunkEmbedding
from onyx.indexing.models import DocAwareChunk
@ -45,7 +45,7 @@ def test_default_indexing_embedder_embed_chunks(mock_embedding_model: Mock) -> N
metadata={"tags": ["tag1", "tag2"]},
doc_updated_at=None,
sections=[
Section(text="This is a short section.", link="link1"),
TextSection(text="This is a short section.", link="link1"),
],
)
chunks: list[DocAwareChunk] = [

View File

@ -1,9 +1,11 @@
from typing import cast
from typing import List
from onyx.configs.app_configs import MAX_DOCUMENT_CHARS
from onyx.connectors.models import Document
from onyx.connectors.models import DocumentSource
from onyx.connectors.models import Section
from onyx.connectors.models import ImageSection
from onyx.connectors.models import TextSection
from onyx.indexing.indexing_pipeline import filter_documents
@ -11,15 +13,15 @@ def create_test_document(
doc_id: str = "test_id",
title: str | None = "Test Title",
semantic_id: str = "test_semantic_id",
sections: List[Section] | None = None,
sections: List[TextSection] | None = None,
) -> Document:
if sections is None:
sections = [Section(text="Test content", link="test_link")]
sections = [TextSection(text="Test content", link="test_link")]
return Document(
id=doc_id,
title=title,
semantic_identifier=semantic_id,
sections=sections,
sections=cast(list[TextSection | ImageSection], sections),
source=DocumentSource.FILE,
metadata={},
)
@ -27,7 +29,7 @@ def create_test_document(
def test_filter_documents_empty_title_and_content() -> None:
doc = create_test_document(
title="", semantic_id="", sections=[Section(text="", link="test_link")]
title="", semantic_id="", sections=[TextSection(text="", link="test_link")]
)
result = filter_documents([doc])
assert len(result) == 0
@ -35,7 +37,7 @@ def test_filter_documents_empty_title_and_content() -> None:
def test_filter_documents_empty_title_with_content() -> None:
doc = create_test_document(
title="", sections=[Section(text="Valid content", link="test_link")]
title="", sections=[TextSection(text="Valid content", link="test_link")]
)
result = filter_documents([doc])
assert len(result) == 1
@ -44,7 +46,7 @@ def test_filter_documents_empty_title_with_content() -> None:
def test_filter_documents_empty_content_with_title() -> None:
doc = create_test_document(
title="Valid Title", sections=[Section(text="", link="test_link")]
title="Valid Title", sections=[TextSection(text="", link="test_link")]
)
result = filter_documents([doc])
assert len(result) == 1
@ -55,14 +57,15 @@ def test_filter_documents_exceeding_max_chars() -> None:
if not MAX_DOCUMENT_CHARS: # Skip if no max chars configured
return
long_text = "a" * (MAX_DOCUMENT_CHARS + 1)
doc = create_test_document(sections=[Section(text=long_text, link="test_link")])
doc = create_test_document(sections=[TextSection(text=long_text, link="test_link")])
result = filter_documents([doc])
assert len(result) == 0
def test_filter_documents_valid_document() -> None:
doc = create_test_document(
title="Valid Title", sections=[Section(text="Valid content", link="test_link")]
title="Valid Title",
sections=[TextSection(text="Valid content", link="test_link")],
)
result = filter_documents([doc])
assert len(result) == 1
@ -72,7 +75,9 @@ def test_filter_documents_valid_document() -> None:
def test_filter_documents_whitespace_only() -> None:
doc = create_test_document(
title=" ", semantic_id=" ", sections=[Section(text=" ", link="test_link")]
title=" ",
semantic_id=" ",
sections=[TextSection(text=" ", link="test_link")],
)
result = filter_documents([doc])
assert len(result) == 0
@ -82,7 +87,7 @@ def test_filter_documents_semantic_id_no_title() -> None:
doc = create_test_document(
title=None,
semantic_id="Valid Semantic ID",
sections=[Section(text="Valid content", link="test_link")],
sections=[TextSection(text="Valid content", link="test_link")],
)
result = filter_documents([doc])
assert len(result) == 1
@ -92,9 +97,9 @@ def test_filter_documents_semantic_id_no_title() -> None:
def test_filter_documents_multiple_sections() -> None:
doc = create_test_document(
sections=[
Section(text="Content 1", link="test_link"),
Section(text="Content 2", link="test_link"),
Section(text="Content 3", link="test_link"),
TextSection(text="Content 1", link="test_link"),
TextSection(text="Content 2", link="test_link"),
TextSection(text="Content 3", link="test_link"),
]
)
result = filter_documents([doc])
@ -106,7 +111,7 @@ def test_filter_documents_multiple_documents() -> None:
docs = [
create_test_document(doc_id="1", title="Title 1"),
create_test_document(
doc_id="2", title="", sections=[Section(text="", link="test_link")]
doc_id="2", title="", sections=[TextSection(text="", link="test_link")]
), # Should be filtered
create_test_document(doc_id="3", title="Title 3"),
]

View File

@ -12,4 +12,4 @@
}
],
"origins": []
}
}