diff --git a/backend/onyx/background/indexing/run_indexing.py b/backend/onyx/background/indexing/run_indexing.py index cafd653b5..612cacc3e 100644 --- a/backend/onyx/background/indexing/run_indexing.py +++ b/backend/onyx/background/indexing/run_indexing.py @@ -28,6 +28,7 @@ from onyx.connectors.models import ConnectorCheckpoint from onyx.connectors.models import ConnectorFailure from onyx.connectors.models import Document from onyx.connectors.models import IndexAttemptMetadata +from onyx.connectors.models import TextSection from onyx.db.connector_credential_pair import get_connector_credential_pair_from_id from onyx.db.connector_credential_pair import get_last_successful_attempt_time from onyx.db.connector_credential_pair import update_connector_credential_pair @@ -154,14 +155,12 @@ def strip_null_characters(doc_batch: list[Document]) -> list[Document]: ) for section in cleaned_doc.sections: - if section.link and "\x00" in section.link: - logger.warning( - f"NUL characters found in document link for document: {cleaned_doc.id}" - ) + if section.link is not None: section.link = section.link.replace("\x00", "") # since text can be longer, just replace to avoid double scan - section.text = section.text.replace("\x00", "") + if isinstance(section, TextSection) and section.text is not None: + section.text = section.text.replace("\x00", "") cleaned_batch.append(cleaned_doc) @@ -479,7 +478,11 @@ def _run_indexing( doc_size = 0 for section in doc.sections: - doc_size += len(section.text) + if ( + isinstance(section, TextSection) + and section.text is not None + ): + doc_size += len(section.text) if doc_size > INDEXING_SIZE_WARNING_THRESHOLD: logger.warning( diff --git a/backend/onyx/connectors/airtable/airtable_connector.py b/backend/onyx/connectors/airtable/airtable_connector.py index 689c7b45b..f126de7fe 100644 --- a/backend/onyx/connectors/airtable/airtable_connector.py +++ b/backend/onyx/connectors/airtable/airtable_connector.py @@ -4,6 +4,7 @@ from concurrent.futures import Future from concurrent.futures import ThreadPoolExecutor from io import BytesIO from typing import Any +from typing import cast import requests from pyairtable import Api as AirtableApi @@ -16,7 +17,8 @@ from onyx.configs.constants import DocumentSource from onyx.connectors.interfaces import GenerateDocumentsOutput from onyx.connectors.interfaces import LoadConnector from onyx.connectors.models import Document -from onyx.connectors.models import Section +from onyx.connectors.models import ImageSection +from onyx.connectors.models import TextSection from onyx.file_processing.extract_file_text import extract_file_text from onyx.file_processing.extract_file_text import get_file_ext from onyx.utils.logger import setup_logger @@ -267,7 +269,7 @@ class AirtableConnector(LoadConnector): table_id: str, view_id: str | None, record_id: str, - ) -> tuple[list[Section], dict[str, str | list[str]]]: + ) -> tuple[list[TextSection], dict[str, str | list[str]]]: """ Process a single Airtable field and return sections or metadata. 
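# A minimal sketch of the cast pattern used in the Airtable hunk above,
# assuming only the section models imported from onyx.connectors.models.
# `list` is invariant under mypy: a `list[TextSection]` is not accepted where
# `list[TextSection | ImageSection]` is expected, even though every element
# fits the union, so the `cast` widens the declared element type. The record
# data below is illustrative, not taken from the diff.
from typing import cast

from onyx.connectors.models import ImageSection, TextSection

sections: list[TextSection] = [
    TextSection(link="https://example.invalid/rec1", text="field_name:\nvalue"),
]

# No copy and no validation pass happens here; cast() is purely a
# static-typing assertion and returns the same list object at runtime.
widened: list[TextSection | ImageSection] = cast(
    list[TextSection | ImageSection], sections
)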
@@ -305,7 +307,7 @@ class AirtableConnector(LoadConnector): # Otherwise, create relevant sections sections = [ - Section( + TextSection( link=link, text=( f"{field_name}:\n" @@ -340,7 +342,7 @@ class AirtableConnector(LoadConnector): table_name = table_schema.name record_id = record["id"] fields = record["fields"] - sections: list[Section] = [] + sections: list[TextSection] = [] metadata: dict[str, str | list[str]] = {} # Get primary field value if it exists @@ -384,7 +386,7 @@ class AirtableConnector(LoadConnector): return Document( id=f"airtable__{record_id}", - sections=sections, + sections=(cast(list[TextSection | ImageSection], sections)), source=DocumentSource.AIRTABLE, semantic_identifier=semantic_id, metadata=metadata, diff --git a/backend/onyx/connectors/asana/connector.py b/backend/onyx/connectors/asana/connector.py index a13118d54..e68c0690c 100755 --- a/backend/onyx/connectors/asana/connector.py +++ b/backend/onyx/connectors/asana/connector.py @@ -10,7 +10,7 @@ from onyx.connectors.interfaces import LoadConnector from onyx.connectors.interfaces import PollConnector from onyx.connectors.interfaces import SecondsSinceUnixEpoch from onyx.connectors.models import Document -from onyx.connectors.models import Section +from onyx.connectors.models import TextSection from onyx.utils.logger import setup_logger logger = setup_logger() @@ -82,7 +82,7 @@ class AsanaConnector(LoadConnector, PollConnector): logger.debug(f"Converting Asana task {task.id} to Document") return Document( id=task.id, - sections=[Section(link=task.link, text=task.text)], + sections=[TextSection(link=task.link, text=task.text)], doc_updated_at=task.last_modified, source=DocumentSource.ASANA, semantic_identifier=task.title, diff --git a/backend/onyx/connectors/axero/connector.py b/backend/onyx/connectors/axero/connector.py index 8dd824b83..57130b9b9 100644 --- a/backend/onyx/connectors/axero/connector.py +++ b/backend/onyx/connectors/axero/connector.py @@ -20,7 +20,7 @@ from onyx.connectors.interfaces import PollConnector from onyx.connectors.interfaces import SecondsSinceUnixEpoch from onyx.connectors.models import ConnectorMissingCredentialError from onyx.connectors.models import Document -from onyx.connectors.models import Section +from onyx.connectors.models import TextSection from onyx.file_processing.html_utils import parse_html_page_basic from onyx.utils.logger import setup_logger from onyx.utils.retry_wrapper import retry_builder @@ -221,7 +221,7 @@ def _get_forums( def _translate_forum_to_doc(af: AxeroForum) -> Document: doc = Document( id=af.doc_id, - sections=[Section(link=af.link, text=reply) for reply in af.responses], + sections=[TextSection(link=af.link, text=reply) for reply in af.responses], source=DocumentSource.AXERO, semantic_identifier=af.title, doc_updated_at=af.last_update, @@ -244,7 +244,7 @@ def _translate_content_to_doc(content: dict) -> Document: doc = Document( id="AXERO_" + str(content["ContentID"]), - sections=[Section(link=content["ContentURL"], text=page_text)], + sections=[TextSection(link=content["ContentURL"], text=page_text)], source=DocumentSource.AXERO, semantic_identifier=content["ContentTitle"], doc_updated_at=time_str_to_utc(content["DateUpdated"]), diff --git a/backend/onyx/connectors/blob/connector.py b/backend/onyx/connectors/blob/connector.py index db5d8feb6..6604458a2 100644 --- a/backend/onyx/connectors/blob/connector.py +++ b/backend/onyx/connectors/blob/connector.py @@ -25,7 +25,7 @@ from onyx.connectors.interfaces import PollConnector from onyx.connectors.interfaces 
import SecondsSinceUnixEpoch from onyx.connectors.models import ConnectorMissingCredentialError from onyx.connectors.models import Document -from onyx.connectors.models import Section +from onyx.connectors.models import TextSection from onyx.file_processing.extract_file_text import extract_file_text from onyx.utils.logger import setup_logger @@ -208,7 +208,7 @@ class BlobStorageConnector(LoadConnector, PollConnector): batch.append( Document( id=f"{self.bucket_type}:{self.bucket_name}:{obj['Key']}", - sections=[Section(link=link, text=text)], + sections=[TextSection(link=link, text=text)], source=DocumentSource(self.bucket_type.value), semantic_identifier=name, doc_updated_at=last_modified, @@ -341,7 +341,14 @@ if __name__ == "__main__": print("Sections:") for section in doc.sections: print(f" - Link: {section.link}") - print(f" - Text: {section.text[:100]}...") + if isinstance(section, TextSection) and section.text is not None: + print(f" - Text: {section.text[:100]}...") + elif ( + hasattr(section, "image_file_name") and section.image_file_name + ): + print(f" - Image: {section.image_file_name}") + else: + print("Error: Unknown section type") print("---") break diff --git a/backend/onyx/connectors/bookstack/connector.py b/backend/onyx/connectors/bookstack/connector.py index c49d7d469..65cd12f4f 100644 --- a/backend/onyx/connectors/bookstack/connector.py +++ b/backend/onyx/connectors/bookstack/connector.py @@ -18,7 +18,7 @@ from onyx.connectors.interfaces import PollConnector from onyx.connectors.interfaces import SecondsSinceUnixEpoch from onyx.connectors.models import ConnectorMissingCredentialError from onyx.connectors.models import Document -from onyx.connectors.models import Section +from onyx.connectors.models import TextSection from onyx.file_processing.html_utils import parse_html_page_basic @@ -81,7 +81,7 @@ class BookstackConnector(LoadConnector, PollConnector): ) return Document( id="book__" + str(book.get("id")), - sections=[Section(link=url, text=text)], + sections=[TextSection(link=url, text=text)], source=DocumentSource.BOOKSTACK, semantic_identifier="Book: " + title, title=title, @@ -110,7 +110,7 @@ class BookstackConnector(LoadConnector, PollConnector): ) return Document( id="chapter__" + str(chapter.get("id")), - sections=[Section(link=url, text=text)], + sections=[TextSection(link=url, text=text)], source=DocumentSource.BOOKSTACK, semantic_identifier="Chapter: " + title, title=title, @@ -134,7 +134,7 @@ class BookstackConnector(LoadConnector, PollConnector): ) return Document( id="shelf:" + str(shelf.get("id")), - sections=[Section(link=url, text=text)], + sections=[TextSection(link=url, text=text)], source=DocumentSource.BOOKSTACK, semantic_identifier="Shelf: " + title, title=title, @@ -167,7 +167,7 @@ class BookstackConnector(LoadConnector, PollConnector): time.sleep(0.1) return Document( id="page:" + page_id, - sections=[Section(link=url, text=text)], + sections=[TextSection(link=url, text=text)], source=DocumentSource.BOOKSTACK, semantic_identifier="Page: " + str(title), title=str(title), diff --git a/backend/onyx/connectors/clickup/connector.py b/backend/onyx/connectors/clickup/connector.py index 2370629d1..906007534 100644 --- a/backend/onyx/connectors/clickup/connector.py +++ b/backend/onyx/connectors/clickup/connector.py @@ -17,7 +17,7 @@ from onyx.connectors.interfaces import SecondsSinceUnixEpoch from onyx.connectors.models import BasicExpertInfo from onyx.connectors.models import ConnectorMissingCredentialError from onyx.connectors.models import Document -from 
onyx.connectors.models import Section +from onyx.connectors.models import TextSection from onyx.utils.retry_wrapper import retry_builder @@ -62,11 +62,11 @@ class ClickupConnector(LoadConnector, PollConnector): return response.json() - def _get_task_comments(self, task_id: str) -> list[Section]: + def _get_task_comments(self, task_id: str) -> list[TextSection]: url_endpoint = f"/task/{task_id}/comment" response = self._make_request(url_endpoint) comments = [ - Section( + TextSection( link=f'https://app.clickup.com/t/{task_id}?comment={comment_dict["id"]}', text=comment_dict["comment_text"], ) @@ -133,7 +133,7 @@ class ClickupConnector(LoadConnector, PollConnector): ], title=task["name"], sections=[ - Section( + TextSection( link=task["url"], text=( task["markdown_description"] diff --git a/backend/onyx/connectors/confluence/connector.py b/backend/onyx/connectors/confluence/connector.py index 05b2692a2..ce2a81fca 100644 --- a/backend/onyx/connectors/confluence/connector.py +++ b/backend/onyx/connectors/confluence/connector.py @@ -33,9 +33,9 @@ from onyx.connectors.interfaces import SlimConnector from onyx.connectors.models import BasicExpertInfo from onyx.connectors.models import ConnectorMissingCredentialError from onyx.connectors.models import Document -from onyx.connectors.models import Section +from onyx.connectors.models import ImageSection from onyx.connectors.models import SlimDocument -from onyx.connectors.vision_enabled_connector import VisionEnabledConnector +from onyx.connectors.models import TextSection from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface from onyx.utils.logger import setup_logger @@ -85,7 +85,6 @@ class ConfluenceConnector( PollConnector, SlimConnector, CredentialsConnector, - VisionEnabledConnector, ): def __init__( self, @@ -116,9 +115,6 @@ class ConfluenceConnector( self._confluence_client: OnyxConfluence | None = None self._fetched_titles: set[str] = set() - # Initialize vision LLM using the mixin - self.initialize_vision_llm() - # Remove trailing slash from wiki_base if present self.wiki_base = wiki_base.rstrip("/") """ @@ -245,12 +241,16 @@ class ConfluenceConnector( ) # Create the main section for the page content - sections = [Section(text=page_content, link=page_url)] + sections: list[TextSection | ImageSection] = [ + TextSection(text=page_content, link=page_url) + ] # Process comments if available comment_text = self._get_comment_string_for_page_id(page_id) if comment_text: - sections.append(Section(text=comment_text, link=f"{page_url}#comments")) + sections.append( + TextSection(text=comment_text, link=f"{page_url}#comments") + ) # Process attachments if "children" in page and "attachment" in page["children"]: @@ -264,21 +264,26 @@ class ConfluenceConnector( self.confluence_client, attachment, page_id, - page_title, - self.image_analysis_llm, ) - if result.text: + if result and result.text: # Create a section for the attachment text - attachment_section = Section( + attachment_section = TextSection( text=result.text, link=f"{page_url}#attachment-{attachment['id']}", + ) + sections.append(attachment_section) + elif result and result.file_name: + # Create an ImageSection for image attachments + image_section = ImageSection( + link=f"{page_url}#attachment-{attachment['id']}", image_file_name=result.file_name, ) - sections.append(attachment_section) - elif result.error: + sections.append(image_section) + else: logger.warning( - f"Error processing attachment '{attachment.get('title')}': {result.error}" + f"Error processing attachment 
'{attachment.get('title')}': {result.error}" + f"Error processing attachment '{attachment.get('title')}': " + f"{result.error if result else 'Unknown error'}" ) # Extract metadata @@ -349,7 +354,5 @@ class ConfluenceConnector( # Now get attachments for that page: attachment_query = self._construct_attachment_query(page["id"]) - # We'll use the page's XML to provide context if we summarize an image - confluence_xml = page.get("body", {}).get("storage", {}).get("value", "") for attachment in self.confluence_client.paginated_cql_retrieval( cql=attachment_query, ): attachment["metadata"].get("mediaType", "") if not validate_attachment_filetype( - attachment, self.image_analysis_llm + attachment, ): continue @@ -368,23 +373,25 @@ class ConfluenceConnector( confluence_client=self.confluence_client, attachment=attachment, page_id=page["id"], - page_context=confluence_xml, - llm=self.image_analysis_llm, ) if response is None: continue content_text, file_storage_name = response - object_url = build_confluence_document_id( self.wiki_base, attachment["_links"]["webui"], self.is_cloud ) - if content_text: doc.sections.append( - Section( + TextSection( text=content_text, link=object_url, + ) + ) + elif file_storage_name: + doc.sections.append( + ImageSection( + link=object_url, image_file_name=file_storage_name, ) ) @@ -464,7 +471,7 @@ class ConfluenceConnector( # If you skip images, you'll skip them in the permission sync attachment["metadata"].get("mediaType", "") if not validate_attachment_filetype( - attachment, self.image_analysis_llm + attachment, ): continue diff --git a/backend/onyx/connectors/confluence/utils.py b/backend/onyx/connectors/confluence/utils.py index d319f12ce..1b5e08485 100644 --- a/backend/onyx/connectors/confluence/utils.py +++ b/backend/onyx/connectors/confluence/utils.py @@ -36,7 +36,6 @@ from onyx.db.pg_file_store import upsert_pgfilestore from onyx.file_processing.extract_file_text import extract_file_text from onyx.file_processing.file_validation import is_valid_image_type from onyx.file_processing.image_utils import store_image_and_create_section -from onyx.llm.interfaces import LLM from onyx.utils.logger import setup_logger logger = setup_logger() @@ -54,17 +53,16 @@ class TokenResponse(BaseModel): def validate_attachment_filetype( - attachment: dict[str, Any], llm: LLM | None = None + attachment: dict[str, Any], ) -> bool: """ Validates if the attachment is a supported file type. - If LLM is provided, also checks if it's an image that can be processed. """ attachment.get("metadata", {}) media_type = attachment.get("metadata", {}).get("mediaType", "") if media_type.startswith("image/"): - return llm is not None and is_valid_image_type(media_type) + return is_valid_image_type(media_type) # For non-image files, check if we support the extension title = attachment.get("title", "") @@ -114,19 +112,16 @@ def process_attachment( confluence_client: "OnyxConfluence", attachment: dict[str, Any], parent_content_id: str | None, - page_context: str, - llm: LLM | None, ) -> AttachmentProcessingResult: """ Processes a Confluence attachment. If it's a document, extracts text, - or if it's an image and an LLM is available, summarizes it. Returns a structured result. + or if it's an image, stores it for later analysis. Returns a structured result. 
""" try: # Get the media type from the attachment metadata media_type = attachment.get("metadata", {}).get("mediaType", "") - # Validate the attachment type - if not validate_attachment_filetype(attachment, llm): + if not validate_attachment_filetype(attachment): return AttachmentProcessingResult( text=None, file_name=None, @@ -143,7 +138,7 @@ def process_attachment( attachment_size = attachment["extensions"]["fileSize"] - if not media_type.startswith("image/") or not llm: + if not media_type.startswith("image/"): if attachment_size > CONFLUENCE_CONNECTOR_ATTACHMENT_SIZE_THRESHOLD: logger.warning( f"Skipping {attachment_link} due to size. " @@ -181,10 +176,10 @@ def process_attachment( text=None, file_name=None, error="attachment.content is None" ) - # Process image attachments with LLM if available - if media_type.startswith("image/") and llm: + # Process image attachments + if media_type.startswith("image/"): return _process_image_attachment( - confluence_client, attachment, page_context, llm, raw_bytes, media_type + confluence_client, attachment, raw_bytes, media_type ) # Process document attachments @@ -217,12 +212,10 @@ def process_attachment( def _process_image_attachment( confluence_client: "OnyxConfluence", attachment: dict[str, Any], - page_context: str, - llm: LLM, raw_bytes: bytes, media_type: str, ) -> AttachmentProcessingResult: - """Process an image attachment by saving it and generating a summary.""" + """Process an image attachment by saving it without generating a summary.""" try: # Use the standardized image storage and section creation with get_session_with_current_tenant() as db_session: @@ -232,15 +225,14 @@ def _process_image_attachment( file_name=Path(attachment["id"]).name, display_name=attachment["title"], media_type=media_type, - llm=llm, file_origin=FileOrigin.CONNECTOR, ) + logger.info(f"Stored image attachment with file name: {file_name}") - return AttachmentProcessingResult( - text=section.text, file_name=file_name, error=None - ) + # Return empty text but include the file_name for later processing + return AttachmentProcessingResult(text="", file_name=file_name, error=None) except Exception as e: - msg = f"Image summarization failed for {attachment['title']}: {e}" + msg = f"Image storage failed for {attachment['title']}: {e}" logger.error(msg, exc_info=e) return AttachmentProcessingResult(text=None, file_name=None, error=msg) @@ -302,13 +294,11 @@ def convert_attachment_to_content( confluence_client: "OnyxConfluence", attachment: dict[str, Any], page_id: str, - page_context: str, - llm: LLM | None, ) -> tuple[str | None, str | None] | None: """ Facade function which: 1. Validates attachment type - 2. Extracts or summarizes content + 2. Extracts content or stores image for later processing 3. 
Returns (content_text, stored_file_name) or None if we should skip it """ media_type = attachment["metadata"]["mediaType"] @@ -319,9 +309,7 @@ def convert_attachment_to_content( ) return None - result = process_attachment( - confluence_client, attachment, page_id, page_context, llm - ) + result = process_attachment(confluence_client, attachment, page_id) if result.error is not None: logger.warning( f"Attachment {attachment['title']} encountered error: {result.error}" diff --git a/backend/onyx/connectors/discord/connector.py b/backend/onyx/connectors/discord/connector.py index eede9b787..8843d6513 100644 --- a/backend/onyx/connectors/discord/connector.py +++ b/backend/onyx/connectors/discord/connector.py @@ -4,6 +4,7 @@ from collections.abc import Iterable from datetime import datetime from datetime import timezone from typing import Any +from typing import cast from discord import Client from discord.channel import TextChannel @@ -20,7 +21,8 @@ from onyx.connectors.interfaces import PollConnector from onyx.connectors.interfaces import SecondsSinceUnixEpoch from onyx.connectors.models import ConnectorMissingCredentialError from onyx.connectors.models import Document -from onyx.connectors.models import Section +from onyx.connectors.models import ImageSection +from onyx.connectors.models import TextSection from onyx.utils.logger import setup_logger logger = setup_logger() @@ -32,7 +34,7 @@ _SNIPPET_LENGTH = 30 def _convert_message_to_document( message: DiscordMessage, - sections: list[Section], + sections: list[TextSection], ) -> Document: """ Convert a discord message to a document @@ -78,7 +80,7 @@ def _convert_message_to_document( semantic_identifier=semantic_identifier, doc_updated_at=message.edited_at, title=title, - sections=sections, + sections=(cast(list[TextSection | ImageSection], sections)), metadata=metadata, ) @@ -123,8 +125,8 @@ async def _fetch_documents_from_channel( if channel_message.type != MessageType.default: continue - sections: list[Section] = [ - Section( + sections: list[TextSection] = [ + TextSection( text=channel_message.content, link=channel_message.jump_url, ) @@ -142,7 +144,7 @@ async def _fetch_documents_from_channel( continue sections = [ - Section( + TextSection( text=thread_message.content, link=thread_message.jump_url, ) @@ -160,7 +162,7 @@ async def _fetch_documents_from_channel( continue sections = [ - Section( + TextSection( text=thread_message.content, link=thread_message.jump_url, ) diff --git a/backend/onyx/connectors/discourse/connector.py b/backend/onyx/connectors/discourse/connector.py index c25224a4c..807095b92 100644 --- a/backend/onyx/connectors/discourse/connector.py +++ b/backend/onyx/connectors/discourse/connector.py @@ -3,6 +3,7 @@ import urllib.parse from datetime import datetime from datetime import timezone from typing import Any +from typing import cast import requests from pydantic import BaseModel @@ -20,7 +21,8 @@ from onyx.connectors.interfaces import SecondsSinceUnixEpoch from onyx.connectors.models import BasicExpertInfo from onyx.connectors.models import ConnectorMissingCredentialError from onyx.connectors.models import Document -from onyx.connectors.models import Section +from onyx.connectors.models import ImageSection +from onyx.connectors.models import TextSection from onyx.file_processing.html_utils import parse_html_page_basic from onyx.utils.logger import setup_logger from onyx.utils.retry_wrapper import retry_builder @@ -112,7 +114,7 @@ class DiscourseConnector(PollConnector): 
responders.append(BasicExpertInfo(display_name=responder_name)) sections.append( - Section(link=topic_url, text=parse_html_page_basic(post["cooked"])) + TextSection(link=topic_url, text=parse_html_page_basic(post["cooked"])) ) category_name = self.category_id_map.get(topic["category_id"]) @@ -129,7 +131,7 @@ class DiscourseConnector(PollConnector): doc = Document( id="_".join([DocumentSource.DISCOURSE.value, str(topic["id"])]), - sections=sections, + sections=cast(list[TextSection | ImageSection], sections), source=DocumentSource.DISCOURSE, semantic_identifier=topic["title"], doc_updated_at=time_str_to_utc(topic["last_posted_at"]), diff --git a/backend/onyx/connectors/document360/connector.py b/backend/onyx/connectors/document360/connector.py index dbd287ae9..34dc33c41 100644 --- a/backend/onyx/connectors/document360/connector.py +++ b/backend/onyx/connectors/document360/connector.py @@ -19,7 +19,7 @@ from onyx.connectors.interfaces import SecondsSinceUnixEpoch from onyx.connectors.models import BasicExpertInfo from onyx.connectors.models import ConnectorMissingCredentialError from onyx.connectors.models import Document -from onyx.connectors.models import Section +from onyx.connectors.models import TextSection from onyx.file_processing.html_utils import parse_html_page_basic from onyx.utils.retry_wrapper import retry_builder @@ -158,7 +158,7 @@ class Document360Connector(LoadConnector, PollConnector): document = Document( id=article_details["id"], - sections=[Section(link=doc_link, text=doc_text)], + sections=[TextSection(link=doc_link, text=doc_text)], source=DocumentSource.DOCUMENT360, semantic_identifier=article_details["title"], doc_updated_at=updated_at, diff --git a/backend/onyx/connectors/dropbox/connector.py b/backend/onyx/connectors/dropbox/connector.py index 49cade07e..4999a09c9 100644 --- a/backend/onyx/connectors/dropbox/connector.py +++ b/backend/onyx/connectors/dropbox/connector.py @@ -19,7 +19,7 @@ from onyx.connectors.interfaces import PollConnector from onyx.connectors.interfaces import SecondsSinceUnixEpoch from onyx.connectors.models import ConnectorMissingCredentialError from onyx.connectors.models import Document -from onyx.connectors.models import Section +from onyx.connectors.models import TextSection from onyx.file_processing.extract_file_text import extract_file_text from onyx.utils.logger import setup_logger @@ -108,7 +108,7 @@ class DropboxConnector(LoadConnector, PollConnector): batch.append( Document( id=f"doc:{entry.id}", - sections=[Section(link=link, text=text)], + sections=[TextSection(link=link, text=text)], source=DocumentSource.DROPBOX, semantic_identifier=entry.name, doc_updated_at=modified_time, diff --git a/backend/onyx/connectors/egnyte/connector.py b/backend/onyx/connectors/egnyte/connector.py index 4a3d65824..fac9c456b 100644 --- a/backend/onyx/connectors/egnyte/connector.py +++ b/backend/onyx/connectors/egnyte/connector.py @@ -24,7 +24,7 @@ from onyx.connectors.interfaces import SecondsSinceUnixEpoch from onyx.connectors.models import BasicExpertInfo from onyx.connectors.models import ConnectorMissingCredentialError from onyx.connectors.models import Document -from onyx.connectors.models import Section +from onyx.connectors.models import TextSection from onyx.file_processing.extract_file_text import detect_encoding from onyx.file_processing.extract_file_text import extract_file_text from onyx.file_processing.extract_file_text import get_file_ext @@ -111,7 +111,7 @@ def _process_egnyte_file( # Create the document return Document( 
id=f"egnyte-{file_metadata['entry_id']}", - sections=[Section(text=file_content_raw.strip(), link=web_url)], + sections=[TextSection(text=file_content_raw.strip(), link=web_url)], source=DocumentSource.EGNYTE, semantic_identifier=file_name, metadata=metadata, diff --git a/backend/onyx/connectors/file/connector.py b/backend/onyx/connectors/file/connector.py index 83c0426f4..9a86f8d4a 100644 --- a/backend/onyx/connectors/file/connector.py +++ b/backend/onyx/connectors/file/connector.py @@ -16,8 +16,8 @@ from onyx.connectors.interfaces import GenerateDocumentsOutput from onyx.connectors.interfaces import LoadConnector from onyx.connectors.models import BasicExpertInfo from onyx.connectors.models import Document -from onyx.connectors.models import Section -from onyx.connectors.vision_enabled_connector import VisionEnabledConnector +from onyx.connectors.models import ImageSection +from onyx.connectors.models import TextSection from onyx.db.engine import get_session_with_current_tenant from onyx.db.pg_file_store import get_pgfilestore_by_file_name from onyx.file_processing.extract_file_text import extract_text_and_images @@ -26,7 +26,6 @@ from onyx.file_processing.extract_file_text import is_valid_file_ext from onyx.file_processing.extract_file_text import load_files_from_zip from onyx.file_processing.image_utils import store_image_and_create_section from onyx.file_store.file_store import get_default_file_store -from onyx.llm.interfaces import LLM from onyx.utils.logger import setup_logger logger = setup_logger() @@ -59,32 +58,44 @@ def _read_files_and_metadata( def _create_image_section( - llm: LLM | None, image_data: bytes, db_session: Session, parent_file_name: str, display_name: str, + link: str | None = None, idx: int = 0, -) -> tuple[Section, str | None]: +) -> tuple[ImageSection, str | None]: """ - Create a Section object for a single image and store the image in PGFileStore. - If summarization is enabled and we have an LLM, summarize the image. + Creates an ImageSection for an image file or embedded image. + Stores the image in PGFileStore but does not generate a summary. 
+ + Args: + image_data: Raw image bytes + db_session: Database session + parent_file_name: Name of the parent file (for embedded images) + display_name: Display name for the image + idx: Index for embedded images Returns: - tuple: (Section object, file_name in PGFileStore or None if storage failed) + Tuple of (ImageSection, stored_file_name or None) """ - # Create a unique file name for the embedded image - file_name = f"{parent_file_name}_embedded_{idx}" + # Create a unique identifier for the image + file_name = f"{parent_file_name}_embedded_{idx}" if idx > 0 else parent_file_name - # Use the standardized utility to store the image and create a section - return store_image_and_create_section( - db_session=db_session, - image_data=image_data, - file_name=file_name, - display_name=display_name, - llm=llm, - file_origin=FileOrigin.OTHER, - ) + # Store the image and create a section + try: + section, stored_file_name = store_image_and_create_section( + db_session=db_session, + image_data=image_data, + file_name=file_name, + display_name=display_name, + link=link, + file_origin=FileOrigin.CONNECTOR, + ) + return section, stored_file_name + except Exception as e: + logger.error(f"Failed to store image {display_name}: {e}") + raise e def _process_file( @@ -93,12 +104,16 @@ def _process_file( metadata: dict[str, Any] | None, pdf_pass: str | None, db_session: Session, - llm: LLM | None, ) -> list[Document]: """ - Processes a single file, returning a list of Documents (typically one). - Also handles embedded images if 'EMBEDDED_IMAGE_EXTRACTION_ENABLED' is true. + Process a file and return a list of Documents. + For images, creates ImageSection objects without summarization. + For documents with embedded images, extracts and stores the images. """ + if metadata is None: + metadata = {} + + # Get file extension and determine file type extension = get_file_ext(file_name) # Fetch the DB record so we know the ID for internal URL @@ -114,8 +129,6 @@ def _process_file( return [] # Prepare doc metadata - if metadata is None: - metadata = {} file_display_name = metadata.get("file_display_name") or os.path.basename(file_name) # Timestamps @@ -158,6 +171,7 @@ def _process_file( "title", "connector_type", "pdf_password", + "mime_type", ] } @@ -170,33 +184,45 @@ def _process_file( title = metadata.get("title") or file_display_name # 1) If the file itself is an image, handle that scenario quickly - IMAGE_EXTENSIONS = {".jpg", ".jpeg", ".png", ".webp"} - if extension in IMAGE_EXTENSIONS: - # Summarize or produce empty doc + if extension in LoadConnector.IMAGE_EXTENSIONS: + # Read the image data image_data = file.read() - image_section, _ = _create_image_section( - llm, image_data, db_session, pg_record.file_name, title - ) - return [ - Document( - id=doc_id, - sections=[image_section], - source=source_type, - semantic_identifier=file_display_name, - title=title, - doc_updated_at=final_time_updated, - primary_owners=p_owners, - secondary_owners=s_owners, - metadata=metadata_tags, - ) - ] + if not image_data: + logger.warning(f"Empty image file: {file_name}") + return [] - # 2) Otherwise: text-based approach. Possibly with embedded images if enabled. - # (For example .docx with inline images). 
+ # Create an ImageSection for the image + try: + section, _ = _create_image_section( + image_data=image_data, + db_session=db_session, + parent_file_name=pg_record.file_name, + display_name=title, + ) + + return [ + Document( + id=doc_id, + sections=[section], + source=source_type, + semantic_identifier=file_display_name, + title=title, + doc_updated_at=final_time_updated, + primary_owners=p_owners, + secondary_owners=s_owners, + metadata=metadata_tags, + ) + ] + except Exception as e: + logger.error(f"Failed to process image file {file_name}: {e}") + return [] + + # 2) Otherwise: text-based approach. Possibly with embedded images. file.seek(0) text_content = "" embedded_images: list[tuple[bytes, str]] = [] + # Extract text and images from the file text_content, embedded_images = extract_text_and_images( file=file, file_name=file_name, @@ -204,24 +230,29 @@ def _process_file( ) # Build sections: first the text as a single Section - sections = [] + sections: list[TextSection | ImageSection] = [] link_in_meta = metadata.get("link") if text_content.strip(): - sections.append(Section(link=link_in_meta, text=text_content.strip())) + sections.append(TextSection(link=link_in_meta, text=text_content.strip())) # Then any extracted images from docx, etc. for idx, (img_data, img_name) in enumerate(embedded_images, start=1): # Store each embedded image as a separate file in PGFileStore - # and create a section with the image summary - image_section, _ = _create_image_section( - llm, - img_data, - db_session, - pg_record.file_name, - f"{title} - image {idx}", - idx, - ) - sections.append(image_section) + # and create a section with the image reference + try: + image_section, _ = _create_image_section( + image_data=img_data, + db_session=db_session, + parent_file_name=pg_record.file_name, + display_name=f"{title} - image {idx}", + idx=idx, + ) + sections.append(image_section) + except Exception as e: + logger.warning( + f"Failed to process embedded image {idx} in {file_name}: {e}" + ) + return [ Document( id=doc_id, @@ -237,10 +268,10 @@ def _process_file( ] -class LocalFileConnector(LoadConnector, VisionEnabledConnector): +class LocalFileConnector(LoadConnector): """ Connector that reads files from Postgres and yields Documents, including - optional embedded image extraction. + embedded image extraction without summarization. 
""" def __init__( @@ -252,9 +283,6 @@ class LocalFileConnector(LoadConnector, VisionEnabledConnector): self.batch_size = batch_size self.pdf_pass: str | None = None - # Initialize vision LLM using the mixin - self.initialize_vision_llm() - def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None: self.pdf_pass = credentials.get("pdf_password") @@ -286,7 +314,6 @@ class LocalFileConnector(LoadConnector, VisionEnabledConnector): metadata=metadata, pdf_pass=self.pdf_pass, db_session=db_session, - llm=self.image_analysis_llm, ) documents.extend(new_docs) diff --git a/backend/onyx/connectors/fireflies/connector.py b/backend/onyx/connectors/fireflies/connector.py index 881700fe3..7fda1bec3 100644 --- a/backend/onyx/connectors/fireflies/connector.py +++ b/backend/onyx/connectors/fireflies/connector.py @@ -1,6 +1,7 @@ from collections.abc import Iterator from datetime import datetime from datetime import timezone +from typing import cast from typing import List import requests @@ -14,7 +15,8 @@ from onyx.connectors.interfaces import SecondsSinceUnixEpoch from onyx.connectors.models import BasicExpertInfo from onyx.connectors.models import ConnectorMissingCredentialError from onyx.connectors.models import Document -from onyx.connectors.models import Section +from onyx.connectors.models import ImageSection +from onyx.connectors.models import TextSection from onyx.utils.logger import setup_logger logger = setup_logger() @@ -45,7 +47,7 @@ _FIREFLIES_API_QUERY = """ def _create_doc_from_transcript(transcript: dict) -> Document | None: - sections: List[Section] = [] + sections: List[TextSection] = [] current_speaker_name = None current_link = "" current_text = "" @@ -57,7 +59,7 @@ def _create_doc_from_transcript(transcript: dict) -> Document | None: if sentence["speaker_name"] != current_speaker_name: if current_speaker_name is not None: sections.append( - Section( + TextSection( link=current_link, text=current_text.strip(), ) @@ -71,7 +73,7 @@ def _create_doc_from_transcript(transcript: dict) -> Document | None: # Sometimes these links (links with a timestamp) do not work, it is a bug with Fireflies. 
sections.append( - Section( + TextSection( link=current_link, text=current_text.strip(), ) @@ -94,7 +96,7 @@ def _create_doc_from_transcript(transcript: dict) -> Document | None: return Document( id=fireflies_id, - sections=sections, + sections=cast(list[TextSection | ImageSection], sections), source=DocumentSource.FIREFLIES, semantic_identifier=meeting_title, metadata={}, diff --git a/backend/onyx/connectors/freshdesk/connector.py b/backend/onyx/connectors/freshdesk/connector.py index 96ea5abbe..e68fa669b 100644 --- a/backend/onyx/connectors/freshdesk/connector.py +++ b/backend/onyx/connectors/freshdesk/connector.py @@ -14,7 +14,7 @@ from onyx.connectors.interfaces import PollConnector from onyx.connectors.interfaces import SecondsSinceUnixEpoch from onyx.connectors.models import ConnectorMissingCredentialError from onyx.connectors.models import Document -from onyx.connectors.models import Section +from onyx.connectors.models import TextSection from onyx.file_processing.html_utils import parse_html_page_basic from onyx.utils.logger import setup_logger @@ -133,7 +133,7 @@ def _create_doc_from_ticket(ticket: dict, domain: str) -> Document: return Document( id=_FRESHDESK_ID_PREFIX + link, sections=[ - Section( + TextSection( link=link, text=text, ) diff --git a/backend/onyx/connectors/gitbook/connector.py b/backend/onyx/connectors/gitbook/connector.py index 26f8e1aa3..cf133c9cf 100644 --- a/backend/onyx/connectors/gitbook/connector.py +++ b/backend/onyx/connectors/gitbook/connector.py @@ -13,7 +13,7 @@ from onyx.connectors.interfaces import PollConnector from onyx.connectors.interfaces import SecondsSinceUnixEpoch from onyx.connectors.models import ConnectorMissingCredentialError from onyx.connectors.models import Document -from onyx.connectors.models import Section +from onyx.connectors.models import TextSection from onyx.utils.logger import setup_logger @@ -183,7 +183,7 @@ def _convert_page_to_document( return Document( id=f"gitbook-{space_id}-{page_id}", sections=[ - Section( + TextSection( link=page.get("urls", {}).get("app", ""), text=_extract_text_from_document(page_content), ) diff --git a/backend/onyx/connectors/github/connector.py b/backend/onyx/connectors/github/connector.py index 62a526a72..064613517 100644 --- a/backend/onyx/connectors/github/connector.py +++ b/backend/onyx/connectors/github/connector.py @@ -27,7 +27,7 @@ from onyx.connectors.interfaces import PollConnector from onyx.connectors.interfaces import SecondsSinceUnixEpoch from onyx.connectors.models import ConnectorMissingCredentialError from onyx.connectors.models import Document -from onyx.connectors.models import Section +from onyx.connectors.models import TextSection from onyx.utils.batching import batch_generator from onyx.utils.logger import setup_logger @@ -87,7 +87,9 @@ def _batch_github_objects( def _convert_pr_to_document(pull_request: PullRequest) -> Document: return Document( id=pull_request.html_url, - sections=[Section(link=pull_request.html_url, text=pull_request.body or "")], + sections=[ + TextSection(link=pull_request.html_url, text=pull_request.body or "") + ], source=DocumentSource.GITHUB, semantic_identifier=pull_request.title, # updated_at is UTC time but is timezone unaware, explicitly add UTC @@ -109,7 +111,7 @@ def _fetch_issue_comments(issue: Issue) -> str: def _convert_issue_to_document(issue: Issue) -> Document: return Document( id=issue.html_url, - sections=[Section(link=issue.html_url, text=issue.body or "")], + sections=[TextSection(link=issue.html_url, text=issue.body or "")], 
source=DocumentSource.GITHUB, semantic_identifier=issue.title, # updated_at is UTC time but is timezone unaware diff --git a/backend/onyx/connectors/gitlab/connector.py b/backend/onyx/connectors/gitlab/connector.py index ffd01805b..ecf395b75 100644 --- a/backend/onyx/connectors/gitlab/connector.py +++ b/backend/onyx/connectors/gitlab/connector.py @@ -21,7 +21,7 @@ from onyx.connectors.interfaces import SecondsSinceUnixEpoch from onyx.connectors.models import BasicExpertInfo from onyx.connectors.models import ConnectorMissingCredentialError from onyx.connectors.models import Document -from onyx.connectors.models import Section +from onyx.connectors.models import TextSection from onyx.utils.logger import setup_logger @@ -56,7 +56,7 @@ def get_author(author: Any) -> BasicExpertInfo: def _convert_merge_request_to_document(mr: Any) -> Document: doc = Document( id=mr.web_url, - sections=[Section(link=mr.web_url, text=mr.description or "")], + sections=[TextSection(link=mr.web_url, text=mr.description or "")], source=DocumentSource.GITLAB, semantic_identifier=mr.title, # updated_at is UTC time but is timezone unaware, explicitly add UTC @@ -72,7 +72,7 @@ def _convert_merge_request_to_document(mr: Any) -> Document: def _convert_issue_to_document(issue: Any) -> Document: doc = Document( id=issue.web_url, - sections=[Section(link=issue.web_url, text=issue.description or "")], + sections=[TextSection(link=issue.web_url, text=issue.description or "")], source=DocumentSource.GITLAB, semantic_identifier=issue.title, # updated_at is UTC time but is timezone unaware, explicitly add UTC @@ -99,7 +99,7 @@ def _convert_code_to_document( file_url = f"{url}/{projectOwner}/{projectName}/-/blob/master/{file['path']}" # Construct the file URL doc = Document( id=file["id"], - sections=[Section(link=file_url, text=file_content)], + sections=[TextSection(link=file_url, text=file_content)], source=DocumentSource.GITLAB, semantic_identifier=file["name"], doc_updated_at=datetime.now().replace( diff --git a/backend/onyx/connectors/gmail/connector.py b/backend/onyx/connectors/gmail/connector.py index 177264afd..6391a96aa 100644 --- a/backend/onyx/connectors/gmail/connector.py +++ b/backend/onyx/connectors/gmail/connector.py @@ -1,5 +1,6 @@ from base64 import urlsafe_b64decode from typing import Any +from typing import cast from typing import Dict from google.oauth2.credentials import Credentials as OAuthCredentials # type: ignore @@ -28,8 +29,9 @@ from onyx.connectors.interfaces import SecondsSinceUnixEpoch from onyx.connectors.interfaces import SlimConnector from onyx.connectors.models import BasicExpertInfo from onyx.connectors.models import Document -from onyx.connectors.models import Section +from onyx.connectors.models import ImageSection from onyx.connectors.models import SlimDocument +from onyx.connectors.models import TextSection from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface from onyx.utils.logger import setup_logger from onyx.utils.retry_wrapper import retry_builder @@ -115,7 +117,7 @@ def _get_message_body(payload: dict[str, Any]) -> str: return message_body -def message_to_section(message: Dict[str, Any]) -> tuple[Section, dict[str, str]]: +def message_to_section(message: Dict[str, Any]) -> tuple[TextSection, dict[str, str]]: link = f"https://mail.google.com/mail/u/0/#inbox/{message['id']}" payload = message.get("payload", {}) @@ -142,7 +144,7 @@ def message_to_section(message: Dict[str, Any]) -> tuple[Section, dict[str, str] message_body_text: str = _get_message_body(payload) - 
return Section(link=link, text=message_body_text + message_data), metadata + return TextSection(link=link, text=message_body_text + message_data), metadata def thread_to_document(full_thread: Dict[str, Any]) -> Document | None: @@ -192,7 +194,7 @@ def thread_to_document(full_thread: Dict[str, Any]) -> Document | None: return Document( id=id, semantic_identifier=semantic_identifier, - sections=sections, + sections=cast(list[TextSection | ImageSection], sections), source=DocumentSource.GMAIL, # This is used to perform permission sync primary_owners=primary_owners, diff --git a/backend/onyx/connectors/gong/connector.py b/backend/onyx/connectors/gong/connector.py index 16bc5c2a6..b8fc5026a 100644 --- a/backend/onyx/connectors/gong/connector.py +++ b/backend/onyx/connectors/gong/connector.py @@ -18,7 +18,7 @@ from onyx.connectors.interfaces import PollConnector from onyx.connectors.interfaces import SecondsSinceUnixEpoch from onyx.connectors.models import ConnectorMissingCredentialError from onyx.connectors.models import Document -from onyx.connectors.models import Section +from onyx.connectors.models import TextSection from onyx.utils.logger import setup_logger @@ -243,7 +243,7 @@ class GongConnector(LoadConnector, PollConnector): Document( id=call_id, sections=[ - Section(link=call_metadata["url"], text=transcript_text) + TextSection(link=call_metadata["url"], text=transcript_text) ], source=DocumentSource.GONG, # Should not ever be Untitled as a call cannot be made without a Title diff --git a/backend/onyx/connectors/google_drive/connector.py b/backend/onyx/connectors/google_drive/connector.py index b6fa22739..c3a085b06 100644 --- a/backend/onyx/connectors/google_drive/connector.py +++ b/backend/onyx/connectors/google_drive/connector.py @@ -43,9 +43,7 @@ from onyx.connectors.interfaces import PollConnector from onyx.connectors.interfaces import SecondsSinceUnixEpoch from onyx.connectors.interfaces import SlimConnector from onyx.connectors.models import ConnectorMissingCredentialError -from onyx.connectors.vision_enabled_connector import VisionEnabledConnector from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface -from onyx.llm.interfaces import LLM from onyx.utils.logger import setup_logger from onyx.utils.retry_wrapper import retry_builder @@ -68,7 +66,6 @@ def _convert_single_file( creds: Any, primary_admin_email: str, file: dict[str, Any], - image_analysis_llm: LLM | None, ) -> Any: user_email = file.get("owners", [{}])[0].get("emailAddress") or primary_admin_email user_drive_service = get_drive_service(creds, user_email=user_email) @@ -77,7 +74,6 @@ def _convert_single_file( file=file, drive_service=user_drive_service, docs_service=docs_service, - image_analysis_llm=image_analysis_llm, # pass the LLM so doc_conversion can summarize images ) @@ -116,9 +112,7 @@ def _clean_requested_drive_ids( return valid_requested_drive_ids, filtered_folder_ids -class GoogleDriveConnector( - LoadConnector, PollConnector, SlimConnector, VisionEnabledConnector -): +class GoogleDriveConnector(LoadConnector, PollConnector, SlimConnector): def __init__( self, include_shared_drives: bool = False, @@ -151,9 +145,6 @@ class GoogleDriveConnector( if continue_on_failure is not None: logger.warning("The 'continue_on_failure' parameter is deprecated.") - # Initialize vision LLM using the mixin - self.initialize_vision_llm() - if ( not include_shared_drives and not include_my_drives @@ -539,7 +530,6 @@ class GoogleDriveConnector( _convert_single_file, self.creds, self.primary_admin_email, - 
image_analysis_llm=self.image_analysis_llm, # Use the mixin's LLM ) # Fetch files in batches diff --git a/backend/onyx/connectors/google_drive/doc_conversion.py b/backend/onyx/connectors/google_drive/doc_conversion.py index 6a447aa4f..b1c3d8c1a 100644 --- a/backend/onyx/connectors/google_drive/doc_conversion.py +++ b/backend/onyx/connectors/google_drive/doc_conversion.py @@ -1,40 +1,43 @@ import io from datetime import datetime -from datetime import timezone -from tempfile import NamedTemporaryFile +from typing import cast -import openpyxl # type: ignore -from googleapiclient.discovery import build # type: ignore -from googleapiclient.errors import HttpError # type: ignore +from googleapiclient.http import MediaIoBaseDownload # type: ignore -from onyx.configs.app_configs import CONTINUE_ON_CONNECTOR_FAILURE from onyx.configs.constants import DocumentSource from onyx.configs.constants import FileOrigin from onyx.connectors.google_drive.constants import DRIVE_FOLDER_TYPE from onyx.connectors.google_drive.constants import DRIVE_SHORTCUT_TYPE -from onyx.connectors.google_drive.constants import UNSUPPORTED_FILE_TYPE_CONTENT from onyx.connectors.google_drive.models import GDriveMimeType from onyx.connectors.google_drive.models import GoogleDriveFileType from onyx.connectors.google_drive.section_extraction import get_document_sections from onyx.connectors.google_utils.resources import GoogleDocsService from onyx.connectors.google_utils.resources import GoogleDriveService from onyx.connectors.models import Document -from onyx.connectors.models import Section +from onyx.connectors.models import ImageSection from onyx.connectors.models import SlimDocument +from onyx.connectors.models import TextSection from onyx.db.engine import get_session_with_current_tenant from onyx.file_processing.extract_file_text import docx_to_text_and_images +from onyx.file_processing.extract_file_text import extract_file_text from onyx.file_processing.extract_file_text import pptx_to_text from onyx.file_processing.extract_file_text import read_pdf_file +from onyx.file_processing.extract_file_text import xlsx_to_text from onyx.file_processing.file_validation import is_valid_image_type from onyx.file_processing.image_summarization import summarize_image_with_error_handling from onyx.file_processing.image_utils import store_image_and_create_section -from onyx.file_processing.unstructured import get_unstructured_api_key -from onyx.file_processing.unstructured import unstructured_to_text from onyx.llm.interfaces import LLM from onyx.utils.logger import setup_logger logger = setup_logger() +# Mapping of Google Drive mime types to export formats +GOOGLE_MIME_TYPES_TO_EXPORT = { + GDriveMimeType.DOC.value: "text/plain", + GDriveMimeType.SPREADSHEET.value: "text/csv", + GDriveMimeType.PPT.value: "text/plain", +} + def _summarize_drive_image( image_data: bytes, image_name: str, image_analysis_llm: LLM | None
- """ +) -> list[TextSection | ImageSection]: + """Extract text and images from a Google Drive file.""" + file_id = file["id"] + file_name = file["name"] mime_type = file["mimeType"] - link = file["webViewLink"] - file_name = file.get("name", file["id"]) - supported_file_types = set(item.value for item in GDriveMimeType) - - # 1) If the file is an image, retrieve the raw bytes, optionally summarize - if is_gdrive_image_mime_type(mime_type): - try: - response = service.files().get_media(fileId=file["id"]).execute() - - with get_session_with_current_tenant() as db_session: - section, _ = store_image_and_create_section( - db_session=db_session, - image_data=response, - file_name=file["id"], - display_name=file_name, - media_type=mime_type, - llm=image_analysis_llm, - file_origin=FileOrigin.CONNECTOR, - ) - return [section] - except Exception as e: - logger.warning(f"Failed to fetch or summarize image: {e}") - return [ - Section( - link=link, - text="", - image_file_name=link, - ) - ] - - if mime_type not in supported_file_types: - # Unsupported file types can still have a title, finding this way is still useful - return [Section(link=link, text=UNSUPPORTED_FILE_TYPE_CONTENT)] + link = file.get("webViewLink", "") try: - # --------------------------- - # Google Sheets extraction - if mime_type == GDriveMimeType.SPREADSHEET.value: + # For Google Docs, Sheets, and Slides, export as plain text + if mime_type in GOOGLE_MIME_TYPES_TO_EXPORT: + export_mime_type = GOOGLE_MIME_TYPES_TO_EXPORT[mime_type] + # Use the correct API call for exporting files + request = service.files().export_media( + fileId=file_id, mimeType=export_mime_type + ) + response_bytes = io.BytesIO() + downloader = MediaIoBaseDownload(response_bytes, request) + done = False + while not done: + _, done = downloader.next_chunk() + + response = response_bytes.getvalue() + if not response: + logger.warning(f"Failed to export {file_name} as {export_mime_type}") + return [] + + text = response.decode("utf-8") + return [TextSection(link=link, text=text)] + + # For other file types, download the file + # Use the correct API call for downloading files + request = service.files().get_media(fileId=file_id) + response_bytes = io.BytesIO() + downloader = MediaIoBaseDownload(response_bytes, request) + done = False + while not done: + _, done = downloader.next_chunk() + + response = response_bytes.getvalue() + if not response: + logger.warning(f"Failed to download {file_name}") + return [] + + # Process based on mime type + if mime_type == "text/plain": + text = response.decode("utf-8") + return [TextSection(link=link, text=text)] + + elif ( + mime_type + == "application/vnd.openxmlformats-officedocument.wordprocessingml.document" + ): + text, _ = docx_to_text_and_images(io.BytesIO(response)) + return [TextSection(link=link, text=text)] + + elif ( + mime_type + == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" + ): + text = xlsx_to_text(io.BytesIO(response)) + return [TextSection(link=link, text=text)] + + elif ( + mime_type + == "application/vnd.openxmlformats-officedocument.presentationml.presentation" + ): + text = pptx_to_text(io.BytesIO(response)) + return [TextSection(link=link, text=text)] + + elif is_gdrive_image_mime_type(mime_type): + # For images, store them for later processing + sections: list[TextSection | ImageSection] = [] try: - sheets_service = build( - "sheets", "v4", credentials=service._http.credentials - ) - spreadsheet = ( - sheets_service.spreadsheets() - .get(spreadsheetId=file["id"]) - .execute() - ) 
- - sections = [] - for sheet in spreadsheet["sheets"]: - sheet_name = sheet["properties"]["title"] - sheet_id = sheet["properties"]["sheetId"] - - # Get sheet dimensions - grid_properties = sheet["properties"].get("gridProperties", {}) - row_count = grid_properties.get("rowCount", 1000) - column_count = grid_properties.get("columnCount", 26) - - # Convert column count to letter (e.g., 26 -> Z, 27 -> AA) - end_column = "" - while column_count: - column_count, remainder = divmod(column_count - 1, 26) - end_column = chr(65 + remainder) + end_column - - range_name = f"'{sheet_name}'!A1:{end_column}{row_count}" - - try: - result = ( - sheets_service.spreadsheets() - .values() - .get(spreadsheetId=file["id"], range=range_name) - .execute() - ) - values = result.get("values", []) - - if values: - text = f"Sheet: {sheet_name}\n" - for row in values: - text += "\t".join(str(cell) for cell in row) + "\n" - sections.append( - Section( - link=f"{link}#gid={sheet_id}", - text=text, - ) - ) - except HttpError as e: - logger.warning( - f"Error fetching data for sheet '{sheet_name}': {e}" - ) - continue - return sections - - except Exception as e: - logger.warning( - f"Ran into exception '{e}' when pulling data from Google Sheet '{file['name']}'." - " Falling back to basic extraction." - ) - # --------------------------- - # Microsoft Excel (.xlsx or .xls) extraction branch - elif mime_type in [ - GDriveMimeType.SPREADSHEET_OPEN_FORMAT.value, - GDriveMimeType.SPREADSHEET_MS_EXCEL.value, - ]: - try: - response = service.files().get_media(fileId=file["id"]).execute() - - with NamedTemporaryFile(suffix=".xlsx", delete=True) as tmp: - tmp.write(response) - tmp_path = tmp.name - - section_separator = "\n\n" - workbook = openpyxl.load_workbook(tmp_path, read_only=True) - - # Work similarly to the xlsx_to_text function used for file connector - # but returns Sections instead of a string - sections = [ - Section( - link=link, - text=( - f"Sheet: {sheet.title}\n\n" - + section_separator.join( - ",".join(map(str, row)) - for row in sheet.iter_rows( - min_row=1, values_only=True - ) - if row - ) - ), - ) - for sheet in workbook.worksheets - ] - - return sections - - except Exception as e: - logger.warning( - f"Error extracting data from Excel file '{file['name']}': {e}" - ) - return [ - Section(link=link, text="Error extracting data from Excel file") - ] - - # --------------------------- - # Export for Google Docs, PPT, and fallback for spreadsheets - if mime_type in [ - GDriveMimeType.DOC.value, - GDriveMimeType.PPT.value, - GDriveMimeType.SPREADSHEET.value, - ]: - export_mime_type = ( - "text/plain" - if mime_type != GDriveMimeType.SPREADSHEET.value - else "text/csv" - ) - text = ( - service.files() - .export(fileId=file["id"], mimeType=export_mime_type) - .execute() - .decode("utf-8") - ) - return [Section(link=link, text=text)] - - # --------------------------- - # Plain text and Markdown files - elif mime_type in [ - GDriveMimeType.PLAIN_TEXT.value, - GDriveMimeType.MARKDOWN.value, - ]: - text_data = ( - service.files().get_media(fileId=file["id"]).execute().decode("utf-8") - ) - return [Section(link=link, text=text_data)] - - # --------------------------- - # Word, PowerPoint, PDF files - elif mime_type in [ - GDriveMimeType.WORD_DOC.value, - GDriveMimeType.POWERPOINT.value, - GDriveMimeType.PDF.value, - ]: - response_bytes = service.files().get_media(fileId=file["id"]).execute() - - # Optionally use Unstructured - if get_unstructured_api_key(): - text = unstructured_to_text( - 
-                    file=io.BytesIO(response_bytes),
-                    file_name=file_name,
-                )
-                return [Section(link=link, text=text)]
-
-            if mime_type == GDriveMimeType.WORD_DOC.value:
-                # Use docx_to_text_and_images to get text plus embedded images
-                text, embedded_images = docx_to_text_and_images(
-                    file=io.BytesIO(response_bytes),
-                )
-                sections = []
-                if text.strip():
-                    sections.append(Section(link=link, text=text.strip()))
-
-                # Process each embedded image using the standardized function
                 with get_session_with_current_tenant() as db_session:
-                    for idx, (img_data, img_name) in enumerate(
-                        embedded_images, start=1
-                    ):
-                        # Create a unique identifier for the embedded image
-                        embedded_id = f"{file['id']}_embedded_{idx}"
+                    section, embedded_id = store_image_and_create_section(
+                        db_session=db_session,
+                        image_data=response,
+                        file_name=file_id,
+                        display_name=file_name,
+                        media_type=mime_type,
+                        file_origin=FileOrigin.CONNECTOR,
+                        link=link,
+                    )
+                    sections.append(section)
+            except Exception as e:
+                logger.error(f"Failed to process image {file_name}: {e}")
+            return sections
 
-                        section, _ = store_image_and_create_section(
+        elif mime_type == "application/pdf":
+            text, _pdf_meta, images = read_pdf_file(io.BytesIO(response))
+            pdf_sections: list[TextSection | ImageSection] = [
+                TextSection(link=link, text=text)
+            ]
+
+            # Process embedded images in the PDF
+            try:
+                with get_session_with_current_tenant() as db_session:
+                    for idx, (img_data, img_name) in enumerate(images):
+                        section, embedded_id = store_image_and_create_section(
                             db_session=db_session,
                             image_data=img_data,
-                            file_name=embedded_id,
+                            file_name=f"{file_id}_img_{idx}",
                             display_name=img_name or f"{file_name} - image {idx}",
-                            llm=image_analysis_llm,
                             file_origin=FileOrigin.CONNECTOR,
                         )
-                        sections.append(section)
-                return sections
+                        pdf_sections.append(section)
+            except Exception as e:
+                logger.error(f"Failed to process PDF images in {file_name}: {e}")
+            return pdf_sections
 
-            elif mime_type == GDriveMimeType.PDF.value:
-                text, _pdf_meta, images = read_pdf_file(io.BytesIO(response_bytes))
-                return [Section(link=link, text=text)]
-
-            elif mime_type == GDriveMimeType.POWERPOINT.value:
-                text_data = pptx_to_text(io.BytesIO(response_bytes))
-                return [Section(link=link, text=text_data)]
-
-        # Catch-all case, should not happen since there should be specific handling
-        # for each of the supported file types
-        error_message = f"Unsupported file type: {mime_type}"
-        logger.error(error_message)
-        raise ValueError(error_message)
+        else:
+            # For unsupported file types, try to extract text
+            try:
+                text = extract_file_text(io.BytesIO(response), file_name)
+                return [TextSection(link=link, text=text)]
+            except Exception as e:
+                logger.warning(f"Failed to extract text from {file_name}: {e}")
+                return []
 
     except Exception as e:
-        logger.exception(f"Error extracting sections from file: {e}")
-        return [Section(link=link, text=UNSUPPORTED_FILE_TYPE_CONTENT)]
+        logger.error(f"Error processing file {file_name}: {e}")
+        return []
 
 
 def convert_drive_item_to_document(
     file: GoogleDriveFileType,
     drive_service: GoogleDriveService,
     docs_service: GoogleDocsService,
-    image_analysis_llm: LLM | None,
 ) -> Document | None:
     """
     Main entry point for converting a Google Drive file => Document object.
-    Now we accept an optional `llm` to pass to `_extract_sections_basic`.
""" try: # skip shortcuts or folders @@ -327,44 +215,50 @@ def convert_drive_item_to_document( return None # If it's a Google Doc, we might do advanced parsing - sections: list[Section] = [] + sections: list[TextSection | ImageSection] = [] + + # Try to get sections using the advanced method first if file.get("mimeType") == GDriveMimeType.DOC.value: try: - # get_document_sections is the advanced approach for Google Docs - sections = get_document_sections(docs_service, file["id"]) + doc_sections = get_document_sections( + docs_service=docs_service, doc_id=file.get("id", "") + ) + if doc_sections: + sections = cast(list[TextSection | ImageSection], doc_sections) except Exception as e: logger.warning( - f"Failed to pull google doc sections from '{file['name']}': {e}. " - "Falling back to basic extraction." + f"Error in advanced parsing: {e}. Falling back to basic extraction." ) - # If not a doc, or if we failed above, do our 'basic' approach + # If we don't have sections yet, use the basic extraction method if not sections: - sections = _extract_sections_basic(file, drive_service, image_analysis_llm) + sections = _extract_sections_basic(file, drive_service) + # If we still don't have any sections, skip this file if not sections: + logger.warning(f"No content extracted from {file.get('name')}. Skipping.") return None doc_id = file["webViewLink"] - updated_time = datetime.fromisoformat(file["modifiedTime"]).astimezone( - timezone.utc - ) + # Create the document return Document( id=doc_id, sections=sections, source=DocumentSource.GOOGLE_DRIVE, - semantic_identifier=file["name"], - doc_updated_at=updated_time, - metadata={}, # or any metadata from 'file' - additional_info=file.get("id"), + semantic_identifier=file.get("name", ""), + metadata={ + "owner_names": ", ".join( + owner.get("displayName", "") for owner in file.get("owners", []) + ), + }, + doc_updated_at=datetime.fromisoformat( + file.get("modifiedTime", "").replace("Z", "+00:00") + ), ) - except Exception as e: - logger.exception(f"Error converting file '{file.get('name')}' to Document: {e}") - if not CONTINUE_ON_CONNECTOR_FAILURE: - raise - return None + logger.error(f"Error converting file {file.get('name')}: {e}") + return None def build_slim_document(file: GoogleDriveFileType) -> SlimDocument | None: diff --git a/backend/onyx/connectors/google_drive/section_extraction.py b/backend/onyx/connectors/google_drive/section_extraction.py index 136a05eaf..0c63eba5c 100644 --- a/backend/onyx/connectors/google_drive/section_extraction.py +++ b/backend/onyx/connectors/google_drive/section_extraction.py @@ -3,7 +3,7 @@ from typing import Any from pydantic import BaseModel from onyx.connectors.google_utils.resources import GoogleDocsService -from onyx.connectors.models import Section +from onyx.connectors.models import TextSection class CurrentHeading(BaseModel): @@ -37,7 +37,7 @@ def _extract_text_from_paragraph(paragraph: dict[str, Any]) -> str: def get_document_sections( docs_service: GoogleDocsService, doc_id: str, -) -> list[Section]: +) -> list[TextSection]: """Extracts sections from a Google Doc, including their headings and content""" # Fetch the document structure doc = docs_service.documents().get(documentId=doc_id).execute() @@ -45,7 +45,7 @@ def get_document_sections( # Get the content content = doc.get("body", {}).get("content", []) - sections: list[Section] = [] + sections: list[TextSection] = [] current_section: list[str] = [] current_heading: CurrentHeading | None = None @@ -70,7 +70,7 @@ def get_document_sections( heading_text = 
diff --git a/backend/onyx/connectors/google_drive/section_extraction.py b/backend/onyx/connectors/google_drive/section_extraction.py
index 136a05eaf..0c63eba5c 100644
--- a/backend/onyx/connectors/google_drive/section_extraction.py
+++ b/backend/onyx/connectors/google_drive/section_extraction.py
@@ -3,7 +3,7 @@ from typing import Any
 from pydantic import BaseModel
 
 from onyx.connectors.google_utils.resources import GoogleDocsService
-from onyx.connectors.models import Section
+from onyx.connectors.models import TextSection
 
 
 class CurrentHeading(BaseModel):
@@ -37,7 +37,7 @@ def _extract_text_from_paragraph(paragraph: dict[str, Any]) -> str:
 def get_document_sections(
     docs_service: GoogleDocsService,
     doc_id: str,
-) -> list[Section]:
+) -> list[TextSection]:
     """Extracts sections from a Google Doc, including their headings and content"""
     # Fetch the document structure
     doc = docs_service.documents().get(documentId=doc_id).execute()
@@ -45,7 +45,7 @@ def get_document_sections(
     # Get the content
     content = doc.get("body", {}).get("content", [])
 
-    sections: list[Section] = []
+    sections: list[TextSection] = []
     current_section: list[str] = []
     current_heading: CurrentHeading | None = None
@@ -70,7 +70,7 @@ def get_document_sections(
                 heading_text = current_heading.text
                 section_text = f"{heading_text}\n" + "\n".join(current_section)
                 sections.append(
-                    Section(
+                    TextSection(
                         text=section_text.strip(),
                         link=_build_gdoc_section_link(doc_id, current_heading.id),
                     )
                 )
@@ -96,7 +96,7 @@ def get_document_sections(
     if current_heading is not None and current_section:
         section_text = f"{current_heading.text}\n" + "\n".join(current_section)
         sections.append(
-            Section(
+            TextSection(
                 text=section_text.strip(),
                 link=_build_gdoc_section_link(doc_id, current_heading.id),
             )
         )
diff --git a/backend/onyx/connectors/google_site/connector.py b/backend/onyx/connectors/google_site/connector.py
index d38fee6ea..75e990af9 100644
--- a/backend/onyx/connectors/google_site/connector.py
+++ b/backend/onyx/connectors/google_site/connector.py
@@ -12,7 +12,7 @@ from onyx.configs.constants import DocumentSource
 from onyx.connectors.interfaces import GenerateDocumentsOutput
 from onyx.connectors.interfaces import LoadConnector
 from onyx.connectors.models import Document
-from onyx.connectors.models import Section
+from onyx.connectors.models import TextSection
 from onyx.db.engine import get_sqlalchemy_engine
 from onyx.file_processing.extract_file_text import load_files_from_zip
 from onyx.file_processing.extract_file_text import read_text_file
@@ -118,7 +118,7 @@ class GoogleSitesConnector(LoadConnector):
                     source=DocumentSource.GOOGLE_SITES,
                     semantic_identifier=title,
                     sections=[
-                        Section(
+                        TextSection(
                             link=(self.base_url.rstrip("/") + "/" + path.lstrip("/"))
                             if path
                             else "",
diff --git a/backend/onyx/connectors/guru/connector.py b/backend/onyx/connectors/guru/connector.py
index b9f3bf597..4b488ef82 100644
--- a/backend/onyx/connectors/guru/connector.py
+++ b/backend/onyx/connectors/guru/connector.py
@@ -15,7 +15,7 @@ from onyx.connectors.interfaces import SecondsSinceUnixEpoch
 from onyx.connectors.models import BasicExpertInfo
 from onyx.connectors.models import ConnectorMissingCredentialError
 from onyx.connectors.models import Document
-from onyx.connectors.models import Section
+from onyx.connectors.models import TextSection
 from onyx.file_processing.html_utils import parse_html_page_basic
 from onyx.utils.logger import setup_logger
@@ -120,7 +120,7 @@ class GuruConnector(LoadConnector, PollConnector):
             doc_batch.append(
                 Document(
                     id=card["id"],
-                    sections=[Section(link=link, text=content_text)],
+                    sections=[TextSection(link=link, text=content_text)],
                     source=DocumentSource.GURU,
                     semantic_identifier=title,
                     doc_updated_at=latest_time,
diff --git a/backend/onyx/connectors/hubspot/connector.py b/backend/onyx/connectors/hubspot/connector.py
index 2e47f4c1d..88bfbb3ac 100644
--- a/backend/onyx/connectors/hubspot/connector.py
+++ b/backend/onyx/connectors/hubspot/connector.py
@@ -13,7 +13,7 @@ from onyx.connectors.interfaces import PollConnector
 from onyx.connectors.interfaces import SecondsSinceUnixEpoch
 from onyx.connectors.models import ConnectorMissingCredentialError
 from onyx.connectors.models import Document
-from onyx.connectors.models import Section
+from onyx.connectors.models import TextSection
 from onyx.utils.logger import setup_logger
 
 HUBSPOT_BASE_URL = "https://app.hubspot.com/contacts/"
@@ -108,7 +108,7 @@ class HubSpotConnector(LoadConnector, PollConnector):
             doc_batch.append(
                 Document(
                     id=ticket.id,
-                    sections=[Section(link=link, text=content_text)],
+                    sections=[TextSection(link=link, text=content_text)],
                     source=DocumentSource.HUBSPOT,
                     semantic_identifier=title,
                     # Is already in tzutc, just replacing the timezone format
diff --git a/backend/onyx/connectors/interfaces.py b/backend/onyx/connectors/interfaces.py
index 0b2f8b661..683881853 100644
--- a/backend/onyx/connectors/interfaces.py
+++ b/backend/onyx/connectors/interfaces.py
@@ -24,6 +24,8 @@ CheckpointOutput = Generator[Document | ConnectorFailure, None, ConnectorCheckpo
 
 class BaseConnector(abc.ABC):
     REDIS_KEY_PREFIX = "da_connector_data:"
+    # Common image file extensions supported across connectors
+    IMAGE_EXTENSIONS = {".jpg", ".jpeg", ".png", ".webp", ".gif"}
 
     @abc.abstractmethod
     def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
diff --git a/backend/onyx/connectors/linear/connector.py b/backend/onyx/connectors/linear/connector.py
index a3b8a59d6..24b6fdd58 100644
--- a/backend/onyx/connectors/linear/connector.py
+++ b/backend/onyx/connectors/linear/connector.py
@@ -21,7 +21,8 @@ from onyx.connectors.interfaces import PollConnector
 from onyx.connectors.interfaces import SecondsSinceUnixEpoch
 from onyx.connectors.models import ConnectorMissingCredentialError
 from onyx.connectors.models import Document
-from onyx.connectors.models import Section
+from onyx.connectors.models import ImageSection
+from onyx.connectors.models import TextSection
 from onyx.utils.logger import setup_logger
 from onyx.utils.retry_wrapper import request_with_retries
@@ -237,22 +238,30 @@ class LinearConnector(LoadConnector, PollConnector, OAuthConnector):
         documents: list[Document] = []
         for edge in edges:
             node = edge["node"]
+            # Create sections for description and comments
+            sections = [
+                TextSection(
+                    link=node["url"],
+                    text=node["description"] or "",
+                )
+            ]
+
+            # Add comment sections
+            for comment in node["comments"]["nodes"]:
+                sections.append(
+                    TextSection(
+                        link=node["url"],
+                        text=comment["body"] or "",
+                    )
+                )
+
+            # Cast the sections list to the expected type
+            typed_sections = cast(list[TextSection | ImageSection], sections)
+
             documents.append(
                 Document(
                     id=node["id"],
-                    sections=[
-                        Section(
-                            link=node["url"],
-                            text=node["description"] or "",
-                        )
-                    ]
-                    + [
-                        Section(
-                            link=node["url"],
-                            text=comment["body"] or "",
-                        )
-                        for comment in node["comments"]["nodes"]
-                    ],
+                    sections=typed_sections,
                     source=DocumentSource.LINEAR,
                     semantic_identifier=f"[{node['identifier']}] {node['title']}",
                     title=node["title"],
diff --git a/backend/onyx/connectors/loopio/connector.py b/backend/onyx/connectors/loopio/connector.py
index acf8c9123..93e639006 100644
--- a/backend/onyx/connectors/loopio/connector.py
+++ b/backend/onyx/connectors/loopio/connector.py
@@ -17,7 +17,7 @@ from onyx.connectors.interfaces import SecondsSinceUnixEpoch
 from onyx.connectors.models import BasicExpertInfo
 from onyx.connectors.models import ConnectorMissingCredentialError
 from onyx.connectors.models import Document
-from onyx.connectors.models import Section
+from onyx.connectors.models import TextSection
 from onyx.file_processing.html_utils import parse_html_page_basic
 from onyx.file_processing.html_utils import strip_excessive_newlines_and_spaces
 from onyx.utils.logger import setup_logger
@@ -162,7 +162,7 @@ class LoopioConnector(LoadConnector, PollConnector):
             doc_batch.append(
                 Document(
                     id=str(entry["id"]),
-                    sections=[Section(link=link, text=content_text)],
+                    sections=[TextSection(link=link, text=content_text)],
                     source=DocumentSource.LOOPIO,
                     semantic_identifier=questions[0],
                     doc_updated_at=latest_time,
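
A note on the `cast(list[TextSection | ImageSection], ...)` calls that recur in this PR (Airtable, Linear, MediaWiki, Salesforce): they exist for the type checker, not the runtime. `list` is invariant, so mypy rejects a `list[TextSection]` where a `list[TextSection | ImageSection]` is expected, even though every element is compatible. A minimal sketch of the problem and the workaround:

    from typing import cast

    from onyx.connectors.models import ImageSection, TextSection

    sections: list[TextSection] = [TextSection(text="hello", link=None)]


    def index_sections(sections: list[TextSection | ImageSection]) -> int:
        return len(sections)


    # index_sections(sections) would be flagged by mypy (list is invariant),
    # so the connectors up-cast explicitly; nothing changes at runtime.
    index_sections(cast(list[TextSection | ImageSection], sections))

Declaring the variable as `list[TextSection | ImageSection]` up front, or accepting `Sequence[...]` (which is covariant) in the callee, would make most of these casts unnecessary.
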
diff --git a/backend/onyx/connectors/mediawiki/wiki.py b/backend/onyx/connectors/mediawiki/wiki.py
index 5e11d438a..479615861 100644
--- a/backend/onyx/connectors/mediawiki/wiki.py
+++ b/backend/onyx/connectors/mediawiki/wiki.py
@@ -6,6 +6,7 @@ import tempfile
 from collections.abc import Generator
 from collections.abc import Iterator
 from typing import Any
+from typing import cast
 from typing import ClassVar
 
 import pywikibot.time  # type: ignore[import-untyped]
@@ -20,7 +21,8 @@ from onyx.connectors.interfaces import PollConnector
 from onyx.connectors.interfaces import SecondsSinceUnixEpoch
 from onyx.connectors.mediawiki.family import family_class_dispatch
 from onyx.connectors.models import Document
-from onyx.connectors.models import Section
+from onyx.connectors.models import ImageSection
+from onyx.connectors.models import TextSection
 from onyx.utils.logger import setup_logger
@@ -60,14 +62,14 @@ def get_doc_from_page(
     sections_extracted: textlib.Content = textlib.extract_sections(page_text, site)
 
     sections = [
-        Section(
+        TextSection(
             link=f"{page.full_url()}#" + section.heading.replace(" ", "_"),
             text=section.title + section.content,
         )
         for section in sections_extracted.sections
     ]
     sections.append(
-        Section(
+        TextSection(
             link=page.full_url(),
             text=sections_extracted.header,
         )
     )
@@ -79,7 +81,7 @@ def get_doc_from_page(
         doc_updated_at=pywikibot_timestamp_to_utc_datetime(
             page.latest_revision.timestamp
         ),
-        sections=sections,
+        sections=cast(list[TextSection | ImageSection], sections),
         semantic_identifier=page.title(),
         metadata={"categories": [category.title() for category in page.categories()]},
         id=f"MEDIAWIKI_{page.pageid}_{page.full_url()}",
diff --git a/backend/onyx/connectors/models.py b/backend/onyx/connectors/models.py
index 6c5ba7691..00335cded 100644
--- a/backend/onyx/connectors/models.py
+++ b/backend/onyx/connectors/models.py
@@ -28,9 +28,25 @@ class ConnectorMissingCredentialError(PermissionError):
 
 
 class Section(BaseModel):
+    """Base section class with common attributes"""
+
+    link: str | None = None
+    text: str | None = None
+    image_file_name: str | None = None
+
+
+class TextSection(Section):
+    """Section containing text content"""
+
     text: str
     link: str | None = None
-    image_file_name: str | None = None
+
+
+class ImageSection(Section):
+    """Section containing an image reference"""
+
+    image_file_name: str
+    link: str | None = None
 
 
 class BasicExpertInfo(BaseModel):
@@ -100,7 +116,7 @@ class DocumentBase(BaseModel):
     """Used for Onyx ingestion api, the ID is inferred before use if not provided"""
 
     id: str | None = None
-    sections: list[Section]
+    sections: list[TextSection | ImageSection]
     source: DocumentSource | None = None
     semantic_identifier: str  # displayed in the UI as the main identifier for the doc
     metadata: dict[str, str | list[str]]
@@ -150,18 +166,10 @@ class DocumentBase(BaseModel):
 
 
 class Document(DocumentBase):
-    id: str  # This must be unique or during indexing/reindexing, chunks will be overwritten
-    source: DocumentSource
+    """Used for Onyx ingestion api, the ID is required"""
 
-    def get_total_char_length(self) -> int:
-        """Calculate the total character length of the document including sections, metadata, and identifiers."""
-        section_length = sum(len(section.text) for section in self.sections)
-        identifier_length = len(self.semantic_identifier) + len(self.title or "")
-        metadata_length = sum(
-            len(k) + len(v) if isinstance(v, str) else len(k) + sum(len(x) for x in v)
-            for k, v in self.metadata.items()
-        )
-        return section_length + identifier_length + metadata_length
+    id: str
+    source: DocumentSource
 
     def to_short_descriptor(self) -> str:
         """Used when logging the identity of a document"""
@@ -185,6 +193,32 @@ class Document(DocumentBase):
         )
 
 
+class IndexingDocument(Document):
+    """Document with processed sections for indexing"""
+
+    processed_sections: list[Section] = []
+
+    def get_total_char_length(self) -> int:
+        """Get the total character length of the document including processed sections"""
+        title_len = len(self.title or self.semantic_identifier)
+
+        # Use processed_sections if available, otherwise fall back to original sections
+        if self.processed_sections:
+            section_len = sum(
+                len(section.text) if section.text is not None else 0
+                for section in self.processed_sections
+            )
+        else:
+            section_len = sum(
+                len(section.text)
+                if isinstance(section, TextSection) and section.text is not None
+                else 0
+                for section in self.sections
+            )
+
+        return title_len + section_len
+
+
 class SlimDocument(BaseModel):
     id: str
     perm_sync_data: Any | None = None
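
With the model split above in place, a connector-side sketch of the two section shapes (field values are illustrative; `semantic_identifier` and `metadata` are required by DocumentBase):

    from onyx.configs.constants import DocumentSource
    from onyx.connectors.models import Document, ImageSection, TextSection

    doc = Document(
        id="example__record_1",
        source=DocumentSource.WEB,
        semantic_identifier="Example record",
        metadata={},
        sections=[
            # TextSection requires text; link stays optional
            TextSection(text="Body text for the record", link="https://example.com/1"),
            # ImageSection requires only a stored-file reference; its text is
            # produced later by the indexing pipeline's summarization step
            ImageSection(image_file_name="example__record_1_img_0"),
        ],
    )

    # Downstream code discriminates with isinstance, as this PR does throughout
    text_len = sum(len(s.text) for s in doc.sections if isinstance(s, TextSection))

Note how `IndexingDocument.get_total_char_length` above uses the same isinstance fallback whenever `processed_sections` has not been populated yet.
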
diff --git a/backend/onyx/connectors/notion/connector.py b/backend/onyx/connectors/notion/connector.py
index 0a55cb226..678dd32e4 100644
--- a/backend/onyx/connectors/notion/connector.py
+++ b/backend/onyx/connectors/notion/connector.py
@@ -25,7 +25,7 @@ from onyx.connectors.interfaces import PollConnector
 from onyx.connectors.interfaces import SecondsSinceUnixEpoch
 from onyx.connectors.models import ConnectorMissingCredentialError
 from onyx.connectors.models import Document
-from onyx.connectors.models import Section
+from onyx.connectors.models import TextSection
 from onyx.utils.batching import batch_generator
 from onyx.utils.logger import setup_logger
@@ -475,7 +475,7 @@ class NotionConnector(LoadConnector, PollConnector):
                 Document(
                     id=page.id,
                     sections=[
-                        Section(
+                        TextSection(
                             link=f"{page.url}#{block.id.replace('-', '')}",
                             text=block.prefix + block.text,
                         )
diff --git a/backend/onyx/connectors/onyx_jira/connector.py b/backend/onyx/connectors/onyx_jira/connector.py
index 1c784b5aa..30caf3ea5 100644
--- a/backend/onyx/connectors/onyx_jira/connector.py
+++ b/backend/onyx/connectors/onyx_jira/connector.py
@@ -23,8 +23,8 @@ from onyx.connectors.interfaces import SecondsSinceUnixEpoch
 from onyx.connectors.interfaces import SlimConnector
 from onyx.connectors.models import ConnectorMissingCredentialError
 from onyx.connectors.models import Document
-from onyx.connectors.models import Section
 from onyx.connectors.models import SlimDocument
+from onyx.connectors.models import TextSection
 from onyx.connectors.onyx_jira.utils import best_effort_basic_expert_info
 from onyx.connectors.onyx_jira.utils import best_effort_get_field_from_issue
 from onyx.connectors.onyx_jira.utils import build_jira_client
@@ -145,7 +145,7 @@ def fetch_jira_issues_batch(
         yield Document(
             id=page_url,
-            sections=[Section(link=page_url, text=ticket_content)],
+            sections=[TextSection(link=page_url, text=ticket_content)],
             source=DocumentSource.JIRA,
             semantic_identifier=f"{issue.key}: {issue.fields.summary}",
             title=f"{issue.key} {issue.fields.summary}",
diff --git a/backend/onyx/connectors/productboard/connector.py b/backend/onyx/connectors/productboard/connector.py
index 1f8e8adaf..3bb58b7e4 100644
--- a/backend/onyx/connectors/productboard/connector.py
+++ b/backend/onyx/connectors/productboard/connector.py
@@ -16,7 +16,7 @@ from onyx.connectors.interfaces import PollConnector
 from onyx.connectors.interfaces import SecondsSinceUnixEpoch
 from onyx.connectors.models import BasicExpertInfo
 from onyx.connectors.models import Document
-from onyx.connectors.models import Section
+from onyx.connectors.models import TextSection
 from onyx.utils.logger import setup_logger
@@ -110,7 +110,7 @@ class ProductboardConnector(PollConnector):
             yield Document(
                 id=feature["id"],
                 sections=[
-                    Section(
+                    TextSection(
                         link=feature["links"]["html"],
                         text=self._parse_description_html(feature["description"]),
                     )
@@ -133,7 +133,7 @@ class ProductboardConnector(PollConnector):
             yield Document(
                 id=component["id"],
                 sections=[
-                    Section(
+                    TextSection(
                         link=component["links"]["html"],
                         text=self._parse_description_html(component["description"]),
                     )
@@ -159,7 +159,7 @@ class ProductboardConnector(PollConnector):
             yield Document(
                 id=product["id"],
                 sections=[
-                    Section(
+                    TextSection(
                         link=product["links"]["html"],
                         text=self._parse_description_html(product["description"]),
                    )
@@ -189,7 +189,7 @@ class ProductboardConnector(PollConnector):
             yield Document(
                 id=objective["id"],
                 sections=[
-                    Section(
+                    TextSection(
                         link=objective["links"]["html"],
                         text=self._parse_description_html(objective["description"]),
                     )
diff --git a/backend/onyx/connectors/salesforce/connector.py b/backend/onyx/connectors/salesforce/connector.py
index 0c7437971..6fa17c327 100644
--- a/backend/onyx/connectors/salesforce/connector.py
+++ b/backend/onyx/connectors/salesforce/connector.py
@@ -13,6 +13,7 @@ from onyx.connectors.interfaces import SlimConnector
 from onyx.connectors.models import ConnectorMissingCredentialError
 from onyx.connectors.models import Document
 from onyx.connectors.models import SlimDocument
+from onyx.connectors.models import TextSection
 from onyx.connectors.salesforce.doc_conversion import convert_sf_object_to_doc
 from onyx.connectors.salesforce.doc_conversion import ID_PREFIX
 from onyx.connectors.salesforce.salesforce_calls import fetch_all_csvs_in_parallel
@@ -218,7 +219,8 @@ if __name__ == "__main__":
         for doc in doc_batch:
             section_count += len(doc.sections)
             for section in doc.sections:
-                text_count += len(section.text)
+                if isinstance(section, TextSection) and section.text is not None:
+                    text_count += len(section.text)
     end_time = time.time()
 
     print(f"Doc count: {doc_count}")
diff --git a/backend/onyx/connectors/salesforce/doc_conversion.py b/backend/onyx/connectors/salesforce/doc_conversion.py
index c9f48a9ec..1e83b5dd1 100644
--- a/backend/onyx/connectors/salesforce/doc_conversion.py
+++ b/backend/onyx/connectors/salesforce/doc_conversion.py
@@ -1,10 +1,12 @@ import re
+from typing import cast
 
 from onyx.configs.constants import DocumentSource
 from onyx.connectors.cross_connector_utils.miscellaneous_utils import time_str_to_utc
 from onyx.connectors.models import BasicExpertInfo
 from onyx.connectors.models import Document
-from onyx.connectors.models import Section
+from onyx.connectors.models import ImageSection
+from onyx.connectors.models import TextSection
 from onyx.connectors.salesforce.sqlite_functions import get_child_ids
 from onyx.connectors.salesforce.sqlite_functions import get_record
 from onyx.connectors.salesforce.utils import SalesforceObject
@@ -114,8 +116,8 @@ def _extract_dict_text(raw_dict: dict) -> str:
     return natural_language_for_dict
 
 
-def _extract_section(salesforce_object: SalesforceObject, base_url: str) -> Section:
-    return Section(
+def _extract_section(salesforce_object: SalesforceObject, base_url: str) -> TextSection:
+    return TextSection(
         text=_extract_dict_text(salesforce_object.data),
         link=f"{base_url}/{salesforce_object.id}",
     )
@@ -175,7 +177,7 @@ def convert_sf_object_to_doc(
     doc = Document(
         id=onyx_salesforce_id,
-        sections=sections,
+        sections=cast(list[TextSection | ImageSection], sections),
         source=DocumentSource.SALESFORCE,
         semantic_identifier=extracted_semantic_identifier,
         doc_updated_at=extracted_doc_updated_at,
diff --git a/backend/onyx/connectors/sharepoint/connector.py b/backend/onyx/connectors/sharepoint/connector.py
index f7e425149..5e35bf871 100644
--- a/backend/onyx/connectors/sharepoint/connector.py
+++ b/backend/onyx/connectors/sharepoint/connector.py
@@ -19,7 +19,7 @@ from onyx.connectors.interfaces import SecondsSinceUnixEpoch
 from onyx.connectors.models import BasicExpertInfo
 from onyx.connectors.models import ConnectorMissingCredentialError
 from onyx.connectors.models import Document
-from onyx.connectors.models import Section
+from onyx.connectors.models import TextSection
 from onyx.file_processing.extract_file_text import extract_file_text
 from onyx.utils.logger import setup_logger
@@ -55,7 +55,7 @@ def _convert_driveitem_to_document(
     doc = Document(
         id=driveitem.id,
-        sections=[Section(link=driveitem.web_url, text=file_text)],
+        sections=[TextSection(link=driveitem.web_url, text=file_text)],
         source=DocumentSource.SHAREPOINT,
         semantic_identifier=driveitem.name,
         doc_updated_at=driveitem.last_modified_datetime.replace(tzinfo=timezone.utc),
diff --git a/backend/onyx/connectors/slab/connector.py b/backend/onyx/connectors/slab/connector.py
index 18b2500a5..cd508f212 100644
--- a/backend/onyx/connectors/slab/connector.py
+++ b/backend/onyx/connectors/slab/connector.py
@@ -19,8 +19,8 @@ from onyx.connectors.interfaces import SecondsSinceUnixEpoch
 from onyx.connectors.interfaces import SlimConnector
 from onyx.connectors.models import ConnectorMissingCredentialError
 from onyx.connectors.models import Document
-from onyx.connectors.models import Section
 from onyx.connectors.models import SlimDocument
+from onyx.connectors.models import TextSection
 from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface
 from onyx.utils.logger import setup_logger
@@ -212,7 +212,7 @@ class SlabConnector(LoadConnector, PollConnector, SlimConnector):
             doc_batch.append(
                 Document(
                     id=post_id,  # can't be url as this changes with the post title
-                    sections=[Section(link=page_url, text=content_text)],
+                    sections=[TextSection(link=page_url, text=content_text)],
                     source=DocumentSource.SLAB,
                     semantic_identifier=post["title"],
                     metadata={},
diff --git a/backend/onyx/connectors/slack/connector.py b/backend/onyx/connectors/slack/connector.py
index eda30a015..ff6ea80c7 100644
--- a/backend/onyx/connectors/slack/connector.py
+++ b/backend/onyx/connectors/slack/connector.py
@@ -34,8 +34,8 @@ from onyx.connectors.models import ConnectorMissingCredentialError
 from onyx.connectors.models import Document
 from onyx.connectors.models import DocumentFailure
 from onyx.connectors.models import EntityFailure
-from onyx.connectors.models import Section
 from onyx.connectors.models import SlimDocument
+from onyx.connectors.models import TextSection
 from onyx.connectors.slack.utils import expert_info_from_slack_id
 from onyx.connectors.slack.utils import get_message_link
 from onyx.connectors.slack.utils import make_paginated_slack_api_call_w_retries
@@ -211,7 +211,7 @@ def thread_to_doc(
     return Document(
         id=_build_doc_id(channel_id=channel_id, thread_ts=thread[0]["ts"]),
         sections=[
-            Section(
+            TextSection(
                 link=get_message_link(event=m, client=client, channel_id=channel_id),
                 text=slack_cleaner.index_clean(cast(str, m["text"])),
             )
diff --git a/backend/onyx/connectors/teams/connector.py b/backend/onyx/connectors/teams/connector.py
index f5bea12fd..165164e57 100644
--- a/backend/onyx/connectors/teams/connector.py
+++ b/backend/onyx/connectors/teams/connector.py
@@ -24,7 +24,7 @@ from onyx.connectors.interfaces import SecondsSinceUnixEpoch
 from onyx.connectors.models import BasicExpertInfo
 from onyx.connectors.models import ConnectorMissingCredentialError
 from onyx.connectors.models import Document
-from onyx.connectors.models import Section
+from onyx.connectors.models import TextSection
 from onyx.file_processing.html_utils import parse_html_page_basic
 from onyx.utils.logger import setup_logger
@@ -165,7 +165,7 @@ def _convert_thread_to_document(
     doc = Document(
         id=post_id,
-        sections=[Section(link=web_url, text=thread_text)],
+        sections=[TextSection(link=web_url, text=thread_text)],
         source=DocumentSource.TEAMS,
         semantic_identifier=semantic_string,
         title="",  # teams threads don't really have a "title"
diff --git a/backend/onyx/connectors/vision_enabled_connector.py b/backend/onyx/connectors/vision_enabled_connector.py
deleted file mode 100644
index 03b523f26..000000000
--- a/backend/onyx/connectors/vision_enabled_connector.py
+++ /dev/null
@@ -1,46 +0,0 @@
-"""
-Mixin for connectors that need vision capabilities.
-"""
-from onyx.configs.llm_configs import get_image_extraction_and_analysis_enabled
-from onyx.llm.factory import get_default_llm_with_vision
-from onyx.llm.interfaces import LLM
-from onyx.utils.logger import setup_logger
-
-logger = setup_logger()
-
-
-class VisionEnabledConnector:
-    """
-    Mixin for connectors that need vision capabilities.
-
-    This mixin provides a standard way to initialize a vision-capable LLM
-    for image analysis during indexing.
-
-    Usage:
-        class MyConnector(LoadConnector, VisionEnabledConnector):
-            def __init__(self, ...):
-                super().__init__(...)
-                self.initialize_vision_llm()
-    """
-
-    def initialize_vision_llm(self) -> None:
-        """
-        Initialize a vision-capable LLM if enabled by configuration.
-
-        Sets self.image_analysis_llm to the LLM instance or None if disabled.
-        """
-        self.image_analysis_llm: LLM | None = None
-
-        if get_image_extraction_and_analysis_enabled():
-            try:
-                self.image_analysis_llm = get_default_llm_with_vision()
-                if self.image_analysis_llm is None:
-                    logger.warning(
-                        "No LLM with vision found; image summarization will be disabled"
-                    )
-            except Exception as e:
-                logger.warning(
-                    f"Failed to initialize vision LLM due to an error: {str(e)}. "
-                    "Image summarization will be disabled."
-                )
-                self.image_analysis_llm = None
diff --git a/backend/onyx/connectors/web/connector.py b/backend/onyx/connectors/web/connector.py
index f92ed3710..1209915a1 100644
--- a/backend/onyx/connectors/web/connector.py
+++ b/backend/onyx/connectors/web/connector.py
@@ -32,7 +32,7 @@ from onyx.connectors.exceptions import UnexpectedValidationError
 from onyx.connectors.interfaces import GenerateDocumentsOutput
 from onyx.connectors.interfaces import LoadConnector
 from onyx.connectors.models import Document
-from onyx.connectors.models import Section
+from onyx.connectors.models import TextSection
 from onyx.file_processing.extract_file_text import read_pdf_file
 from onyx.file_processing.html_utils import web_html_cleanup
 from onyx.utils.logger import setup_logger
@@ -341,7 +341,7 @@ class WebConnector(LoadConnector):
                 doc_batch.append(
                     Document(
                         id=initial_url,
-                        sections=[Section(link=initial_url, text=page_text)],
+                        sections=[TextSection(link=initial_url, text=page_text)],
                         source=DocumentSource.WEB,
                         semantic_identifier=initial_url.split("/")[-1],
                         metadata=metadata,
@@ -443,7 +443,7 @@ class WebConnector(LoadConnector):
                     Document(
                         id=initial_url,
                         sections=[
-                            Section(link=initial_url, text=parsed_html.cleaned_text)
+                            TextSection(link=initial_url, text=parsed_html.cleaned_text)
                         ],
                         source=DocumentSource.WEB,
                         semantic_identifier=parsed_html.title or initial_url,
diff --git a/backend/onyx/connectors/xenforo/connector.py b/backend/onyx/connectors/xenforo/connector.py
index f460ab958..6eb3866ec 100644
--- a/backend/onyx/connectors/xenforo/connector.py
+++ b/backend/onyx/connectors/xenforo/connector.py
@@ -28,7 +28,7 @@ from onyx.connectors.interfaces import GenerateDocumentsOutput
 from onyx.connectors.interfaces import LoadConnector
 from onyx.connectors.models import BasicExpertInfo
 from onyx.connectors.models import Document
-from onyx.connectors.models import Section
+from onyx.connectors.models import TextSection
 from onyx.utils.logger import setup_logger
 
 logger = setup_logger()
@@ -104,7 +104,7 @@ def scrape_page_posts(
         # id. We may want to de-dupe this stuff inside the indexing service.
         document = Document(
             id=f"{DocumentSource.XENFORO.value}_{title}_{page_index}_{formatted_time}",
-            sections=[Section(link=url, text=post_text)],
+            sections=[TextSection(link=url, text=post_text)],
             title=title,
             source=DocumentSource.XENFORO,
             semantic_identifier=title,
diff --git a/backend/onyx/connectors/zendesk/connector.py b/backend/onyx/connectors/zendesk/connector.py
index c4d6fb092..9c37c242c 100644
--- a/backend/onyx/connectors/zendesk/connector.py
+++ b/backend/onyx/connectors/zendesk/connector.py
@@ -1,5 +1,6 @@ from collections.abc import Iterator
 from typing import Any
+from typing import cast
 
 import requests
@@ -17,8 +18,8 @@ from onyx.connectors.interfaces import SecondsSinceUnixEpoch
 from onyx.connectors.interfaces import SlimConnector
 from onyx.connectors.models import BasicExpertInfo
 from onyx.connectors.models import Document
-from onyx.connectors.models import Section
 from onyx.connectors.models import SlimDocument
+from onyx.connectors.models import TextSection
 from onyx.file_processing.html_utils import parse_html_page_basic
 from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface
 from onyx.utils.retry_wrapper import retry_builder
@@ -168,8 +169,8 @@ def _article_to_document(
     return new_author_mapping, Document(
         id=f"article:{article['id']}",
         sections=[
-            Section(
-                link=article.get("html_url"),
+            TextSection(
+                link=cast(str, article.get("html_url")),
                 text=parse_html_page_basic(article["body"]),
             )
         ],
@@ -268,7 +269,7 @@ def _ticket_to_document(
     return new_author_mapping, Document(
         id=f"zendesk_ticket_{ticket['id']}",
-        sections=[Section(link=ticket_display_url, text=full_text)],
+        sections=[TextSection(link=ticket_display_url, text=full_text)],
         source=DocumentSource.ZENDESK,
         semantic_identifier=f"Ticket #{ticket['id']}: {subject or 'No Subject'}",
         doc_updated_at=update_time,
diff --git a/backend/onyx/connectors/zulip/connector.py b/backend/onyx/connectors/zulip/connector.py
index 700a3962c..92a44293b 100644
--- a/backend/onyx/connectors/zulip/connector.py
+++ b/backend/onyx/connectors/zulip/connector.py
@@ -20,7 +20,7 @@ from onyx.connectors.interfaces import PollConnector
 from onyx.connectors.interfaces import SecondsSinceUnixEpoch
 from onyx.connectors.models import ConnectorMissingCredentialError
 from onyx.connectors.models import Document
-from onyx.connectors.models import Section
+from onyx.connectors.models import TextSection
 from onyx.connectors.zulip.schemas import GetMessagesResponse
 from onyx.connectors.zulip.schemas import Message
 from onyx.connectors.zulip.utils import build_search_narrow
@@ -161,7 +161,7 @@ class ZulipConnector(LoadConnector, PollConnector):
         return Document(
             id=f"{message.stream_id}__{message.id}",
             sections=[
-                Section(
+                TextSection(
                     link=self._message_to_narrow_link(message),
                     text=text,
                 )
diff --git a/backend/onyx/db/pg_file_store.py b/backend/onyx/db/pg_file_store.py
index 5d31358b4..7096d92c5 100644
--- a/backend/onyx/db/pg_file_store.py
+++ b/backend/onyx/db/pg_file_store.py
@@ -67,6 +67,9 @@ def read_lobj(
     use_tempfile: bool = False,
 ) -> IO:
     pg_conn = get_pg_conn_from_session(db_session)
+    # Ensure we're using binary mode by default for large objects
+    if mode is None:
+        mode = "rb"
     large_object = (
         pg_conn.lobject(lobj_oid, mode=mode) if mode else pg_conn.lobject(lobj_oid)
     )
@@ -81,6 +84,7 @@ def read_lobj(
         temp_file.seek(0)
         return temp_file
     else:
+        # Ensure we're getting raw bytes without text decoding
         return BytesIO(large_object.read())
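
The `mode = "rb"` default added to read_lobj matters once raw image bytes live in the large-object store: a text-mode read would attempt decoding and corrupt binary payloads. A sketch of the read path the indexing pipeline now relies on (the stored file name is illustrative):

    from onyx.db.engine import get_session_with_current_tenant
    from onyx.db.pg_file_store import get_pgfilestore_by_file_name, read_lobj

    with get_session_with_current_tenant() as db_session:
        record = get_pgfilestore_by_file_name(
            file_name="some_stored_image", db_session=db_session
        )
        if record is not None:
            # mode="rb" guarantees undecoded bytes suitable for an image payload
            image_bytes = read_lobj(record.lobj_oid, db_session, mode="rb").read()
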
diff --git a/backend/onyx/file_processing/image_utils.py b/backend/onyx/file_processing/image_utils.py
index 3ae48df89..25885a217 100644
--- a/backend/onyx/file_processing/image_utils.py
+++ b/backend/onyx/file_processing/image_utils.py
@@ -2,12 +2,9 @@ from typing import Tuple
 
 from sqlalchemy.orm import Session
 
-from onyx.configs.app_configs import CONTINUE_ON_CONNECTOR_FAILURE
 from onyx.configs.constants import FileOrigin
-from onyx.connectors.models import Section
+from onyx.connectors.models import ImageSection
 from onyx.db.pg_file_store import save_bytes_to_pgfilestore
-from onyx.file_processing.image_summarization import summarize_image_with_error_handling
-from onyx.llm.interfaces import LLM
 from onyx.utils.logger import setup_logger
 
 logger = setup_logger()
@@ -18,12 +15,12 @@ def store_image_and_create_section(
     image_data: bytes,
     file_name: str,
     display_name: str,
-    media_type: str = "image/unknown",
-    llm: LLM | None = None,
+    link: str | None = None,
+    media_type: str = "application/octet-stream",
     file_origin: FileOrigin = FileOrigin.OTHER,
-) -> Tuple[Section, str | None]:
+) -> Tuple[ImageSection, str | None]:
     """
-    Stores an image in PGFileStore and creates a Section object with optional summarization.
+    Stores an image in PGFileStore and creates an ImageSection object without summarization.
 
     Args:
         db_session: Database session
@@ -31,12 +28,11 @@ def store_image_and_create_section(
         file_name: Base identifier for the file
         display_name: Human-readable name for the image
         media_type: MIME type of the image
-        llm: Optional LLM with vision capabilities for summarization
         file_origin: Origin of the file (e.g., CONFLUENCE, GOOGLE_DRIVE, etc.)
 
     Returns:
         Tuple containing:
-            - Section object with image reference and optional summary text
+            - ImageSection object with image reference
            - The file_name in PGFileStore or None if storage failed
     """
     # Storage logic
@@ -53,18 +49,10 @@ def store_image_and_create_section(
         stored_file_name = pgfilestore.file_name
     except Exception as e:
         logger.error(f"Failed to store image: {e}")
-        if not CONTINUE_ON_CONNECTOR_FAILURE:
-            raise
-        return Section(text=""), None
-
-    # Summarization logic
-    summary_text = ""
-    if llm:
-        summary_text = (
-            summarize_image_with_error_handling(llm, image_data, display_name) or ""
-        )
+        raise e
 
+    # Create an ImageSection with empty text (will be filled by LLM later in the pipeline)
     return (
-        Section(text=summary_text, image_file_name=stored_file_name),
+        ImageSection(image_file_name=stored_file_name, link=link),
         stored_file_name,
     )
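
Usage sketch for the slimmed-down helper above, as a connector would call it (identifiers illustrative; note there is no `llm` argument anymore, since summarization moved into the indexing pipeline):

    from onyx.configs.constants import FileOrigin
    from onyx.db.engine import get_session_with_current_tenant
    from onyx.file_processing.image_utils import store_image_and_create_section

    image_bytes = b""  # raw bytes the connector fetched from its source

    with get_session_with_current_tenant() as db_session:
        section, stored_file_name = store_image_and_create_section(
            db_session=db_session,
            image_data=image_bytes,
            file_name="my_connector__doc_1_img_0",
            display_name="architecture-diagram.png",
            media_type="image/png",
            file_origin=FileOrigin.CONNECTOR,
            link="https://example.com/source-page",
        )
        # section is an ImageSection with text unset; the pipeline fills it in

One behavioral change worth flagging: the helper now re-raises on storage failure instead of consulting CONTINUE_ON_CONNECTOR_FAILURE, so callers wanting the old tolerant behavior must catch the exception themselves, as the Google Drive connector above does.
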
accordingly. """ merged_chunk = DocAwareChunk( @@ -98,7 +99,7 @@ def _combine_chunks(chunks: list[DocAwareChunk], large_chunk_id: int) -> DocAwar def generate_large_chunks(chunks: list[DocAwareChunk]) -> list[DocAwareChunk]: """ - Generates larger “grouped” chunks by combining sets of smaller chunks. + Generates larger "grouped" chunks by combining sets of smaller chunks. """ large_chunks = [] for idx, i in enumerate(range(0, len(chunks), LARGE_CHUNK_RATIO)): @@ -185,7 +186,7 @@ class Chunker: def _get_mini_chunk_texts(self, chunk_text: str) -> list[str] | None: """ - For “multipass” mode: additional sub-chunks (mini-chunks) for use in certain embeddings. + For "multipass" mode: additional sub-chunks (mini-chunks) for use in certain embeddings. """ if self.mini_chunk_splitter and chunk_text.strip(): return self.mini_chunk_splitter.split_text(chunk_text) @@ -194,7 +195,7 @@ class Chunker: # ADDED: extra param image_url to store in the chunk def _create_chunk( self, - document: Document, + document: IndexingDocument, chunks_list: list[DocAwareChunk], text: str, links: dict[int, str], @@ -225,7 +226,29 @@ class Chunker: def _chunk_document( self, - document: Document, + document: IndexingDocument, + title_prefix: str, + metadata_suffix_semantic: str, + metadata_suffix_keyword: str, + content_token_limit: int, + ) -> list[DocAwareChunk]: + """ + Legacy method for backward compatibility. + Calls _chunk_document_with_sections with document.sections. + """ + return self._chunk_document_with_sections( + document, + document.processed_sections, + title_prefix, + metadata_suffix_semantic, + metadata_suffix_keyword, + content_token_limit, + ) + + def _chunk_document_with_sections( + self, + document: IndexingDocument, + sections: list[Section], title_prefix: str, metadata_suffix_semantic: str, metadata_suffix_keyword: str, @@ -233,17 +256,16 @@ class Chunker: ) -> list[DocAwareChunk]: """ Loops through sections of the document, converting them into one or more chunks. - If a section has an image_link, we treat it as a dedicated chunk. + Works with processed sections that are base Section objects. 
""" - chunks: list[DocAwareChunk] = [] link_offsets: dict[int, str] = {} chunk_text = "" - for section_idx, section in enumerate(document.sections): - section_text = clean_text(section.text) + for section_idx, section in enumerate(sections): + # Get section text and other attributes + section_text = clean_text(section.text or "") section_link_text = section.link or "" - # ADDED: if the Section has an image link image_url = section.image_file_name # If there is no useful content, skip @@ -254,7 +276,7 @@ class Chunker: ) continue - # CASE 1: If this is an image section, force a separate chunk + # CASE 1: If this section has an image, force a separate chunk if image_url: # First, if we have any partially built text chunk, finalize it if chunk_text.strip(): @@ -271,15 +293,13 @@ class Chunker: chunk_text = "" link_offsets = {} - # Create a chunk specifically for this image - # (If the section has text describing the image, use that as content) + # Create a chunk specifically for this image section + # (Using the text summary that was generated during processing) self._create_chunk( document, chunks, section_text, - links={0: section_link_text} - if section_link_text - else {}, # No text offsets needed for images + links={0: section_link_text} if section_link_text else {}, image_file_name=image_url, title_prefix=title_prefix, metadata_suffix_semantic=metadata_suffix_semantic, @@ -384,7 +404,9 @@ class Chunker: ) return chunks - def _handle_single_document(self, document: Document) -> list[DocAwareChunk]: + def _handle_single_document( + self, document: IndexingDocument + ) -> list[DocAwareChunk]: # Specifically for reproducing an issue with gmail if document.source == DocumentSource.GMAIL: logger.debug(f"Chunking {document.semantic_identifier}") @@ -420,26 +442,31 @@ class Chunker: title_prefix = "" metadata_suffix_semantic = "" - # Chunk the document - normal_chunks = self._chunk_document( + # Use processed_sections if available (IndexingDocument), otherwise use original sections + sections_to_chunk = document.processed_sections + + normal_chunks = self._chunk_document_with_sections( document, + sections_to_chunk, title_prefix, metadata_suffix_semantic, metadata_suffix_keyword, content_token_limit, ) - # Optional “multipass” large chunk creation + # Optional "multipass" large chunk creation if self.enable_multipass and self.enable_large_chunks: large_chunks = generate_large_chunks(normal_chunks) normal_chunks.extend(large_chunks) return normal_chunks - def chunk(self, documents: list[Document]) -> list[DocAwareChunk]: + def chunk(self, documents: list[IndexingDocument]) -> list[DocAwareChunk]: """ Takes in a list of documents and chunks them into smaller chunks for indexing while persisting the document metadata. + + Works with both standard Document objects and IndexingDocument objects with processed_sections. 
""" final_chunks: list[DocAwareChunk] = [] for document in documents: diff --git a/backend/onyx/indexing/indexing_pipeline.py b/backend/onyx/indexing/indexing_pipeline.py index 9da1c58af..f4a6e0075 100644 --- a/backend/onyx/indexing/indexing_pipeline.py +++ b/backend/onyx/indexing/indexing_pipeline.py @@ -10,13 +10,18 @@ from onyx.access.access import get_access_for_documents from onyx.access.models import DocumentAccess from onyx.configs.app_configs import MAX_DOCUMENT_CHARS from onyx.configs.constants import DEFAULT_BOOST +from onyx.configs.llm_configs import get_image_extraction_and_analysis_enabled from onyx.connectors.cross_connector_utils.miscellaneous_utils import ( get_experts_stores_representations, ) from onyx.connectors.models import ConnectorFailure from onyx.connectors.models import Document from onyx.connectors.models import DocumentFailure +from onyx.connectors.models import ImageSection from onyx.connectors.models import IndexAttemptMetadata +from onyx.connectors.models import IndexingDocument +from onyx.connectors.models import Section +from onyx.connectors.models import TextSection from onyx.db.document import fetch_chunk_counts_for_documents from onyx.db.document import get_documents_by_ids from onyx.db.document import mark_document_as_indexed_for_cc_pair__no_commit @@ -27,7 +32,10 @@ from onyx.db.document import update_docs_updated_at__no_commit from onyx.db.document import upsert_document_by_connector_credential_pair from onyx.db.document import upsert_documents from onyx.db.document_set import fetch_document_sets_for_documents +from onyx.db.engine import get_session_with_current_tenant from onyx.db.models import Document as DBDocument +from onyx.db.pg_file_store import get_pgfilestore_by_file_name +from onyx.db.pg_file_store import read_lobj from onyx.db.search_settings import get_current_search_settings from onyx.db.tag import create_or_add_document_tag from onyx.db.tag import create_or_add_document_tag_list @@ -37,6 +45,7 @@ from onyx.document_index.document_index_utils import ( from onyx.document_index.interfaces import DocumentIndex from onyx.document_index.interfaces import DocumentMetadata from onyx.document_index.interfaces import IndexBatchParams +from onyx.file_processing.image_summarization import summarize_image_with_error_handling from onyx.indexing.chunker import Chunker from onyx.indexing.embedder import embed_chunks_with_failure_handling from onyx.indexing.embedder import IndexingEmbedder @@ -44,6 +53,7 @@ from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface from onyx.indexing.models import DocAwareChunk from onyx.indexing.models import DocMetadataAwareIndexChunk from onyx.indexing.vector_db_insertion import write_chunks_to_vector_db_with_backoff +from onyx.llm.factory import get_default_llm_with_vision from onyx.utils.logger import setup_logger from onyx.utils.timing import log_function_time @@ -53,6 +63,7 @@ logger = setup_logger() class DocumentBatchPrepareContext(BaseModel): updatable_docs: list[Document] id_to_db_doc_map: dict[str, DBDocument] + indexable_docs: list[IndexingDocument] = [] model_config = ConfigDict(arbitrary_types_allowed=True) @@ -265,7 +276,12 @@ def index_doc_batch_prepare( def filter_documents(document_batch: list[Document]) -> list[Document]: documents: list[Document] = [] for document in document_batch: - empty_contents = not any(section.text.strip() for section in document.sections) + empty_contents = not any( + isinstance(section, TextSection) + and section.text is not None + and section.text.strip() + for 
diff --git a/backend/onyx/indexing/indexing_pipeline.py b/backend/onyx/indexing/indexing_pipeline.py
index 9da1c58af..f4a6e0075 100644
--- a/backend/onyx/indexing/indexing_pipeline.py
+++ b/backend/onyx/indexing/indexing_pipeline.py
@@ -10,13 +10,18 @@ from onyx.access.access import get_access_for_documents
 from onyx.access.models import DocumentAccess
 from onyx.configs.app_configs import MAX_DOCUMENT_CHARS
 from onyx.configs.constants import DEFAULT_BOOST
+from onyx.configs.llm_configs import get_image_extraction_and_analysis_enabled
 from onyx.connectors.cross_connector_utils.miscellaneous_utils import (
     get_experts_stores_representations,
 )
 from onyx.connectors.models import ConnectorFailure
 from onyx.connectors.models import Document
 from onyx.connectors.models import DocumentFailure
+from onyx.connectors.models import ImageSection
 from onyx.connectors.models import IndexAttemptMetadata
+from onyx.connectors.models import IndexingDocument
+from onyx.connectors.models import Section
+from onyx.connectors.models import TextSection
 from onyx.db.document import fetch_chunk_counts_for_documents
 from onyx.db.document import get_documents_by_ids
 from onyx.db.document import mark_document_as_indexed_for_cc_pair__no_commit
@@ -27,7 +32,10 @@ from onyx.db.document import update_docs_updated_at__no_commit
 from onyx.db.document import upsert_document_by_connector_credential_pair
 from onyx.db.document import upsert_documents
 from onyx.db.document_set import fetch_document_sets_for_documents
+from onyx.db.engine import get_session_with_current_tenant
 from onyx.db.models import Document as DBDocument
+from onyx.db.pg_file_store import get_pgfilestore_by_file_name
+from onyx.db.pg_file_store import read_lobj
 from onyx.db.search_settings import get_current_search_settings
 from onyx.db.tag import create_or_add_document_tag
 from onyx.db.tag import create_or_add_document_tag_list
@@ -37,6 +45,7 @@ from onyx.document_index.document_index_utils import (
 from onyx.document_index.interfaces import DocumentIndex
 from onyx.document_index.interfaces import DocumentMetadata
 from onyx.document_index.interfaces import IndexBatchParams
+from onyx.file_processing.image_summarization import summarize_image_with_error_handling
 from onyx.indexing.chunker import Chunker
 from onyx.indexing.embedder import embed_chunks_with_failure_handling
 from onyx.indexing.embedder import IndexingEmbedder
@@ -44,6 +53,7 @@ from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface
 from onyx.indexing.models import DocAwareChunk
 from onyx.indexing.models import DocMetadataAwareIndexChunk
 from onyx.indexing.vector_db_insertion import write_chunks_to_vector_db_with_backoff
+from onyx.llm.factory import get_default_llm_with_vision
 from onyx.utils.logger import setup_logger
 from onyx.utils.timing import log_function_time
 
@@ -53,6 +63,7 @@ logger = setup_logger()
 class DocumentBatchPrepareContext(BaseModel):
     updatable_docs: list[Document]
     id_to_db_doc_map: dict[str, DBDocument]
+    indexable_docs: list[IndexingDocument] = []
     model_config = ConfigDict(arbitrary_types_allowed=True)
@@ -265,7 +276,12 @@ def index_doc_batch_prepare(
 def filter_documents(document_batch: list[Document]) -> list[Document]:
     documents: list[Document] = []
     for document in document_batch:
-        empty_contents = not any(section.text.strip() for section in document.sections)
+        empty_contents = not any(
+            isinstance(section, TextSection)
+            and section.text is not None
+            and section.text.strip()
+            for section in document.sections
+        )
         if (
             (not document.title or not document.title.strip())
             and not document.semantic_identifier.strip()
@@ -288,7 +304,12 @@ def filter_documents(document_batch: list[Document]) -> list[Document]:
             )
             continue
 
-        section_chars = sum(len(section.text) for section in document.sections)
+        section_chars = sum(
+            len(section.text)
+            if isinstance(section, TextSection) and section.text is not None
+            else 0
+            for section in document.sections
+        )
         if (
             MAX_DOCUMENT_CHARS
             and len(document.title or document.semantic_identifier) + section_chars
@@ -308,6 +329,121 @@ def filter_documents(document_batch: list[Document]) -> list[Document]:
     return documents
 
 
+def process_image_sections(documents: list[Document]) -> list[IndexingDocument]:
+    """
+    Process all sections in documents by:
+    1. Converting both TextSection and ImageSection objects to base Section objects
+    2. Processing ImageSections to generate text summaries using a vision-capable LLM
+    3. Returning IndexingDocument objects with both original and processed sections
+
+    Args:
+        documents: List of documents with TextSection | ImageSection objects
+
+    Returns:
+        List of IndexingDocument objects with processed_sections as list[Section]
+    """
+    # Check if image extraction and analysis is enabled before trying to get a vision LLM
+    if not get_image_extraction_and_analysis_enabled():
+        llm = None
+    else:
+        # Only get the vision LLM if image processing is enabled
+        llm = get_default_llm_with_vision()
+
+    if not llm:
+        logger.warning(
+            "No vision-capable LLM available. Image sections will not be processed."
+        )
+
+        # Even without LLM, we still convert to IndexingDocument with base Sections
+        return [
+            IndexingDocument(
+                **document.dict(),
+                processed_sections=[
+                    Section(
+                        text=section.text if isinstance(section, TextSection) else None,
+                        link=section.link,
+                        image_file_name=section.image_file_name
+                        if isinstance(section, ImageSection)
+                        else None,
+                    )
+                    for section in document.sections
+                ],
+            )
+            for document in documents
+        ]
+
+    indexed_documents: list[IndexingDocument] = []
+
+    for document in documents:
+        processed_sections: list[Section] = []
+
+        for section in document.sections:
+            # For ImageSection, process and create base Section with both text and image_file_name
+            if isinstance(section, ImageSection):
+                # Default section with image path preserved
+                processed_section = Section(
+                    link=section.link,
+                    image_file_name=section.image_file_name,
+                    text=None,  # Will be populated if summarization succeeds
+                )
+
+                # Try to get image summary
+                try:
+                    with get_session_with_current_tenant() as db_session:
+                        pgfilestore = get_pgfilestore_by_file_name(
+                            file_name=section.image_file_name, db_session=db_session
+                        )
+
+                        if not pgfilestore:
+                            logger.warning(
+                                f"Image file {section.image_file_name} not found in PGFileStore"
+                            )
+
+                            processed_section.text = "[Image could not be processed]"
+                        else:
+                            # Get the image data
+                            image_data_io = read_lobj(
+                                pgfilestore.lobj_oid, db_session, mode="rb"
+                            )
+                            pgfilestore_data = image_data_io.read()
+                            summary = summarize_image_with_error_handling(
+                                llm=llm,
+                                image_data=pgfilestore_data,
+                                context_name=pgfilestore.display_name or "Image",
+                            )
+
+                            if summary:
+                                processed_section.text = summary
+                            else:
+                                processed_section.text = (
+                                    "[Image could not be summarized]"
+                                )
+                except Exception as e:
+                    logger.error(f"Error processing image section: {e}")
+                    processed_section.text = "[Error processing image]"
+
+                processed_sections.append(processed_section)
+            # For TextSection, create a base Section with text and link
+            elif isinstance(section, TextSection):
+                processed_section = Section(
+                    text=section.text, link=section.link, image_file_name=None
+                )
+                processed_sections.append(processed_section)
+
+            # If it's already a base Section (unlikely), just append it
+            else:
+                processed_sections.append(section)
+
+        # Create IndexingDocument with original sections and processed_sections
+        indexed_document = IndexingDocument(
+            **document.dict(), processed_sections=processed_sections
+        )
+        indexed_documents.append(indexed_document)
+
+    return indexed_documents
+
+
 @log_function_time(debug_only=True)
 def index_doc_batch(
     *,
@@ -362,19 +498,23 @@ def index_doc_batch(
             failures=[],
         )
 
+    # Convert documents to IndexingDocument objects with processed sections
+    # logger.debug("Processing image sections")
+    ctx.indexable_docs = process_image_sections(ctx.updatable_docs)
+
     doc_descriptors = [
         {
             "doc_id": doc.id,
             "doc_length": doc.get_total_char_length(),
         }
-        for doc in ctx.updatable_docs
+        for doc in ctx.indexable_docs
     ]
     logger.debug(f"Starting indexing process for documents: {doc_descriptors}")
 
     logger.debug("Starting chunking")
     # NOTE: no special handling for failures here, since the chunker is not
     # a common source of failure for the indexing pipeline
-    chunks: list[DocAwareChunk] = chunker.chunk(ctx.updatable_docs)
+    chunks: list[DocAwareChunk] = chunker.chunk(ctx.indexable_docs)
 
     logger.debug("Starting embedding")
     chunks_with_embeddings, embedding_failures = (
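
The deleted VisionEnabledConnector mixin (earlier in this diff) and process_image_sections above are two halves of one refactor: connectors no longer hold a vision LLM; they just emit ImageSection references, and the pipeline summarizes once, centrally, at indexing time. The no-LLM fallback branch reduces to a mechanical flattening, restated here as a minimal sketch (a condensed reading of the function above, assuming the models from this PR):

    from onyx.connectors.models import Document, IndexingDocument, Section, TextSection


    def to_indexing_document_without_vision(document: Document) -> IndexingDocument:
        # Condensed version of the fallback in process_image_sections: every
        # section becomes a base Section; image sections keep their stored-file
        # reference but get no summary text, so their chunks carry only the
        # image reference.
        return IndexingDocument(
            **document.dict(),
            processed_sections=[
                Section(
                    text=section.text if isinstance(section, TextSection) else None,
                    link=section.link,
                    image_file_name=section.image_file_name,
                )
                for section in document.sections
            ],
        )

One consequence of centralizing: get_default_llm_with_vision is resolved once per document batch rather than once per connector instance, and the cost of image summarization moves from connector load/poll time to indexing time.
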
diff --git a/backend/onyx/seeding/load_docs.py b/backend/onyx/seeding/load_docs.py
index b415c2920..44a6a6112 100644
--- a/backend/onyx/seeding/load_docs.py
+++ b/backend/onyx/seeding/load_docs.py
@@ -15,7 +15,7 @@ from onyx.configs.model_configs import DEFAULT_DOCUMENT_ENCODER_MODEL
 from onyx.connectors.models import Document
 from onyx.connectors.models import IndexAttemptMetadata
 from onyx.connectors.models import InputType
-from onyx.connectors.models import Section
+from onyx.connectors.models import TextSection
 from onyx.db.connector import check_connectors_exist
 from onyx.db.connector import create_connector
 from onyx.db.connector_credential_pair import add_credential_to_connector
@@ -55,7 +55,7 @@ def _create_indexable_chunks(
             # The section is not really used past this point since we have already done the other processing
             # for the chunking and embedding.
             sections=[
-                Section(
+                TextSection(
                     text=preprocessed_doc["content"],
                     link=preprocessed_doc["url"],
                     image_file_name=None,
diff --git a/backend/tests/daily/connectors/airtable/test_airtable_basic.py b/backend/tests/daily/connectors/airtable/test_airtable_basic.py
index 788543fe0..5a5719893 100644
--- a/backend/tests/daily/connectors/airtable/test_airtable_basic.py
+++ b/backend/tests/daily/connectors/airtable/test_airtable_basic.py
@@ -1,4 +1,5 @@ import os
+from typing import cast
 from unittest.mock import MagicMock
 
 import pytest
@@ -7,7 +8,8 @@ from pydantic import BaseModel
 from onyx.configs.constants import DocumentSource
 from onyx.connectors.airtable.airtable_connector import AirtableConnector
 from onyx.connectors.models import Document
-from onyx.connectors.models import Section
+from onyx.connectors.models import ImageSection
+from onyx.connectors.models import TextSection
 
 BASE_VIEW_ID = "viwVUEJjWPd8XYjh8"
@@ -76,11 +78,11 @@ def create_test_document(
     if not all_fields_as_metadata:
         sections.extend(
             [
-                Section(
+                TextSection(
                     text=f"Title:\n------------------------\n{title}\n------------------------",
                     link=f"{link_base}/{id}",
                 ),
-                Section(
+                TextSection(
                     text=f"Description:\n------------------------\n{description}\n------------------------",
                     link=f"{link_base}/{id}",
                 ),
@@ -90,7 +92,7 @@ def create_test_document(
     if attachments:
         for attachment_text, attachment_link in attachments:
             sections.append(
-                Section(
+                TextSection(
                     text=f"Attachment:\n------------------------\n{attachment_text}\n------------------------",
                     link=attachment_link,
                 ),
@@ -122,7 +124,7 @@ def create_test_document(
 
     return Document(
         id=f"airtable__{id}",
-        sections=sections,
+        sections=cast(list[TextSection | ImageSection], sections),
         source=DocumentSource.AIRTABLE,
         semantic_identifier=f"{os.environ.get('AIRTABLE_TEST_TABLE_NAME', '')}: {title}",
         metadata=metadata,
diff --git a/backend/tests/daily/connectors/gitbook/test_gitbook_connector.py b/backend/tests/daily/connectors/gitbook/test_gitbook_connector.py
index 7c998c4f1..162b56062 100644
--- a/backend/tests/daily/connectors/gitbook/test_gitbook_connector.py
+++ b/backend/tests/daily/connectors/gitbook/test_gitbook_connector.py
@@ -49,6 +49,7 @@ def test_gitbook_connector_basic(gitbook_connector: GitbookConnector) -> None:
 
     # Content specific checks
     content = section.text
+    assert content is not None, "Section text should not be None"
 
     # Check for specific content elements
     assert "* Fruit Shopping List:" in content
@@ -82,6 +83,7 @@ def test_gitbook_connector_basic(gitbook_connector: GitbookConnector) -> None:
     assert nested1.semantic_identifier == "Nested1"
     assert len(nested1.sections) == 1
     # extra newlines at the end, remove them to make test easier
+    assert nested1.sections[0].text is not None
     assert nested1.sections[0].text.strip() == "nested1"
     assert nested1.source == DocumentSource.GITBOOK
@@ -89,6 +91,7 @@ def test_gitbook_connector_basic(gitbook_connector: GitbookConnector) -> None:
     assert nested2.id.startswith("gitbook-")
     assert nested2.semantic_identifier == "Nested2"
     assert len(nested2.sections) == 1
+    assert nested2.sections[0].text is not None
     assert nested2.sections[0].text.strip() == "nested2"
     assert nested2.source == DocumentSource.GITBOOK
b/backend/tests/daily/connectors/google_drive/consts_and_utils.py @@ -1,6 +1,7 @@ from collections.abc import Sequence from onyx.connectors.models import Document +from onyx.connectors.models import TextSection ALL_FILES = list(range(0, 60)) SHARED_DRIVE_FILES = list(range(20, 25)) @@ -177,7 +178,13 @@ def assert_retrieved_docs_match_expected( ) valid_retrieved_texts = set( [ - " - ".join([section.text for section in doc.sections]) + " - ".join( + [ + section.text + for section in doc.sections + if isinstance(section, TextSection) and section.text is not None + ] + ) for doc in valid_retrieved_docs ] ) diff --git a/backend/tests/daily/connectors/notion/test_notion_connector.py b/backend/tests/daily/connectors/notion/test_notion_connector.py index 32779c1b8..11b86bd48 100644 --- a/backend/tests/daily/connectors/notion/test_notion_connector.py +++ b/backend/tests/daily/connectors/notion/test_notion_connector.py @@ -97,6 +97,7 @@ def test_notion_connector_basic(notion_connector: NotionConnector) -> None: assert child2_section.link.startswith("https://www.notion.so/") # Database section checks for child2 + assert child2_db_section.text is not None assert child2_db_section.text.strip() != "" # Should contain some database content assert child2_db_section.link is not None assert child2_db_section.link.startswith("https://www.notion.so/") diff --git a/backend/tests/daily/connectors/sharepoint/test_sharepoint_connector.py b/backend/tests/daily/connectors/sharepoint/test_sharepoint_connector.py index e01e92666..a25c04973 100644 --- a/backend/tests/daily/connectors/sharepoint/test_sharepoint_connector.py +++ b/backend/tests/daily/connectors/sharepoint/test_sharepoint_connector.py @@ -60,6 +60,7 @@ def verify_document_content(doc: Document, expected: ExpectedDocument) -> None: """Verify a document matches its expected content.""" assert doc.semantic_identifier == expected.semantic_identifier assert len(doc.sections) == 1 + assert doc.sections[0].text is not None assert expected.content in doc.sections[0].text verify_document_metadata(doc) diff --git a/backend/tests/daily/connectors/slab/test_slab_connector.py b/backend/tests/daily/connectors/slab/test_slab_connector.py index e5722cdc5..702bc3e3d 100644 --- a/backend/tests/daily/connectors/slab/test_slab_connector.py +++ b/backend/tests/daily/connectors/slab/test_slab_connector.py @@ -63,6 +63,7 @@ def test_slab_connector_basic(slab_connector: SlabConnector) -> None: assert len(target_test_doc.sections) == 1 section = target_test_doc.sections[0] # Need to replace the weird apostrophe with a normal one + assert section.text is not None assert section.text.replace("\u2019", "'") == desired_test_data["section_text"] assert section.link == desired_test_data["link"] diff --git a/backend/tests/daily/connectors/web/test_web_connector.py b/backend/tests/daily/connectors/web/test_web_connector.py index 42f7a5f79..69012790f 100644 --- a/backend/tests/daily/connectors/web/test_web_connector.py +++ b/backend/tests/daily/connectors/web/test_web_connector.py @@ -32,6 +32,7 @@ def test_web_connector_scroll(web_connector: WebConnector) -> None: assert len(all_docs) == 1 doc = all_docs[0] + assert doc.sections[0].text is not None assert EXPECTED_QUOTE in doc.sections[0].text @@ -45,4 +46,5 @@ def test_web_connector_no_scroll(web_connector: WebConnector) -> None: assert len(all_docs) == 1 doc = all_docs[0] + assert doc.sections[0].text is not None assert EXPECTED_QUOTE not in doc.sections[0].text diff --git 
index 234968e96..d918d54e1 100644
--- a/backend/tests/integration/common_utils/test_document_utils.py
+++ b/backend/tests/integration/common_utils/test_document_utils.py
@@ -6,7 +6,7 @@ from onyx.configs.constants import DocumentSource
 from onyx.connectors.models import ConnectorFailure
 from onyx.connectors.models import Document
 from onyx.connectors.models import DocumentFailure
-from onyx.connectors.models import Section
+from onyx.connectors.models import TextSection
 
 
 def create_test_document(
@@ -28,7 +28,7 @@ def create_test_document(
     doc_id = doc_id or f"test-doc-{uuid.uuid4()}"
     return Document(
         id=doc_id,
-        sections=[Section(text=text, link=link)],
+        sections=[TextSection(text=text, link=link)],
         source=source,
         semantic_identifier=doc_id,
         doc_updated_at=datetime.now(timezone.utc),
diff --git a/backend/tests/unit/onyx/connectors/jira/test_large_ticket_handling.py b/backend/tests/unit/onyx/connectors/jira/test_large_ticket_handling.py
index 8f89a86c7..c8ae925ce 100644
--- a/backend/tests/unit/onyx/connectors/jira/test_large_ticket_handling.py
+++ b/backend/tests/unit/onyx/connectors/jira/test_large_ticket_handling.py
@@ -83,6 +83,7 @@ def test_fetch_jira_issues_batch_small_ticket(
 
     assert len(docs) == 1
     assert docs[0].id.endswith("/SMALL-1")
+    assert docs[0].sections[0].text is not None
     assert "Small description" in docs[0].sections[0].text
     assert "Small comment 1" in docs[0].sections[0].text
     assert "Small comment 2" in docs[0].sections[0].text
diff --git a/backend/tests/unit/onyx/connectors/mediawiki/test_wiki.py b/backend/tests/unit/onyx/connectors/mediawiki/test_wiki.py
index 6a8100883..9590cd4bb 100644
--- a/backend/tests/unit/onyx/connectors/mediawiki/test_wiki.py
+++ b/backend/tests/unit/onyx/connectors/mediawiki/test_wiki.py
@@ -99,7 +99,8 @@ def test_get_doc_from_page(site: pywikibot.Site) -> None:
         doc.sections, test_page._sections_helper + [test_page.header]
     ):
         assert (
-            section.text.strip() == expected_section.strip()
+            section.text is not None
+            and section.text.strip() == expected_section.strip()
         )  # Extra whitespace before/after is okay
         assert section.link and section.link.startswith(test_page.full_url())
     assert doc.semantic_identifier == test_page.title()
diff --git a/backend/tests/unit/onyx/indexing/test_chunker.py b/backend/tests/unit/onyx/indexing/test_chunker.py
index 612474eda..57ba3fe12 100644
--- a/backend/tests/unit/onyx/indexing/test_chunker.py
+++ b/backend/tests/unit/onyx/indexing/test_chunker.py
@@ -2,9 +2,10 @@ import pytest
 
 from onyx.configs.constants import DocumentSource
 from onyx.connectors.models import Document
-from onyx.connectors.models import Section
+from onyx.connectors.models import TextSection
 from onyx.indexing.chunker import Chunker
 from onyx.indexing.embedder import DefaultIndexingEmbedder
+from onyx.indexing.indexing_pipeline import process_image_sections
 from tests.unit.onyx.indexing.conftest import MockHeartbeat
 
 
@@ -35,19 +36,20 @@ def test_chunk_document(embedder: DefaultIndexingEmbedder) -> None:
         metadata={"tags": ["tag1", "tag2"]},
         doc_updated_at=None,
         sections=[
-            Section(text=short_section_1, link="link1"),
-            Section(text=short_section_2, link="link2"),
-            Section(text=long_section, link="link3"),
-            Section(text=short_section_3, link="link4"),
-            Section(text=short_section_4, link="link5"),
+            TextSection(text=short_section_1, link="link1"),
+            TextSection(text=short_section_2, link="link2"),
+            TextSection(text=long_section, link="link3"),
+            TextSection(text=short_section_3, link="link4"),
+            TextSection(text=short_section_4, link="link5"),
         ],
     )
+    indexing_documents = process_image_sections([document])
 
     chunker = Chunker(
         tokenizer=embedder.embedding_model.tokenizer,
         enable_multipass=False,
     )
-    chunks = chunker.chunk([document])
+    chunks = chunker.chunk(indexing_documents)
 
     assert len(chunks) == 5
     assert short_section_1 in chunks[0].content
@@ -67,9 +69,10 @@ def test_chunker_heartbeat(
         metadata={"tags": ["tag1", "tag2"]},
         doc_updated_at=None,
         sections=[
-            Section(text="This is a short section.", link="link1"),
+            TextSection(text="This is a short section.", link="link1"),
         ],
     )
+    indexing_documents = process_image_sections([document])
 
     chunker = Chunker(
         tokenizer=embedder.embedding_model.tokenizer,
@@ -77,7 +80,7 @@
         callback=mock_heartbeat,
     )
 
-    chunks = chunker.chunk([document])
+    chunks = chunker.chunk(indexing_documents)
 
     assert mock_heartbeat.call_count == 1
     assert len(chunks) > 0
diff --git a/backend/tests/unit/onyx/indexing/test_embedder.py b/backend/tests/unit/onyx/indexing/test_embedder.py
index 7585f7a1d..d8589ecff 100644
--- a/backend/tests/unit/onyx/indexing/test_embedder.py
+++ b/backend/tests/unit/onyx/indexing/test_embedder.py
@@ -6,7 +6,7 @@ import pytest
 
 from onyx.configs.constants import DocumentSource
 from onyx.connectors.models import Document
-from onyx.connectors.models import Section
+from onyx.connectors.models import TextSection
 from onyx.indexing.embedder import DefaultIndexingEmbedder
 from onyx.indexing.models import ChunkEmbedding
 from onyx.indexing.models import DocAwareChunk
@@ -45,7 +45,7 @@ def test_default_indexing_embedder_embed_chunks(mock_embedding_model: Mock) -> N
         metadata={"tags": ["tag1", "tag2"]},
         doc_updated_at=None,
         sections=[
-            Section(text="This is a short section.", link="link1"),
+            TextSection(text="This is a short section.", link="link1"),
         ],
     )
     chunks: list[DocAwareChunk] = [
diff --git a/backend/tests/unit/onyx/indexing/test_indexing_pipeline.py b/backend/tests/unit/onyx/indexing/test_indexing_pipeline.py
index 908111e72..1a4ab701d 100644
--- a/backend/tests/unit/onyx/indexing/test_indexing_pipeline.py
+++ b/backend/tests/unit/onyx/indexing/test_indexing_pipeline.py
@@ -1,9 +1,11 @@
+from typing import cast
 from typing import List
 
 from onyx.configs.app_configs import MAX_DOCUMENT_CHARS
 from onyx.connectors.models import Document
 from onyx.connectors.models import DocumentSource
-from onyx.connectors.models import Section
+from onyx.connectors.models import ImageSection
+from onyx.connectors.models import TextSection
 from onyx.indexing.indexing_pipeline import filter_documents
 
 
@@ -11,15 +13,15 @@ def create_test_document(
     doc_id: str = "test_id",
     title: str | None = "Test Title",
     semantic_id: str = "test_semantic_id",
-    sections: List[Section] | None = None,
+    sections: List[TextSection] | None = None,
 ) -> Document:
     if sections is None:
-        sections = [Section(text="Test content", link="test_link")]
+        sections = [TextSection(text="Test content", link="test_link")]
     return Document(
         id=doc_id,
         title=title,
         semantic_identifier=semantic_id,
-        sections=sections,
+        sections=cast(list[TextSection | ImageSection], sections),
         source=DocumentSource.FILE,
         metadata={},
     )
@@ -27,7 +29,7 @@ def create_test_document(
 
 def test_filter_documents_empty_title_and_content() -> None:
     doc = create_test_document(
-        title="", semantic_id="", sections=[Section(text="", link="test_link")]
+        title="", semantic_id="", sections=[TextSection(text="", link="test_link")]
     )
     result = filter_documents([doc])
     assert len(result) == 0
@@ -35,7 +37,7 @@ def test_filter_documents_empty_title_and_content() -> None:
 
 def test_filter_documents_empty_title_with_content() -> None:
     doc = create_test_document(
-        title="", sections=[Section(text="Valid content", link="test_link")]
+        title="", sections=[TextSection(text="Valid content", link="test_link")]
     )
     result = filter_documents([doc])
     assert len(result) == 1
@@ -44,7 +46,7 @@ def test_filter_documents_empty_title_with_content() -> None:
 
 def test_filter_documents_empty_content_with_title() -> None:
     doc = create_test_document(
-        title="Valid Title", sections=[Section(text="", link="test_link")]
+        title="Valid Title", sections=[TextSection(text="", link="test_link")]
     )
     result = filter_documents([doc])
     assert len(result) == 1
@@ -55,14 +57,15 @@ def test_filter_documents_exceeding_max_chars() -> None:
     if not MAX_DOCUMENT_CHARS:  # Skip if no max chars configured
         return
     long_text = "a" * (MAX_DOCUMENT_CHARS + 1)
-    doc = create_test_document(sections=[Section(text=long_text, link="test_link")])
+    doc = create_test_document(sections=[TextSection(text=long_text, link="test_link")])
     result = filter_documents([doc])
    assert len(result) == 0
 
 
 def test_filter_documents_valid_document() -> None:
     doc = create_test_document(
-        title="Valid Title", sections=[Section(text="Valid content", link="test_link")]
+        title="Valid Title",
+        sections=[TextSection(text="Valid content", link="test_link")],
     )
     result = filter_documents([doc])
     assert len(result) == 1
@@ -72,7 +75,9 @@ def test_filter_documents_valid_document() -> None:
 
 def test_filter_documents_whitespace_only() -> None:
     doc = create_test_document(
-        title=" ", semantic_id=" ", sections=[Section(text=" ", link="test_link")]
+        title=" ",
+        semantic_id=" ",
+        sections=[TextSection(text=" ", link="test_link")],
     )
     result = filter_documents([doc])
     assert len(result) == 0
@@ -82,7 +87,7 @@ def test_filter_documents_semantic_id_no_title() -> None:
     doc = create_test_document(
         title=None,
         semantic_id="Valid Semantic ID",
-        sections=[Section(text="Valid content", link="test_link")],
+        sections=[TextSection(text="Valid content", link="test_link")],
     )
     result = filter_documents([doc])
     assert len(result) == 1
@@ -92,9 +97,9 @@ def test_filter_documents_semantic_id_no_title() -> None:
 def test_filter_documents_multiple_sections() -> None:
     doc = create_test_document(
         sections=[
-            Section(text="Content 1", link="test_link"),
-            Section(text="Content 2", link="test_link"),
-            Section(text="Content 3", link="test_link"),
+            TextSection(text="Content 1", link="test_link"),
+            TextSection(text="Content 2", link="test_link"),
+            TextSection(text="Content 3", link="test_link"),
         ]
     )
     result = filter_documents([doc])
@@ -106,7 +111,7 @@ def test_filter_documents_multiple_documents() -> None:
     docs = [
         create_test_document(doc_id="1", title="Title 1"),
         create_test_document(
-            doc_id="2", title="", sections=[Section(text="", link="test_link")]
+            doc_id="2", title="", sections=[TextSection(text="", link="test_link")]
         ),  # Should be filtered
         create_test_document(doc_id="3", title="Title 3"),
     ]
diff --git a/web/admin2_auth.json b/web/admin2_auth.json
index cfd81313b..ae36f884b 100644
--- a/web/admin2_auth.json
+++ b/web/admin2_auth.json
@@ -12,4 +12,4 @@
     }
   ],
   "origins": []
-}
\ No newline at end of file
+}