Mirror of https://github.com/danswer-ai/danswer.git

Commit f87e559cc4 (parent 5883336d5e)

Separate out indexing-time image analysis into new phase (#4228)

* Separate out indexing-time image analysis into new phase
* looking good
* k
* k
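The diff below migrates every connector from the old generic `Section` model to explicit `TextSection` and `ImageSection` models and removes the per-connector vision-LLM calls, so image analysis can run as its own indexing phase. As orientation for the hunks that follow, here is a minimal, self-contained sketch of the shape connectors emit after the change; the dataclasses are stand-ins mirroring the fields used in the diff, not the real `onyx.connectors.models` classes:

```python
from dataclasses import dataclass


# Stand-in models mirroring the fields used throughout the diff
# (the real classes live in onyx.connectors.models).
@dataclass
class TextSection:
    text: str
    link: str | None = None


@dataclass
class ImageSection:
    image_file_name: str
    link: str | None = None


# A connector now returns both kinds of sections; image summarization is
# deferred to a later indexing phase instead of happening inside the connector.
sections: list[TextSection | ImageSection] = [
    TextSection(text="Page body text", link="https://example.com/page"),
    ImageSection(
        image_file_name="stored_attachment_123",
        link="https://example.com/page#attachment-123",
    ),
]

for section in sections:
    if isinstance(section, TextSection):
        print("text:", section.text[:50])
    else:
        print("image to analyze later:", section.image_file_name)
```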
@@ -28,6 +28,7 @@ from onyx.connectors.models import ConnectorCheckpoint
from onyx.connectors.models import ConnectorFailure
from onyx.connectors.models import Document
from onyx.connectors.models import IndexAttemptMetadata
from onyx.connectors.models import TextSection
from onyx.db.connector_credential_pair import get_connector_credential_pair_from_id
from onyx.db.connector_credential_pair import get_last_successful_attempt_time
from onyx.db.connector_credential_pair import update_connector_credential_pair

@@ -154,14 +155,12 @@ def strip_null_characters(doc_batch: list[Document]) -> list[Document]:
)
for section in cleaned_doc.sections:
if section.link and "\x00" in section.link:
logger.warning(
f"NUL characters found in document link for document: {cleaned_doc.id}"
)
if section.link is not None:
section.link = section.link.replace("\x00", "")
# since text can be longer, just replace to avoid double scan
section.text = section.text.replace("\x00", "")
if isinstance(section, TextSection) and section.text is not None:
section.text = section.text.replace("\x00", "")
cleaned_batch.append(cleaned_doc)

@@ -479,7 +478,11 @@ def _run_indexing(
doc_size = 0
for section in doc.sections:
doc_size += len(section.text)
if (
isinstance(section, TextSection)
and section.text is not None
):
doc_size += len(section.text)
if doc_size > INDEXING_SIZE_WARNING_THRESHOLD:
logger.warning(
@@ -4,6 +4,7 @@ from concurrent.futures import Future
from concurrent.futures import ThreadPoolExecutor
from io import BytesIO
from typing import Any
from typing import cast
import requests
from pyairtable import Api as AirtableApi

@@ -16,7 +17,8 @@ from onyx.configs.constants import DocumentSource
from onyx.connectors.interfaces import GenerateDocumentsOutput
from onyx.connectors.interfaces import LoadConnector
from onyx.connectors.models import Document
from onyx.connectors.models import Section
from onyx.connectors.models import ImageSection
from onyx.connectors.models import TextSection
from onyx.file_processing.extract_file_text import extract_file_text
from onyx.file_processing.extract_file_text import get_file_ext
from onyx.utils.logger import setup_logger

@@ -267,7 +269,7 @@ class AirtableConnector(LoadConnector):
table_id: str,
view_id: str | None,
record_id: str,
) -> tuple[list[Section], dict[str, str | list[str]]]:
) -> tuple[list[TextSection], dict[str, str | list[str]]]:
"""
Process a single Airtable field and return sections or metadata.

@@ -305,7 +307,7 @@ class AirtableConnector(LoadConnector):
# Otherwise, create relevant sections
sections = [
Section(
TextSection(
link=link,
text=(
f"{field_name}:\n"

@@ -340,7 +342,7 @@ class AirtableConnector(LoadConnector):
table_name = table_schema.name
record_id = record["id"]
fields = record["fields"]
sections: list[Section] = []
sections: list[TextSection] = []
metadata: dict[str, str | list[str]] = {}
# Get primary field value if it exists

@@ -384,7 +386,7 @@ class AirtableConnector(LoadConnector):
return Document(
id=f"airtable__{record_id}",
sections=sections,
sections=(cast(list[TextSection | ImageSection], sections)),
source=DocumentSource.AIRTABLE,
semantic_identifier=semantic_id,
metadata=metadata,
@@ -10,7 +10,7 @@ from onyx.connectors.interfaces import LoadConnector
from onyx.connectors.interfaces import PollConnector
from onyx.connectors.interfaces import SecondsSinceUnixEpoch
from onyx.connectors.models import Document
from onyx.connectors.models import Section
from onyx.connectors.models import TextSection
from onyx.utils.logger import setup_logger
logger = setup_logger()

@@ -82,7 +82,7 @@ class AsanaConnector(LoadConnector, PollConnector):
logger.debug(f"Converting Asana task {task.id} to Document")
return Document(
id=task.id,
sections=[Section(link=task.link, text=task.text)],
sections=[TextSection(link=task.link, text=task.text)],
doc_updated_at=task.last_modified,
source=DocumentSource.ASANA,
semantic_identifier=task.title,
@@ -20,7 +20,7 @@ from onyx.connectors.interfaces import PollConnector
from onyx.connectors.interfaces import SecondsSinceUnixEpoch
from onyx.connectors.models import ConnectorMissingCredentialError
from onyx.connectors.models import Document
from onyx.connectors.models import Section
from onyx.connectors.models import TextSection
from onyx.file_processing.html_utils import parse_html_page_basic
from onyx.utils.logger import setup_logger
from onyx.utils.retry_wrapper import retry_builder

@@ -221,7 +221,7 @@ def _get_forums(
def _translate_forum_to_doc(af: AxeroForum) -> Document:
doc = Document(
id=af.doc_id,
sections=[Section(link=af.link, text=reply) for reply in af.responses],
sections=[TextSection(link=af.link, text=reply) for reply in af.responses],
source=DocumentSource.AXERO,
semantic_identifier=af.title,
doc_updated_at=af.last_update,

@@ -244,7 +244,7 @@ def _translate_content_to_doc(content: dict) -> Document:
doc = Document(
id="AXERO_" + str(content["ContentID"]),
sections=[Section(link=content["ContentURL"], text=page_text)],
sections=[TextSection(link=content["ContentURL"], text=page_text)],
source=DocumentSource.AXERO,
semantic_identifier=content["ContentTitle"],
doc_updated_at=time_str_to_utc(content["DateUpdated"]),
@@ -25,7 +25,7 @@ from onyx.connectors.interfaces import PollConnector
from onyx.connectors.interfaces import SecondsSinceUnixEpoch
from onyx.connectors.models import ConnectorMissingCredentialError
from onyx.connectors.models import Document
from onyx.connectors.models import Section
from onyx.connectors.models import TextSection
from onyx.file_processing.extract_file_text import extract_file_text
from onyx.utils.logger import setup_logger

@@ -208,7 +208,7 @@ class BlobStorageConnector(LoadConnector, PollConnector):
batch.append(
Document(
id=f"{self.bucket_type}:{self.bucket_name}:{obj['Key']}",
sections=[Section(link=link, text=text)],
sections=[TextSection(link=link, text=text)],
source=DocumentSource(self.bucket_type.value),
semantic_identifier=name,
doc_updated_at=last_modified,

@@ -341,7 +341,14 @@ if __name__ == "__main__":
print("Sections:")
for section in doc.sections:
print(f" - Link: {section.link}")
print(f" - Text: {section.text[:100]}...")
if isinstance(section, TextSection) and section.text is not None:
print(f" - Text: {section.text[:100]}...")
elif (
hasattr(section, "image_file_name") and section.image_file_name
):
print(f" - Image: {section.image_file_name}")
else:
print("Error: Unknown section type")
print("---")
break
@@ -18,7 +18,7 @@ from onyx.connectors.interfaces import PollConnector
from onyx.connectors.interfaces import SecondsSinceUnixEpoch
from onyx.connectors.models import ConnectorMissingCredentialError
from onyx.connectors.models import Document
from onyx.connectors.models import Section
from onyx.connectors.models import TextSection
from onyx.file_processing.html_utils import parse_html_page_basic

@@ -81,7 +81,7 @@ class BookstackConnector(LoadConnector, PollConnector):
)
return Document(
id="book__" + str(book.get("id")),
sections=[Section(link=url, text=text)],
sections=[TextSection(link=url, text=text)],
source=DocumentSource.BOOKSTACK,
semantic_identifier="Book: " + title,
title=title,

@@ -110,7 +110,7 @@ class BookstackConnector(LoadConnector, PollConnector):
)
return Document(
id="chapter__" + str(chapter.get("id")),
sections=[Section(link=url, text=text)],
sections=[TextSection(link=url, text=text)],
source=DocumentSource.BOOKSTACK,
semantic_identifier="Chapter: " + title,
title=title,

@@ -134,7 +134,7 @@ class BookstackConnector(LoadConnector, PollConnector):
)
return Document(
id="shelf:" + str(shelf.get("id")),
sections=[Section(link=url, text=text)],
sections=[TextSection(link=url, text=text)],
source=DocumentSource.BOOKSTACK,
semantic_identifier="Shelf: " + title,
title=title,

@@ -167,7 +167,7 @@ class BookstackConnector(LoadConnector, PollConnector):
time.sleep(0.1)
return Document(
id="page:" + page_id,
sections=[Section(link=url, text=text)],
sections=[TextSection(link=url, text=text)],
source=DocumentSource.BOOKSTACK,
semantic_identifier="Page: " + str(title),
title=str(title),
@@ -17,7 +17,7 @@ from onyx.connectors.interfaces import SecondsSinceUnixEpoch
from onyx.connectors.models import BasicExpertInfo
from onyx.connectors.models import ConnectorMissingCredentialError
from onyx.connectors.models import Document
from onyx.connectors.models import Section
from onyx.connectors.models import TextSection
from onyx.utils.retry_wrapper import retry_builder

@@ -62,11 +62,11 @@ class ClickupConnector(LoadConnector, PollConnector):
return response.json()
def _get_task_comments(self, task_id: str) -> list[Section]:
def _get_task_comments(self, task_id: str) -> list[TextSection]:
url_endpoint = f"/task/{task_id}/comment"
response = self._make_request(url_endpoint)
comments = [
Section(
TextSection(
link=f'https://app.clickup.com/t/{task_id}?comment={comment_dict["id"]}',
text=comment_dict["comment_text"],
)

@@ -133,7 +133,7 @@ class ClickupConnector(LoadConnector, PollConnector):
],
title=task["name"],
sections=[
Section(
TextSection(
link=task["url"],
text=(
task["markdown_description"]
@@ -33,9 +33,9 @@ from onyx.connectors.interfaces import SlimConnector
from onyx.connectors.models import BasicExpertInfo
from onyx.connectors.models import ConnectorMissingCredentialError
from onyx.connectors.models import Document
from onyx.connectors.models import Section
from onyx.connectors.models import ImageSection
from onyx.connectors.models import SlimDocument
from onyx.connectors.vision_enabled_connector import VisionEnabledConnector
from onyx.connectors.models import TextSection
from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface
from onyx.utils.logger import setup_logger

@@ -85,7 +85,6 @@ class ConfluenceConnector(
PollConnector,
SlimConnector,
CredentialsConnector,
VisionEnabledConnector,
):
def __init__(
self,

@@ -116,9 +115,6 @@ class ConfluenceConnector(
self._confluence_client: OnyxConfluence | None = None
self._fetched_titles: set[str] = set()
# Initialize vision LLM using the mixin
self.initialize_vision_llm()
# Remove trailing slash from wiki_base if present
self.wiki_base = wiki_base.rstrip("/")
"""

@@ -245,12 +241,16 @@ class ConfluenceConnector(
)
# Create the main section for the page content
sections = [Section(text=page_content, link=page_url)]
sections: list[TextSection | ImageSection] = [
TextSection(text=page_content, link=page_url)
]
# Process comments if available
comment_text = self._get_comment_string_for_page_id(page_id)
if comment_text:
sections.append(Section(text=comment_text, link=f"{page_url}#comments"))
sections.append(
TextSection(text=comment_text, link=f"{page_url}#comments")
)
# Process attachments
if "children" in page and "attachment" in page["children"]:

@@ -264,21 +264,26 @@ class ConfluenceConnector(
self.confluence_client,
attachment,
page_id,
page_title,
self.image_analysis_llm,
)
if result.text:
if result and result.text:
# Create a section for the attachment text
attachment_section = Section(
attachment_section = TextSection(
text=result.text,
link=f"{page_url}#attachment-{attachment['id']}",
)
sections.append(attachment_section)
elif result and result.file_name:
# Create an ImageSection for image attachments
image_section = ImageSection(
link=f"{page_url}#attachment-{attachment['id']}",
image_file_name=result.file_name,
)
sections.append(attachment_section)
elif result.error:
sections.append(image_section)
else:
logger.warning(
f"Error processing attachment '{attachment.get('title')}': {result.error}"
f"Error processing attachment '{attachment.get('title')}':",
f"{result.error if result else 'Unknown error'}",
)
# Extract metadata

@@ -349,7 +354,7 @@ class ConfluenceConnector(
# Now get attachments for that page:
attachment_query = self._construct_attachment_query(page["id"])
# We'll use the page's XML to provide context if we summarize an image
confluence_xml = page.get("body", {}).get("storage", {}).get("value", "")
page.get("body", {}).get("storage", {}).get("value", "")
for attachment in self.confluence_client.paginated_cql_retrieval(
cql=attachment_query,

@@ -357,7 +362,7 @@ class ConfluenceConnector(
):
attachment["metadata"].get("mediaType", "")
if not validate_attachment_filetype(
attachment, self.image_analysis_llm
attachment,
):
continue

@@ -368,23 +373,25 @@ class ConfluenceConnector(
confluence_client=self.confluence_client,
attachment=attachment,
page_id=page["id"],
page_context=confluence_xml,
llm=self.image_analysis_llm,
)
if response is None:
continue
content_text, file_storage_name = response
object_url = build_confluence_document_id(
self.wiki_base, attachment["_links"]["webui"], self.is_cloud
)
if content_text:
doc.sections.append(
Section(
TextSection(
text=content_text,
link=object_url,
)
)
elif file_storage_name:
doc.sections.append(
ImageSection(
link=object_url,
image_file_name=file_storage_name,
)
)

@@ -464,7 +471,7 @@ class ConfluenceConnector(
# If you skip images, you'll skip them in the permission sync
attachment["metadata"].get("mediaType", "")
if not validate_attachment_filetype(
attachment, self.image_analysis_llm
attachment,
):
continue
@@ -36,7 +36,6 @@ from onyx.db.pg_file_store import upsert_pgfilestore
from onyx.file_processing.extract_file_text import extract_file_text
from onyx.file_processing.file_validation import is_valid_image_type
from onyx.file_processing.image_utils import store_image_and_create_section
from onyx.llm.interfaces import LLM
from onyx.utils.logger import setup_logger
logger = setup_logger()

@@ -54,17 +53,16 @@ class TokenResponse(BaseModel):
def validate_attachment_filetype(
attachment: dict[str, Any], llm: LLM | None = None
attachment: dict[str, Any],
) -> bool:
"""
Validates if the attachment is a supported file type.
If LLM is provided, also checks if it's an image that can be processed.
"""
attachment.get("metadata", {})
media_type = attachment.get("metadata", {}).get("mediaType", "")
if media_type.startswith("image/"):
return llm is not None and is_valid_image_type(media_type)
return is_valid_image_type(media_type)
# For non-image files, check if we support the extension
title = attachment.get("title", "")

@@ -114,19 +112,16 @@ def process_attachment(
confluence_client: "OnyxConfluence",
attachment: dict[str, Any],
parent_content_id: str | None,
page_context: str,
llm: LLM | None,
) -> AttachmentProcessingResult:
"""
Processes a Confluence attachment. If it's a document, extracts text,
or if it's an image and an LLM is available, summarizes it. Returns a structured result.
or if it's an image, stores it for later analysis. Returns a structured result.
"""
try:
# Get the media type from the attachment metadata
media_type = attachment.get("metadata", {}).get("mediaType", "")
# Validate the attachment type
if not validate_attachment_filetype(attachment, llm):
if not validate_attachment_filetype(attachment):
return AttachmentProcessingResult(
text=None,
file_name=None,

@@ -143,7 +138,7 @@ def process_attachment(
attachment_size = attachment["extensions"]["fileSize"]
if not media_type.startswith("image/") or not llm:
if not media_type.startswith("image/"):
if attachment_size > CONFLUENCE_CONNECTOR_ATTACHMENT_SIZE_THRESHOLD:
logger.warning(
f"Skipping {attachment_link} due to size. "

@@ -181,10 +176,10 @@ def process_attachment(
text=None, file_name=None, error="attachment.content is None"
)
# Process image attachments with LLM if available
if media_type.startswith("image/") and llm:
# Process image attachments
if media_type.startswith("image/"):
return _process_image_attachment(
confluence_client, attachment, page_context, llm, raw_bytes, media_type
confluence_client, attachment, raw_bytes, media_type
)
# Process document attachments

@@ -217,12 +212,10 @@ def process_attachment(
def _process_image_attachment(
confluence_client: "OnyxConfluence",
attachment: dict[str, Any],
page_context: str,
llm: LLM,
raw_bytes: bytes,
media_type: str,
) -> AttachmentProcessingResult:
"""Process an image attachment by saving it and generating a summary."""
"""Process an image attachment by saving it without generating a summary."""
try:
# Use the standardized image storage and section creation
with get_session_with_current_tenant() as db_session:

@@ -232,15 +225,14 @@ def _process_image_attachment(
file_name=Path(attachment["id"]).name,
display_name=attachment["title"],
media_type=media_type,
llm=llm,
file_origin=FileOrigin.CONNECTOR,
)
logger.info(f"Stored image attachment with file name: {file_name}")
return AttachmentProcessingResult(
text=section.text, file_name=file_name, error=None
)
# Return empty text but include the file_name for later processing
return AttachmentProcessingResult(text="", file_name=file_name, error=None)
except Exception as e:
msg = f"Image summarization failed for {attachment['title']}: {e}"
msg = f"Image storage failed for {attachment['title']}: {e}"
logger.error(msg, exc_info=e)
return AttachmentProcessingResult(text=None, file_name=None, error=msg)

@@ -302,13 +294,11 @@ def convert_attachment_to_content(
confluence_client: "OnyxConfluence",
attachment: dict[str, Any],
page_id: str,
page_context: str,
llm: LLM | None,
) -> tuple[str | None, str | None] | None:
"""
Facade function which:
1. Validates attachment type
2. Extracts or summarizes content
2. Extracts content or stores image for later processing
3. Returns (content_text, stored_file_name) or None if we should skip it
"""
media_type = attachment["metadata"]["mediaType"]

@@ -319,9 +309,7 @@ def convert_attachment_to_content(
)
return None
result = process_attachment(
confluence_client, attachment, page_id, page_context, llm
)
result = process_attachment(confluence_client, attachment, page_id)
if result.error is not None:
logger.warning(
f"Attachment {attachment['title']} encountered error: {result.error}"
@@ -4,6 +4,7 @@ from collections.abc import Iterable
from datetime import datetime
from datetime import timezone
from typing import Any
from typing import cast
from discord import Client
from discord.channel import TextChannel

@@ -20,7 +21,8 @@ from onyx.connectors.interfaces import PollConnector
from onyx.connectors.interfaces import SecondsSinceUnixEpoch
from onyx.connectors.models import ConnectorMissingCredentialError
from onyx.connectors.models import Document
from onyx.connectors.models import Section
from onyx.connectors.models import ImageSection
from onyx.connectors.models import TextSection
from onyx.utils.logger import setup_logger
logger = setup_logger()

@@ -32,7 +34,7 @@ _SNIPPET_LENGTH = 30
def _convert_message_to_document(
message: DiscordMessage,
sections: list[Section],
sections: list[TextSection],
) -> Document:
"""
Convert a discord message to a document

@@ -78,7 +80,7 @@ def _convert_message_to_document(
semantic_identifier=semantic_identifier,
doc_updated_at=message.edited_at,
title=title,
sections=sections,
sections=(cast(list[TextSection | ImageSection], sections)),
metadata=metadata,
)

@@ -123,8 +125,8 @@ async def _fetch_documents_from_channel(
if channel_message.type != MessageType.default:
continue
sections: list[Section] = [
Section(
sections: list[TextSection] = [
TextSection(
text=channel_message.content,
link=channel_message.jump_url,
)

@@ -142,7 +144,7 @@ async def _fetch_documents_from_channel(
continue
sections = [
Section(
TextSection(
text=thread_message.content,
link=thread_message.jump_url,
)

@@ -160,7 +162,7 @@ async def _fetch_documents_from_channel(
continue
sections = [
Section(
TextSection(
text=thread_message.content,
link=thread_message.jump_url,
)
@@ -3,6 +3,7 @@ import urllib.parse
from datetime import datetime
from datetime import timezone
from typing import Any
from typing import cast
import requests
from pydantic import BaseModel

@@ -20,7 +21,8 @@ from onyx.connectors.interfaces import SecondsSinceUnixEpoch
from onyx.connectors.models import BasicExpertInfo
from onyx.connectors.models import ConnectorMissingCredentialError
from onyx.connectors.models import Document
from onyx.connectors.models import Section
from onyx.connectors.models import ImageSection
from onyx.connectors.models import TextSection
from onyx.file_processing.html_utils import parse_html_page_basic
from onyx.utils.logger import setup_logger
from onyx.utils.retry_wrapper import retry_builder

@@ -112,7 +114,7 @@ class DiscourseConnector(PollConnector):
responders.append(BasicExpertInfo(display_name=responder_name))
sections.append(
Section(link=topic_url, text=parse_html_page_basic(post["cooked"]))
TextSection(link=topic_url, text=parse_html_page_basic(post["cooked"]))
)
category_name = self.category_id_map.get(topic["category_id"])

@@ -129,7 +131,7 @@ class DiscourseConnector(PollConnector):
doc = Document(
id="_".join([DocumentSource.DISCOURSE.value, str(topic["id"])]),
sections=sections,
sections=cast(list[TextSection | ImageSection], sections),
source=DocumentSource.DISCOURSE,
semantic_identifier=topic["title"],
doc_updated_at=time_str_to_utc(topic["last_posted_at"]),
@@ -19,7 +19,7 @@ from onyx.connectors.interfaces import SecondsSinceUnixEpoch
from onyx.connectors.models import BasicExpertInfo
from onyx.connectors.models import ConnectorMissingCredentialError
from onyx.connectors.models import Document
from onyx.connectors.models import Section
from onyx.connectors.models import TextSection
from onyx.file_processing.html_utils import parse_html_page_basic
from onyx.utils.retry_wrapper import retry_builder

@@ -158,7 +158,7 @@ class Document360Connector(LoadConnector, PollConnector):
document = Document(
id=article_details["id"],
sections=[Section(link=doc_link, text=doc_text)],
sections=[TextSection(link=doc_link, text=doc_text)],
source=DocumentSource.DOCUMENT360,
semantic_identifier=article_details["title"],
doc_updated_at=updated_at,
@@ -19,7 +19,7 @@ from onyx.connectors.interfaces import PollConnector
from onyx.connectors.interfaces import SecondsSinceUnixEpoch
from onyx.connectors.models import ConnectorMissingCredentialError
from onyx.connectors.models import Document
from onyx.connectors.models import Section
from onyx.connectors.models import TextSection
from onyx.file_processing.extract_file_text import extract_file_text
from onyx.utils.logger import setup_logger

@@ -108,7 +108,7 @@ class DropboxConnector(LoadConnector, PollConnector):
batch.append(
Document(
id=f"doc:{entry.id}",
sections=[Section(link=link, text=text)],
sections=[TextSection(link=link, text=text)],
source=DocumentSource.DROPBOX,
semantic_identifier=entry.name,
doc_updated_at=modified_time,
@@ -24,7 +24,7 @@ from onyx.connectors.interfaces import SecondsSinceUnixEpoch
from onyx.connectors.models import BasicExpertInfo
from onyx.connectors.models import ConnectorMissingCredentialError
from onyx.connectors.models import Document
from onyx.connectors.models import Section
from onyx.connectors.models import TextSection
from onyx.file_processing.extract_file_text import detect_encoding
from onyx.file_processing.extract_file_text import extract_file_text
from onyx.file_processing.extract_file_text import get_file_ext

@@ -111,7 +111,7 @@ def _process_egnyte_file(
# Create the document
return Document(
id=f"egnyte-{file_metadata['entry_id']}",
sections=[Section(text=file_content_raw.strip(), link=web_url)],
sections=[TextSection(text=file_content_raw.strip(), link=web_url)],
source=DocumentSource.EGNYTE,
semantic_identifier=file_name,
metadata=metadata,
@@ -16,8 +16,8 @@ from onyx.connectors.interfaces import GenerateDocumentsOutput
from onyx.connectors.interfaces import LoadConnector
from onyx.connectors.models import BasicExpertInfo
from onyx.connectors.models import Document
from onyx.connectors.models import Section
from onyx.connectors.vision_enabled_connector import VisionEnabledConnector
from onyx.connectors.models import ImageSection
from onyx.connectors.models import TextSection
from onyx.db.engine import get_session_with_current_tenant
from onyx.db.pg_file_store import get_pgfilestore_by_file_name
from onyx.file_processing.extract_file_text import extract_text_and_images

@@ -26,7 +26,6 @@ from onyx.file_processing.extract_file_text import is_valid_file_ext
from onyx.file_processing.extract_file_text import load_files_from_zip
from onyx.file_processing.image_utils import store_image_and_create_section
from onyx.file_store.file_store import get_default_file_store
from onyx.llm.interfaces import LLM
from onyx.utils.logger import setup_logger
logger = setup_logger()

@@ -59,32 +58,44 @@ def _read_files_and_metadata(
def _create_image_section(
llm: LLM | None,
image_data: bytes,
db_session: Session,
parent_file_name: str,
display_name: str,
link: str | None = None,
idx: int = 0,
) -> tuple[Section, str | None]:
) -> tuple[ImageSection, str | None]:
"""
Create a Section object for a single image and store the image in PGFileStore.
If summarization is enabled and we have an LLM, summarize the image.
Creates an ImageSection for an image file or embedded image.
Stores the image in PGFileStore but does not generate a summary.
Args:
image_data: Raw image bytes
db_session: Database session
parent_file_name: Name of the parent file (for embedded images)
display_name: Display name for the image
idx: Index for embedded images
Returns:
tuple: (Section object, file_name in PGFileStore or None if storage failed)
Tuple of (ImageSection, stored_file_name or None)
"""
# Create a unique file name for the embedded image
file_name = f"{parent_file_name}_embedded_{idx}"
# Create a unique identifier for the image
file_name = f"{parent_file_name}_embedded_{idx}" if idx > 0 else parent_file_name
# Use the standardized utility to store the image and create a section
return store_image_and_create_section(
db_session=db_session,
image_data=image_data,
file_name=file_name,
display_name=display_name,
llm=llm,
file_origin=FileOrigin.OTHER,
)
# Store the image and create a section
try:
section, stored_file_name = store_image_and_create_section(
db_session=db_session,
image_data=image_data,
file_name=file_name,
display_name=display_name,
link=link,
file_origin=FileOrigin.CONNECTOR,
)
return section, stored_file_name
except Exception as e:
logger.error(f"Failed to store image {display_name}: {e}")
raise e
def _process_file(
@@ -93,12 +104,16 @@ def _process_file(
metadata: dict[str, Any] | None,
pdf_pass: str | None,
db_session: Session,
llm: LLM | None,
) -> list[Document]:
"""
Processes a single file, returning a list of Documents (typically one).
Also handles embedded images if 'EMBEDDED_IMAGE_EXTRACTION_ENABLED' is true.
Process a file and return a list of Documents.
For images, creates ImageSection objects without summarization.
For documents with embedded images, extracts and stores the images.
"""
if metadata is None:
metadata = {}
# Get file extension and determine file type
extension = get_file_ext(file_name)
# Fetch the DB record so we know the ID for internal URL

@@ -114,8 +129,6 @@ def _process_file(
return []
# Prepare doc metadata
if metadata is None:
metadata = {}
file_display_name = metadata.get("file_display_name") or os.path.basename(file_name)
# Timestamps

@@ -158,6 +171,7 @@ def _process_file(
"title",
"connector_type",
"pdf_password",
"mime_type",
]
}

@@ -170,33 +184,45 @@ def _process_file(
title = metadata.get("title") or file_display_name
# 1) If the file itself is an image, handle that scenario quickly
IMAGE_EXTENSIONS = {".jpg", ".jpeg", ".png", ".webp"}
if extension in IMAGE_EXTENSIONS:
# Summarize or produce empty doc
if extension in LoadConnector.IMAGE_EXTENSIONS:
# Read the image data
image_data = file.read()
image_section, _ = _create_image_section(
llm, image_data, db_session, pg_record.file_name, title
)
return [
Document(
id=doc_id,
sections=[image_section],
source=source_type,
semantic_identifier=file_display_name,
title=title,
doc_updated_at=final_time_updated,
primary_owners=p_owners,
secondary_owners=s_owners,
metadata=metadata_tags,
)
]
if not image_data:
logger.warning(f"Empty image file: {file_name}")
return []
# 2) Otherwise: text-based approach. Possibly with embedded images if enabled.
# (For example .docx with inline images).
# Create an ImageSection for the image
try:
section, _ = _create_image_section(
image_data=image_data,
db_session=db_session,
parent_file_name=pg_record.file_name,
display_name=title,
)
return [
Document(
id=doc_id,
sections=[section],
source=source_type,
semantic_identifier=file_display_name,
title=title,
doc_updated_at=final_time_updated,
primary_owners=p_owners,
secondary_owners=s_owners,
metadata=metadata_tags,
)
]
except Exception as e:
logger.error(f"Failed to process image file {file_name}: {e}")
return []
# 2) Otherwise: text-based approach. Possibly with embedded images.
file.seek(0)
text_content = ""
embedded_images: list[tuple[bytes, str]] = []
# Extract text and images from the file
text_content, embedded_images = extract_text_and_images(
file=file,
file_name=file_name,

@@ -204,24 +230,29 @@ def _process_file(
)
# Build sections: first the text as a single Section
sections = []
sections: list[TextSection | ImageSection] = []
link_in_meta = metadata.get("link")
if text_content.strip():
sections.append(Section(link=link_in_meta, text=text_content.strip()))
sections.append(TextSection(link=link_in_meta, text=text_content.strip()))
# Then any extracted images from docx, etc.
for idx, (img_data, img_name) in enumerate(embedded_images, start=1):
# Store each embedded image as a separate file in PGFileStore
# and create a section with the image summary
image_section, _ = _create_image_section(
llm,
img_data,
db_session,
pg_record.file_name,
f"{title} - image {idx}",
idx,
)
sections.append(image_section)
# and create a section with the image reference
try:
image_section, _ = _create_image_section(
image_data=img_data,
db_session=db_session,
parent_file_name=pg_record.file_name,
display_name=f"{title} - image {idx}",
idx=idx,
)
sections.append(image_section)
except Exception as e:
logger.warning(
f"Failed to process embedded image {idx} in {file_name}: {e}"
)
return [
Document(
id=doc_id,

@@ -237,10 +268,10 @@ def _process_file(
]
class LocalFileConnector(LoadConnector, VisionEnabledConnector):
class LocalFileConnector(LoadConnector):
"""
Connector that reads files from Postgres and yields Documents, including
optional embedded image extraction.
embedded image extraction without summarization.
"""
def __init__(

@@ -252,9 +283,6 @@ class LocalFileConnector(LoadConnector, VisionEnabledConnector):
self.batch_size = batch_size
self.pdf_pass: str | None = None
# Initialize vision LLM using the mixin
self.initialize_vision_llm()
def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
self.pdf_pass = credentials.get("pdf_password")

@@ -286,7 +314,6 @@ class LocalFileConnector(LoadConnector, VisionEnabledConnector):
metadata=metadata,
pdf_pass=self.pdf_pass,
db_session=db_session,
llm=self.image_analysis_llm,
)
documents.extend(new_docs)
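The file connector change above follows the same pattern: the document text becomes one `TextSection` and each embedded image becomes an `ImageSection` that is stored but not summarized. Below is a hedged sketch of that assembly loop using simple stand-in models; the real code routes the bytes through `store_image_and_create_section` and a DB session, which are replaced here by a hypothetical `_store_embedded_image` helper:

```python
from dataclasses import dataclass


@dataclass
class TextSection:
    text: str
    link: str | None = None


@dataclass
class ImageSection:
    image_file_name: str
    link: str | None = None


def _store_embedded_image(img_data: bytes, stored_name: str) -> str:
    """Hypothetical stand-in for store_image_and_create_section()."""
    if not img_data:
        raise ValueError("empty image payload")
    return stored_name


def build_sections(
    text_content: str,
    embedded_images: list[tuple[bytes, str]],
    parent_file_name: str,
    link: str | None = None,
) -> list[TextSection | ImageSection]:
    sections: list[TextSection | ImageSection] = []
    if text_content.strip():
        sections.append(TextSection(text=text_content.strip(), link=link))
    for idx, (img_data, _img_name) in enumerate(embedded_images, start=1):
        try:
            # Record a deterministic stored name so the later analysis phase
            # can find the image; no summary is generated here.
            stored_name = _store_embedded_image(
                img_data, f"{parent_file_name}_embedded_{idx}"
            )
            sections.append(ImageSection(image_file_name=stored_name, link=link))
        except Exception:
            # Mirror the diff: a bad embedded image is skipped, not fatal.
            continue
    return sections


print(build_sections("Hello world", [(b"...", "chart.png")], "report.docx"))
```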
@@ -1,6 +1,7 @@
from collections.abc import Iterator
from datetime import datetime
from datetime import timezone
from typing import cast
from typing import List
import requests

@@ -14,7 +15,8 @@ from onyx.connectors.interfaces import SecondsSinceUnixEpoch
from onyx.connectors.models import BasicExpertInfo
from onyx.connectors.models import ConnectorMissingCredentialError
from onyx.connectors.models import Document
from onyx.connectors.models import Section
from onyx.connectors.models import ImageSection
from onyx.connectors.models import TextSection
from onyx.utils.logger import setup_logger
logger = setup_logger()

@@ -45,7 +47,7 @@ _FIREFLIES_API_QUERY = """
def _create_doc_from_transcript(transcript: dict) -> Document | None:
sections: List[Section] = []
sections: List[TextSection] = []
current_speaker_name = None
current_link = ""
current_text = ""

@@ -57,7 +59,7 @@ def _create_doc_from_transcript(transcript: dict) -> Document | None:
if sentence["speaker_name"] != current_speaker_name:
if current_speaker_name is not None:
sections.append(
Section(
TextSection(
link=current_link,
text=current_text.strip(),
)

@@ -71,7 +73,7 @@ def _create_doc_from_transcript(transcript: dict) -> Document | None:
# Sometimes these links (links with a timestamp) do not work, it is a bug with Fireflies.
sections.append(
Section(
TextSection(
link=current_link,
text=current_text.strip(),
)

@@ -94,7 +96,7 @@ def _create_doc_from_transcript(transcript: dict) -> Document | None:
return Document(
id=fireflies_id,
sections=sections,
sections=cast(list[TextSection | ImageSection], sections),
source=DocumentSource.FIREFLIES,
semantic_identifier=meeting_title,
metadata={},
@@ -14,7 +14,7 @@ from onyx.connectors.interfaces import PollConnector
from onyx.connectors.interfaces import SecondsSinceUnixEpoch
from onyx.connectors.models import ConnectorMissingCredentialError
from onyx.connectors.models import Document
from onyx.connectors.models import Section
from onyx.connectors.models import TextSection
from onyx.file_processing.html_utils import parse_html_page_basic
from onyx.utils.logger import setup_logger

@@ -133,7 +133,7 @@ def _create_doc_from_ticket(ticket: dict, domain: str) -> Document:
return Document(
id=_FRESHDESK_ID_PREFIX + link,
sections=[
Section(
TextSection(
link=link,
text=text,
)
@@ -13,7 +13,7 @@ from onyx.connectors.interfaces import PollConnector
from onyx.connectors.interfaces import SecondsSinceUnixEpoch
from onyx.connectors.models import ConnectorMissingCredentialError
from onyx.connectors.models import Document
from onyx.connectors.models import Section
from onyx.connectors.models import TextSection
from onyx.utils.logger import setup_logger

@@ -183,7 +183,7 @@ def _convert_page_to_document(
return Document(
id=f"gitbook-{space_id}-{page_id}",
sections=[
Section(
TextSection(
link=page.get("urls", {}).get("app", ""),
text=_extract_text_from_document(page_content),
)
@@ -27,7 +27,7 @@ from onyx.connectors.interfaces import PollConnector
from onyx.connectors.interfaces import SecondsSinceUnixEpoch
from onyx.connectors.models import ConnectorMissingCredentialError
from onyx.connectors.models import Document
from onyx.connectors.models import Section
from onyx.connectors.models import TextSection
from onyx.utils.batching import batch_generator
from onyx.utils.logger import setup_logger

@@ -87,7 +87,9 @@ def _batch_github_objects(
def _convert_pr_to_document(pull_request: PullRequest) -> Document:
return Document(
id=pull_request.html_url,
sections=[Section(link=pull_request.html_url, text=pull_request.body or "")],
sections=[
TextSection(link=pull_request.html_url, text=pull_request.body or "")
],
source=DocumentSource.GITHUB,
semantic_identifier=pull_request.title,
# updated_at is UTC time but is timezone unaware, explicitly add UTC

@@ -109,7 +111,7 @@ def _fetch_issue_comments(issue: Issue) -> str:
def _convert_issue_to_document(issue: Issue) -> Document:
return Document(
id=issue.html_url,
sections=[Section(link=issue.html_url, text=issue.body or "")],
sections=[TextSection(link=issue.html_url, text=issue.body or "")],
source=DocumentSource.GITHUB,
semantic_identifier=issue.title,
# updated_at is UTC time but is timezone unaware
@@ -21,7 +21,7 @@ from onyx.connectors.interfaces import SecondsSinceUnixEpoch
from onyx.connectors.models import BasicExpertInfo
from onyx.connectors.models import ConnectorMissingCredentialError
from onyx.connectors.models import Document
from onyx.connectors.models import Section
from onyx.connectors.models import TextSection
from onyx.utils.logger import setup_logger

@@ -56,7 +56,7 @@ def get_author(author: Any) -> BasicExpertInfo:
def _convert_merge_request_to_document(mr: Any) -> Document:
doc = Document(
id=mr.web_url,
sections=[Section(link=mr.web_url, text=mr.description or "")],
sections=[TextSection(link=mr.web_url, text=mr.description or "")],
source=DocumentSource.GITLAB,
semantic_identifier=mr.title,
# updated_at is UTC time but is timezone unaware, explicitly add UTC

@@ -72,7 +72,7 @@ def _convert_merge_request_to_document(mr: Any) -> Document:
def _convert_issue_to_document(issue: Any) -> Document:
doc = Document(
id=issue.web_url,
sections=[Section(link=issue.web_url, text=issue.description or "")],
sections=[TextSection(link=issue.web_url, text=issue.description or "")],
source=DocumentSource.GITLAB,
semantic_identifier=issue.title,
# updated_at is UTC time but is timezone unaware, explicitly add UTC

@@ -99,7 +99,7 @@ def _convert_code_to_document(
file_url = f"{url}/{projectOwner}/{projectName}/-/blob/master/{file['path']}" # Construct the file URL
doc = Document(
id=file["id"],
sections=[Section(link=file_url, text=file_content)],
sections=[TextSection(link=file_url, text=file_content)],
source=DocumentSource.GITLAB,
semantic_identifier=file["name"],
doc_updated_at=datetime.now().replace(
@@ -1,5 +1,6 @@
from base64 import urlsafe_b64decode
from typing import Any
from typing import cast
from typing import Dict
from google.oauth2.credentials import Credentials as OAuthCredentials  # type: ignore

@@ -28,8 +29,9 @@ from onyx.connectors.interfaces import SecondsSinceUnixEpoch
from onyx.connectors.interfaces import SlimConnector
from onyx.connectors.models import BasicExpertInfo
from onyx.connectors.models import Document
from onyx.connectors.models import Section
from onyx.connectors.models import ImageSection
from onyx.connectors.models import SlimDocument
from onyx.connectors.models import TextSection
from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface
from onyx.utils.logger import setup_logger
from onyx.utils.retry_wrapper import retry_builder

@@ -115,7 +117,7 @@ def _get_message_body(payload: dict[str, Any]) -> str:
return message_body
def message_to_section(message: Dict[str, Any]) -> tuple[Section, dict[str, str]]:
def message_to_section(message: Dict[str, Any]) -> tuple[TextSection, dict[str, str]]:
link = f"https://mail.google.com/mail/u/0/#inbox/{message['id']}"
payload = message.get("payload", {})

@@ -142,7 +144,7 @@ def message_to_section(message: Dict[str, Any]) -> tuple[Section, dict[str, str]
message_body_text: str = _get_message_body(payload)
return Section(link=link, text=message_body_text + message_data), metadata
return TextSection(link=link, text=message_body_text + message_data), metadata
def thread_to_document(full_thread: Dict[str, Any]) -> Document | None:

@@ -192,7 +194,7 @@ def thread_to_document(full_thread: Dict[str, Any]) -> Document | None:
return Document(
id=id,
semantic_identifier=semantic_identifier,
sections=sections,
sections=cast(list[TextSection | ImageSection], sections),
source=DocumentSource.GMAIL,
# This is used to perform permission sync
primary_owners=primary_owners,
@@ -18,7 +18,7 @@ from onyx.connectors.interfaces import PollConnector
from onyx.connectors.interfaces import SecondsSinceUnixEpoch
from onyx.connectors.models import ConnectorMissingCredentialError
from onyx.connectors.models import Document
from onyx.connectors.models import Section
from onyx.connectors.models import TextSection
from onyx.utils.logger import setup_logger

@@ -243,7 +243,7 @@ class GongConnector(LoadConnector, PollConnector):
Document(
id=call_id,
sections=[
Section(link=call_metadata["url"], text=transcript_text)
TextSection(link=call_metadata["url"], text=transcript_text)
],
source=DocumentSource.GONG,
# Should not ever be Untitled as a call cannot be made without a Title
@@ -43,9 +43,7 @@ from onyx.connectors.interfaces import PollConnector
from onyx.connectors.interfaces import SecondsSinceUnixEpoch
from onyx.connectors.interfaces import SlimConnector
from onyx.connectors.models import ConnectorMissingCredentialError
from onyx.connectors.vision_enabled_connector import VisionEnabledConnector
from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface
from onyx.llm.interfaces import LLM
from onyx.utils.logger import setup_logger
from onyx.utils.retry_wrapper import retry_builder

@@ -68,7 +66,6 @@ def _convert_single_file(
creds: Any,
primary_admin_email: str,
file: dict[str, Any],
image_analysis_llm: LLM | None,
) -> Any:
user_email = file.get("owners", [{}])[0].get("emailAddress") or primary_admin_email
user_drive_service = get_drive_service(creds, user_email=user_email)

@@ -77,7 +74,6 @@ def _convert_single_file(
file=file,
drive_service=user_drive_service,
docs_service=docs_service,
image_analysis_llm=image_analysis_llm,  # pass the LLM so doc_conversion can summarize images
)

@@ -116,9 +112,7 @@ def _clean_requested_drive_ids(
return valid_requested_drive_ids, filtered_folder_ids
class GoogleDriveConnector(
LoadConnector, PollConnector, SlimConnector, VisionEnabledConnector
):
class GoogleDriveConnector(LoadConnector, PollConnector, SlimConnector):
def __init__(
self,
include_shared_drives: bool = False,

@@ -151,9 +145,6 @@ class GoogleDriveConnector(
if continue_on_failure is not None:
logger.warning("The 'continue_on_failure' parameter is deprecated.")
# Initialize vision LLM using the mixin
self.initialize_vision_llm()
if (
not include_shared_drives
and not include_my_drives

@@ -539,7 +530,6 @@ class GoogleDriveConnector(
_convert_single_file,
self.creds,
self.primary_admin_email,
image_analysis_llm=self.image_analysis_llm,  # Use the mixin's LLM
)
# Fetch files in batches
@ -1,40 +1,50 @@
|
||||
import io
|
||||
from datetime import datetime
|
||||
from datetime import timezone
|
||||
from tempfile import NamedTemporaryFile
|
||||
from typing import cast
|
||||
|
||||
import openpyxl # type: ignore
|
||||
from googleapiclient.discovery import build # type: ignore
|
||||
from googleapiclient.errors import HttpError # type: ignore
|
||||
from googleapiclient.http import MediaIoBaseDownload # type: ignore
|
||||
|
||||
from onyx.configs.app_configs import CONTINUE_ON_CONNECTOR_FAILURE
|
||||
from onyx.configs.constants import DocumentSource
|
||||
from onyx.configs.constants import FileOrigin
|
||||
from onyx.connectors.google_drive.constants import DRIVE_FOLDER_TYPE
|
||||
from onyx.connectors.google_drive.constants import DRIVE_SHORTCUT_TYPE
|
||||
from onyx.connectors.google_drive.constants import UNSUPPORTED_FILE_TYPE_CONTENT
|
||||
from onyx.connectors.google_drive.models import GDriveMimeType
|
||||
from onyx.connectors.google_drive.models import GoogleDriveFileType
|
||||
from onyx.connectors.google_drive.section_extraction import get_document_sections
|
||||
from onyx.connectors.google_utils.resources import GoogleDocsService
|
||||
from onyx.connectors.google_utils.resources import GoogleDriveService
|
||||
from onyx.connectors.models import Document
|
||||
from onyx.connectors.models import Section
|
||||
from onyx.connectors.models import ImageSection
|
||||
from onyx.connectors.models import SlimDocument
|
||||
from onyx.connectors.models import TextSection
|
||||
from onyx.db.engine import get_session_with_current_tenant
|
||||
from onyx.file_processing.extract_file_text import docx_to_text_and_images
|
||||
from onyx.file_processing.extract_file_text import extract_file_text
|
||||
from onyx.file_processing.extract_file_text import pptx_to_text
|
||||
from onyx.file_processing.extract_file_text import read_pdf_file
|
||||
from onyx.file_processing.extract_file_text import xlsx_to_text
|
||||
from onyx.file_processing.file_validation import is_valid_image_type
|
||||
from onyx.file_processing.image_summarization import summarize_image_with_error_handling
|
||||
from onyx.file_processing.image_utils import store_image_and_create_section
|
||||
from onyx.file_processing.unstructured import get_unstructured_api_key
|
||||
from onyx.file_processing.unstructured import unstructured_to_text
|
||||
from onyx.llm.interfaces import LLM
|
||||
from onyx.utils.logger import setup_logger
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
# Mapping of Google Drive mime types to export formats
|
||||
GOOGLE_MIME_TYPES_TO_EXPORT = {
|
||||
GDriveMimeType.DOC.value: "text/plain",
|
||||
GDriveMimeType.SPREADSHEET.value: "text/csv",
|
||||
GDriveMimeType.PPT.value: "text/plain",
|
||||
}
|
||||
|
||||
# Define Google MIME types mapping
|
||||
GOOGLE_MIME_TYPES = {
|
||||
GDriveMimeType.DOC.value: "text/plain",
|
||||
GDriveMimeType.SPREADSHEET.value: "text/csv",
|
||||
GDriveMimeType.PPT.value: "text/plain",
|
||||
}
|
||||
|
||||
|
||||
def _summarize_drive_image(
|
||||
image_data: bytes, image_name: str, image_analysis_llm: LLM | None
|
||||
@ -66,259 +76,137 @@ def is_gdrive_image_mime_type(mime_type: str) -> bool:
|
||||
def _extract_sections_basic(
|
||||
file: dict[str, str],
|
||||
service: GoogleDriveService,
|
||||
image_analysis_llm: LLM | None = None,
|
||||
) -> list[Section]:
|
||||
"""
|
||||
Extends the existing logic to handle either a docx with embedded images
|
||||
or standalone images (PNG, JPG, etc).
|
||||
"""
|
||||
) -> list[TextSection | ImageSection]:
|
||||
"""Extract text and images from a Google Drive file."""
|
||||
file_id = file["id"]
|
||||
file_name = file["name"]
|
||||
mime_type = file["mimeType"]
|
||||
link = file["webViewLink"]
|
||||
file_name = file.get("name", file["id"])
|
||||
supported_file_types = set(item.value for item in GDriveMimeType)
|
||||
|
||||
# 1) If the file is an image, retrieve the raw bytes, optionally summarize
|
||||
if is_gdrive_image_mime_type(mime_type):
|
||||
try:
|
||||
response = service.files().get_media(fileId=file["id"]).execute()
|
||||
|
||||
with get_session_with_current_tenant() as db_session:
|
||||
section, _ = store_image_and_create_section(
|
||||
db_session=db_session,
|
||||
image_data=response,
|
||||
file_name=file["id"],
|
||||
display_name=file_name,
|
||||
media_type=mime_type,
|
||||
llm=image_analysis_llm,
|
||||
file_origin=FileOrigin.CONNECTOR,
|
||||
)
|
||||
return [section]
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to fetch or summarize image: {e}")
|
||||
return [
|
||||
Section(
|
||||
link=link,
|
||||
text="",
|
||||
image_file_name=link,
|
||||
)
|
||||
]
|
||||
|
||||
if mime_type not in supported_file_types:
|
||||
# Unsupported file types can still have a title, finding this way is still useful
|
||||
return [Section(link=link, text=UNSUPPORTED_FILE_TYPE_CONTENT)]
|
||||
link = file.get("webViewLink", "")
|
||||
|
||||
try:
|
||||
# ---------------------------
|
||||
# Google Sheets extraction
|
||||
if mime_type == GDriveMimeType.SPREADSHEET.value:
|
||||
# For Google Docs, Sheets, and Slides, export as plain text
|
||||
if mime_type in GOOGLE_MIME_TYPES_TO_EXPORT:
|
||||
export_mime_type = GOOGLE_MIME_TYPES_TO_EXPORT[mime_type]
|
||||
# Use the correct API call for exporting files
|
||||
request = service.files().export_media(
|
||||
fileId=file_id, mimeType=export_mime_type
|
||||
)
|
||||
response_bytes = io.BytesIO()
|
||||
downloader = MediaIoBaseDownload(response_bytes, request)
|
||||
done = False
|
||||
while not done:
|
||||
_, done = downloader.next_chunk()
|
||||
|
||||
response = response_bytes.getvalue()
|
||||
if not response:
|
||||
logger.warning(f"Failed to export {file_name} as {export_mime_type}")
|
||||
return []
|
||||
|
||||
text = response.decode("utf-8")
|
||||
return [TextSection(link=link, text=text)]
|
||||
|
||||
# For other file types, download the file
|
||||
# Use the correct API call for downloading files
|
||||
request = service.files().get_media(fileId=file_id)
|
||||
response_bytes = io.BytesIO()
|
||||
downloader = MediaIoBaseDownload(response_bytes, request)
|
||||
done = False
|
||||
while not done:
|
||||
_, done = downloader.next_chunk()
|
||||
|
||||
response = response_bytes.getvalue()
|
||||
if not response:
|
||||
logger.warning(f"Failed to download {file_name}")
|
||||
return []
|
||||
|
||||
# Process based on mime type
|
||||
if mime_type == "text/plain":
|
||||
text = response.decode("utf-8")
|
||||
return [TextSection(link=link, text=text)]
|
||||
|
||||
elif (
|
||||
mime_type
|
||||
== "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
|
||||
):
|
||||
text, _ = docx_to_text_and_images(io.BytesIO(response))
|
||||
return [TextSection(link=link, text=text)]
|
||||
|
||||
elif (
|
||||
mime_type
|
||||
== "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
||||
):
|
||||
text = xlsx_to_text(io.BytesIO(response))
|
||||
return [TextSection(link=link, text=text)]
|
||||
|
||||
elif (
|
||||
mime_type
|
||||
== "application/vnd.openxmlformats-officedocument.presentationml.presentation"
|
||||
):
|
||||
text = pptx_to_text(io.BytesIO(response))
|
||||
return [TextSection(link=link, text=text)]
|
||||
|
||||
elif is_gdrive_image_mime_type(mime_type):
|
||||
# For images, store them for later processing
|
||||
sections: list[TextSection | ImageSection] = []
|
||||
try:
|
||||
sheets_service = build(
|
||||
"sheets", "v4", credentials=service._http.credentials
|
||||
)
|
||||
spreadsheet = (
|
||||
sheets_service.spreadsheets()
|
||||
.get(spreadsheetId=file["id"])
|
||||
.execute()
|
||||
)
|
||||
|
||||
sections = []
|
||||
for sheet in spreadsheet["sheets"]:
|
||||
sheet_name = sheet["properties"]["title"]
|
||||
sheet_id = sheet["properties"]["sheetId"]
|
||||
|
||||
# Get sheet dimensions
|
||||
grid_properties = sheet["properties"].get("gridProperties", {})
|
||||
row_count = grid_properties.get("rowCount", 1000)
|
||||
column_count = grid_properties.get("columnCount", 26)
|
||||
|
||||
# Convert column count to letter (e.g., 26 -> Z, 27 -> AA)
|
||||
end_column = ""
|
||||
while column_count:
|
||||
column_count, remainder = divmod(column_count - 1, 26)
|
||||
end_column = chr(65 + remainder) + end_column
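# Illustrative trace of the base-26 conversion above (not part of the patch):
#   column_count = 26 -> divmod(25, 26) = (0, 25) -> end_column = "Z"
#   column_count = 27 -> divmod(26, 26) = (1, 0)  -> end_column = "A",
#                        then divmod(0, 26) = (0, 0) -> end_column = "AA"
# i.e. 1-indexed column numbers map to A..Z, AA, AB, ... as used in A1 range notation.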
range_name = f"'{sheet_name}'!A1:{end_column}{row_count}"
|
||||
|
||||
try:
|
||||
result = (
|
||||
sheets_service.spreadsheets()
|
||||
.values()
|
||||
.get(spreadsheetId=file["id"], range=range_name)
|
||||
.execute()
|
||||
)
|
||||
values = result.get("values", [])
|
||||
|
||||
if values:
|
||||
text = f"Sheet: {sheet_name}\n"
|
||||
for row in values:
|
||||
text += "\t".join(str(cell) for cell in row) + "\n"
|
||||
sections.append(
|
||||
Section(
|
||||
link=f"{link}#gid={sheet_id}",
|
||||
text=text,
|
||||
)
|
||||
)
|
||||
except HttpError as e:
|
||||
logger.warning(
|
||||
f"Error fetching data for sheet '{sheet_name}': {e}"
|
||||
)
|
||||
continue
|
||||
return sections
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(
|
||||
f"Ran into exception '{e}' when pulling data from Google Sheet '{file['name']}'."
|
||||
" Falling back to basic extraction."
|
||||
)
|
||||
# ---------------------------
|
||||
# Microsoft Excel (.xlsx or .xls) extraction branch
|
||||
elif mime_type in [
|
||||
GDriveMimeType.SPREADSHEET_OPEN_FORMAT.value,
|
||||
GDriveMimeType.SPREADSHEET_MS_EXCEL.value,
|
||||
]:
|
||||
try:
|
||||
response = service.files().get_media(fileId=file["id"]).execute()
|
||||
|
||||
with NamedTemporaryFile(suffix=".xlsx", delete=True) as tmp:
|
||||
tmp.write(response)
|
||||
tmp_path = tmp.name
|
||||
|
||||
section_separator = "\n\n"
|
||||
workbook = openpyxl.load_workbook(tmp_path, read_only=True)
|
||||
|
||||
# Work similarly to the xlsx_to_text function used for file connector
|
||||
# but returns Sections instead of a string
|
||||
sections = [
|
||||
Section(
|
||||
link=link,
|
||||
text=(
|
||||
f"Sheet: {sheet.title}\n\n"
|
||||
+ section_separator.join(
|
||||
",".join(map(str, row))
|
||||
for row in sheet.iter_rows(
|
||||
min_row=1, values_only=True
|
||||
)
|
||||
if row
|
||||
)
|
||||
),
|
||||
)
|
||||
for sheet in workbook.worksheets
|
||||
]
|
||||
|
||||
return sections
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(
|
||||
f"Error extracting data from Excel file '{file['name']}': {e}"
|
||||
)
|
||||
return [
|
||||
Section(link=link, text="Error extracting data from Excel file")
|
||||
]
|
||||
|
||||
# ---------------------------
|
||||
# Export for Google Docs, PPT, and fallback for spreadsheets
|
||||
if mime_type in [
|
||||
GDriveMimeType.DOC.value,
|
||||
GDriveMimeType.PPT.value,
|
||||
GDriveMimeType.SPREADSHEET.value,
|
||||
]:
|
||||
export_mime_type = (
|
||||
"text/plain"
|
||||
if mime_type != GDriveMimeType.SPREADSHEET.value
|
||||
else "text/csv"
|
||||
)
|
||||
text = (
|
||||
service.files()
|
||||
.export(fileId=file["id"], mimeType=export_mime_type)
|
||||
.execute()
|
||||
.decode("utf-8")
|
||||
)
|
||||
return [Section(link=link, text=text)]
|
||||
|
||||
# ---------------------------
|
||||
# Plain text and Markdown files
|
||||
elif mime_type in [
|
||||
GDriveMimeType.PLAIN_TEXT.value,
|
||||
GDriveMimeType.MARKDOWN.value,
|
||||
]:
|
||||
text_data = (
|
||||
service.files().get_media(fileId=file["id"]).execute().decode("utf-8")
|
||||
)
|
||||
return [Section(link=link, text=text_data)]
|
||||
|
||||
# ---------------------------
|
||||
# Word, PowerPoint, PDF files
|
||||
elif mime_type in [
|
||||
GDriveMimeType.WORD_DOC.value,
|
||||
GDriveMimeType.POWERPOINT.value,
|
||||
GDriveMimeType.PDF.value,
|
||||
]:
|
||||
response_bytes = service.files().get_media(fileId=file["id"]).execute()
|
||||
|
||||
# Optionally use Unstructured
|
||||
if get_unstructured_api_key():
|
||||
text = unstructured_to_text(
|
||||
file=io.BytesIO(response_bytes),
|
||||
file_name=file_name,
|
||||
)
|
||||
return [Section(link=link, text=text)]
|
||||
|
||||
if mime_type == GDriveMimeType.WORD_DOC.value:
|
||||
# Use docx_to_text_and_images to get text plus embedded images
|
||||
text, embedded_images = docx_to_text_and_images(
|
||||
file=io.BytesIO(response_bytes),
|
||||
)
|
||||
sections = []
|
||||
if text.strip():
|
||||
sections.append(Section(link=link, text=text.strip()))
|
||||
|
||||
# Process each embedded image using the standardized function
|
||||
with get_session_with_current_tenant() as db_session:
|
||||
for idx, (img_data, img_name) in enumerate(
|
||||
embedded_images, start=1
|
||||
):
|
||||
# Create a unique identifier for the embedded image
|
||||
embedded_id = f"{file['id']}_embedded_{idx}"
|
||||
section, embedded_id = store_image_and_create_section(
|
||||
db_session=db_session,
|
||||
image_data=response,
|
||||
file_name=file_id,
|
||||
display_name=file_name,
|
||||
media_type=mime_type,
|
||||
file_origin=FileOrigin.CONNECTOR,
|
||||
link=link,
|
||||
)
|
||||
sections.append(section)
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to process image {file_name}: {e}")
|
||||
return sections
|
||||
|
||||
elif mime_type == "application/pdf":
|
||||
text, _pdf_meta, images = read_pdf_file(io.BytesIO(response))
|
||||
pdf_sections: list[TextSection | ImageSection] = [
|
||||
TextSection(link=link, text=text)
|
||||
]
|
||||
|
||||
# Process embedded images in the PDF
|
||||
try:
|
||||
with get_session_with_current_tenant() as db_session:
|
||||
for idx, (img_data, img_name) in enumerate(images):
|
||||
section, embedded_id = store_image_and_create_section(
|
||||
db_session=db_session,
|
||||
image_data=img_data,
|
||||
file_name=embedded_id,
|
||||
file_name=f"{file_id}_img_{idx}",
|
||||
display_name=img_name or f"{file_name} - image {idx}",
|
||||
llm=image_analysis_llm,
|
||||
file_origin=FileOrigin.CONNECTOR,
|
||||
)
|
||||
sections.append(section)
|
||||
return sections
|
||||
pdf_sections.append(section)
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to process PDF images in {file_name}: {e}")
|
||||
return pdf_sections
|
||||
|
||||
elif mime_type == GDriveMimeType.PDF.value:
|
||||
text, _pdf_meta, images = read_pdf_file(io.BytesIO(response_bytes))
|
||||
return [Section(link=link, text=text)]
|
||||
|
||||
elif mime_type == GDriveMimeType.POWERPOINT.value:
|
||||
text_data = pptx_to_text(io.BytesIO(response_bytes))
|
||||
return [Section(link=link, text=text_data)]
|
||||
|
||||
# Catch-all case, should not happen since there should be specific handling
|
||||
# for each of the supported file types
|
||||
error_message = f"Unsupported file type: {mime_type}"
|
||||
logger.error(error_message)
|
||||
raise ValueError(error_message)
|
||||
else:
|
||||
# For unsupported file types, try to extract text
|
||||
try:
|
||||
text = extract_file_text(io.BytesIO(response), file_name)
|
||||
return [TextSection(link=link, text=text)]
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to extract text from {file_name}: {e}")
|
||||
return []
|
||||
|
||||
except Exception as e:
|
||||
logger.exception(f"Error extracting sections from file: {e}")
|
||||
return [Section(link=link, text=UNSUPPORTED_FILE_TYPE_CONTENT)]
|
||||
logger.error(f"Error processing file {file_name}: {e}")
|
||||
return []
|
||||
|
||||
|
||||
def convert_drive_item_to_document(
|
||||
file: GoogleDriveFileType,
|
||||
drive_service: GoogleDriveService,
|
||||
docs_service: GoogleDocsService,
|
||||
image_analysis_llm: LLM | None,
|
||||
) -> Document | None:
|
||||
"""
|
||||
Main entry point for converting a Google Drive file => Document object.
|
||||
Now we accept an optional `llm` to pass to `_extract_sections_basic`.
|
||||
"""
|
||||
try:
|
||||
# skip shortcuts or folders
|
||||
@ -327,44 +215,50 @@ def convert_drive_item_to_document(
|
||||
return None
|
||||
|
||||
# If it's a Google Doc, we might do advanced parsing
|
||||
sections: list[Section] = []
|
||||
sections: list[TextSection | ImageSection] = []
|
||||
|
||||
# Try to get sections using the advanced method first
|
||||
if file.get("mimeType") == GDriveMimeType.DOC.value:
|
||||
try:
|
||||
# get_document_sections is the advanced approach for Google Docs
|
||||
sections = get_document_sections(docs_service, file["id"])
|
||||
doc_sections = get_document_sections(
|
||||
docs_service=docs_service, doc_id=file.get("id", "")
|
||||
)
|
||||
if doc_sections:
|
||||
sections = cast(list[TextSection | ImageSection], doc_sections)
|
||||
except Exception as e:
|
||||
logger.warning(
|
||||
f"Failed to pull google doc sections from '{file['name']}': {e}. "
|
||||
"Falling back to basic extraction."
|
||||
f"Error in advanced parsing: {e}. Falling back to basic extraction."
|
||||
)
|
||||
|
||||
# If not a doc, or if we failed above, do our 'basic' approach
|
||||
# If we don't have sections yet, use the basic extraction method
|
||||
if not sections:
|
||||
sections = _extract_sections_basic(file, drive_service, image_analysis_llm)
|
||||
sections = _extract_sections_basic(file, drive_service)
|
||||
|
||||
# If we still don't have any sections, skip this file
|
||||
if not sections:
|
||||
logger.warning(f"No content extracted from {file.get('name')}. Skipping.")
|
||||
return None
|
||||
|
||||
doc_id = file["webViewLink"]
|
||||
updated_time = datetime.fromisoformat(file["modifiedTime"]).astimezone(
|
||||
timezone.utc
|
||||
)
|
||||
|
||||
# Create the document
|
||||
return Document(
|
||||
id=doc_id,
|
||||
sections=sections,
|
||||
source=DocumentSource.GOOGLE_DRIVE,
|
||||
semantic_identifier=file["name"],
|
||||
doc_updated_at=updated_time,
|
||||
metadata={}, # or any metadata from 'file'
|
||||
additional_info=file.get("id"),
|
||||
semantic_identifier=file.get("name", ""),
|
||||
metadata={
|
||||
"owner_names": ", ".join(
|
||||
owner.get("displayName", "") for owner in file.get("owners", [])
|
||||
),
|
||||
},
|
||||
doc_updated_at=datetime.fromisoformat(
|
||||
file.get("modifiedTime", "").replace("Z", "+00:00")
|
||||
),
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.exception(f"Error converting file '{file.get('name')}' to Document: {e}")
|
||||
if not CONTINUE_ON_CONNECTOR_FAILURE:
|
||||
raise
|
||||
return None
|
||||
logger.error(f"Error converting file {file.get('name')}: {e}")
|
||||
return None
|
||||
|
||||
|
||||
def build_slim_document(file: GoogleDriveFileType) -> SlimDocument | None:
|
||||
|
@ -3,7 +3,7 @@ from typing import Any
|
||||
from pydantic import BaseModel
|
||||
|
||||
from onyx.connectors.google_utils.resources import GoogleDocsService
|
||||
from onyx.connectors.models import Section
|
||||
from onyx.connectors.models import TextSection
|
||||
|
||||
|
||||
class CurrentHeading(BaseModel):
|
||||
@ -37,7 +37,7 @@ def _extract_text_from_paragraph(paragraph: dict[str, Any]) -> str:
|
||||
def get_document_sections(
|
||||
docs_service: GoogleDocsService,
|
||||
doc_id: str,
|
||||
) -> list[Section]:
|
||||
) -> list[TextSection]:
|
||||
"""Extracts sections from a Google Doc, including their headings and content"""
|
||||
# Fetch the document structure
|
||||
doc = docs_service.documents().get(documentId=doc_id).execute()
|
||||
@ -45,7 +45,7 @@ def get_document_sections(
|
||||
# Get the content
|
||||
content = doc.get("body", {}).get("content", [])
|
||||
|
||||
sections: list[Section] = []
|
||||
sections: list[TextSection] = []
|
||||
current_section: list[str] = []
|
||||
current_heading: CurrentHeading | None = None
|
||||
|
||||
@ -70,7 +70,7 @@ def get_document_sections(
|
||||
heading_text = current_heading.text
|
||||
section_text = f"{heading_text}\n" + "\n".join(current_section)
|
||||
sections.append(
|
||||
Section(
|
||||
TextSection(
|
||||
text=section_text.strip(),
|
||||
link=_build_gdoc_section_link(doc_id, current_heading.id),
|
||||
)
|
||||
@ -96,7 +96,7 @@ def get_document_sections(
|
||||
if current_heading is not None and current_section:
|
||||
section_text = f"{current_heading.text}\n" + "\n".join(current_section)
|
||||
sections.append(
|
||||
Section(
|
||||
TextSection(
|
||||
text=section_text.strip(),
|
||||
link=_build_gdoc_section_link(doc_id, current_heading.id),
|
||||
)
|
||||
|
@ -12,7 +12,7 @@ from onyx.configs.constants import DocumentSource
|
||||
from onyx.connectors.interfaces import GenerateDocumentsOutput
|
||||
from onyx.connectors.interfaces import LoadConnector
|
||||
from onyx.connectors.models import Document
|
||||
from onyx.connectors.models import Section
|
||||
from onyx.connectors.models import TextSection
|
||||
from onyx.db.engine import get_sqlalchemy_engine
|
||||
from onyx.file_processing.extract_file_text import load_files_from_zip
|
||||
from onyx.file_processing.extract_file_text import read_text_file
|
||||
@ -118,7 +118,7 @@ class GoogleSitesConnector(LoadConnector):
|
||||
source=DocumentSource.GOOGLE_SITES,
|
||||
semantic_identifier=title,
|
||||
sections=[
|
||||
Section(
|
||||
TextSection(
|
||||
link=(self.base_url.rstrip("/") + "/" + path.lstrip("/"))
|
||||
if path
|
||||
else "",
|
||||
|
@ -15,7 +15,7 @@ from onyx.connectors.interfaces import SecondsSinceUnixEpoch
|
||||
from onyx.connectors.models import BasicExpertInfo
|
||||
from onyx.connectors.models import ConnectorMissingCredentialError
|
||||
from onyx.connectors.models import Document
|
||||
from onyx.connectors.models import Section
|
||||
from onyx.connectors.models import TextSection
|
||||
from onyx.file_processing.html_utils import parse_html_page_basic
|
||||
from onyx.utils.logger import setup_logger
|
||||
|
||||
@ -120,7 +120,7 @@ class GuruConnector(LoadConnector, PollConnector):
|
||||
doc_batch.append(
|
||||
Document(
|
||||
id=card["id"],
|
||||
sections=[Section(link=link, text=content_text)],
|
||||
sections=[TextSection(link=link, text=content_text)],
|
||||
source=DocumentSource.GURU,
|
||||
semantic_identifier=title,
|
||||
doc_updated_at=latest_time,
|
||||
|
@ -13,7 +13,7 @@ from onyx.connectors.interfaces import PollConnector
|
||||
from onyx.connectors.interfaces import SecondsSinceUnixEpoch
|
||||
from onyx.connectors.models import ConnectorMissingCredentialError
|
||||
from onyx.connectors.models import Document
|
||||
from onyx.connectors.models import Section
|
||||
from onyx.connectors.models import TextSection
|
||||
from onyx.utils.logger import setup_logger
|
||||
|
||||
HUBSPOT_BASE_URL = "https://app.hubspot.com/contacts/"
|
||||
@ -108,7 +108,7 @@ class HubSpotConnector(LoadConnector, PollConnector):
|
||||
doc_batch.append(
|
||||
Document(
|
||||
id=ticket.id,
|
||||
sections=[Section(link=link, text=content_text)],
|
||||
sections=[TextSection(link=link, text=content_text)],
|
||||
source=DocumentSource.HUBSPOT,
|
||||
semantic_identifier=title,
|
||||
# Is already in tzutc, just replacing the timezone format
|
||||
|
@ -24,6 +24,8 @@ CheckpointOutput = Generator[Document | ConnectorFailure, None, ConnectorCheckpo
|
||||
|
||||
class BaseConnector(abc.ABC):
|
||||
REDIS_KEY_PREFIX = "da_connector_data:"
|
||||
# Common image file extensions supported across connectors
|
||||
IMAGE_EXTENSIONS = {".jpg", ".jpeg", ".png", ".webp", ".gif"}
|
||||
|
||||
@abc.abstractmethod
|
||||
def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
|
||||
|
@ -21,7 +21,8 @@ from onyx.connectors.interfaces import PollConnector
|
||||
from onyx.connectors.interfaces import SecondsSinceUnixEpoch
|
||||
from onyx.connectors.models import ConnectorMissingCredentialError
|
||||
from onyx.connectors.models import Document
|
||||
from onyx.connectors.models import Section
|
||||
from onyx.connectors.models import ImageSection
|
||||
from onyx.connectors.models import TextSection
|
||||
from onyx.utils.logger import setup_logger
|
||||
from onyx.utils.retry_wrapper import request_with_retries
|
||||
|
||||
@ -237,22 +238,30 @@ class LinearConnector(LoadConnector, PollConnector, OAuthConnector):
|
||||
documents: list[Document] = []
|
||||
for edge in edges:
|
||||
node = edge["node"]
|
||||
# Create sections for description and comments
|
||||
sections = [
|
||||
TextSection(
|
||||
link=node["url"],
|
||||
text=node["description"] or "",
|
||||
)
|
||||
]
|
||||
|
||||
# Add comment sections
|
||||
for comment in node["comments"]["nodes"]:
|
||||
sections.append(
|
||||
TextSection(
|
||||
link=node["url"],
|
||||
text=comment["body"] or "",
|
||||
)
|
||||
)
|
||||
|
||||
# Cast the sections list to the expected type
|
||||
typed_sections = cast(list[TextSection | ImageSection], sections)
|
||||
|
||||
documents.append(
|
||||
Document(
|
||||
id=node["id"],
|
||||
sections=[
|
||||
Section(
|
||||
link=node["url"],
|
||||
text=node["description"] or "",
|
||||
)
|
||||
]
|
||||
+ [
|
||||
Section(
|
||||
link=node["url"],
|
||||
text=comment["body"] or "",
|
||||
)
|
||||
for comment in node["comments"]["nodes"]
|
||||
],
|
||||
sections=typed_sections,
|
||||
source=DocumentSource.LINEAR,
|
||||
semantic_identifier=f"[{node['identifier']}] {node['title']}",
|
||||
title=node["title"],
|
||||
|
@ -17,7 +17,7 @@ from onyx.connectors.interfaces import SecondsSinceUnixEpoch
|
||||
from onyx.connectors.models import BasicExpertInfo
|
||||
from onyx.connectors.models import ConnectorMissingCredentialError
|
||||
from onyx.connectors.models import Document
|
||||
from onyx.connectors.models import Section
|
||||
from onyx.connectors.models import TextSection
|
||||
from onyx.file_processing.html_utils import parse_html_page_basic
|
||||
from onyx.file_processing.html_utils import strip_excessive_newlines_and_spaces
|
||||
from onyx.utils.logger import setup_logger
|
||||
@ -162,7 +162,7 @@ class LoopioConnector(LoadConnector, PollConnector):
|
||||
doc_batch.append(
|
||||
Document(
|
||||
id=str(entry["id"]),
|
||||
sections=[Section(link=link, text=content_text)],
|
||||
sections=[TextSection(link=link, text=content_text)],
|
||||
source=DocumentSource.LOOPIO,
|
||||
semantic_identifier=questions[0],
|
||||
doc_updated_at=latest_time,
|
||||
|
@ -6,6 +6,7 @@ import tempfile
|
||||
from collections.abc import Generator
|
||||
from collections.abc import Iterator
|
||||
from typing import Any
|
||||
from typing import cast
|
||||
from typing import ClassVar
|
||||
|
||||
import pywikibot.time # type: ignore[import-untyped]
|
||||
@ -20,7 +21,8 @@ from onyx.connectors.interfaces import PollConnector
|
||||
from onyx.connectors.interfaces import SecondsSinceUnixEpoch
|
||||
from onyx.connectors.mediawiki.family import family_class_dispatch
|
||||
from onyx.connectors.models import Document
|
||||
from onyx.connectors.models import Section
|
||||
from onyx.connectors.models import ImageSection
|
||||
from onyx.connectors.models import TextSection
|
||||
from onyx.utils.logger import setup_logger
|
||||
|
||||
|
||||
@ -60,14 +62,14 @@ def get_doc_from_page(
|
||||
sections_extracted: textlib.Content = textlib.extract_sections(page_text, site)
|
||||
|
||||
sections = [
|
||||
Section(
|
||||
TextSection(
|
||||
link=f"{page.full_url()}#" + section.heading.replace(" ", "_"),
|
||||
text=section.title + section.content,
|
||||
)
|
||||
for section in sections_extracted.sections
|
||||
]
|
||||
sections.append(
|
||||
Section(
|
||||
TextSection(
|
||||
link=page.full_url(),
|
||||
text=sections_extracted.header,
|
||||
)
|
||||
@ -79,7 +81,7 @@ def get_doc_from_page(
|
||||
doc_updated_at=pywikibot_timestamp_to_utc_datetime(
|
||||
page.latest_revision.timestamp
|
||||
),
|
||||
sections=sections,
|
||||
sections=cast(list[TextSection | ImageSection], sections),
|
||||
semantic_identifier=page.title(),
|
||||
metadata={"categories": [category.title() for category in page.categories()]},
|
||||
id=f"MEDIAWIKI_{page.pageid}_{page.full_url()}",
|
||||
|
@ -28,9 +28,25 @@ class ConnectorMissingCredentialError(PermissionError):
|
||||
|
||||
|
||||
class Section(BaseModel):
"""Base section class with common attributes"""

link: str | None = None
text: str | None = None
image_file_name: str | None = None


class TextSection(Section):
"""Section containing text content"""

text: str
link: str | None = None
image_file_name: str | None = None


class ImageSection(Section):
"""Section containing an image reference"""

image_file_name: str
link: str | None = None
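# Minimal sketch (not part of the patch) of how a connector is expected to build
# sections under the split model: raw text becomes a TextSection, while a stored
# image is referenced by file name in an ImageSection and summarized later, in
# the new indexing-time image-analysis phase. The identifiers are hypothetical.
#
#   text_section = TextSection(text="Quarterly report body...", link="https://example.com/doc")
#   image_section = ImageSection(image_file_name="drive__chart_1", link="https://example.com/doc")
#   sections: list[TextSection | ImageSection] = [text_section, image_section]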
class BasicExpertInfo(BaseModel):
|
||||
@ -100,7 +116,7 @@ class DocumentBase(BaseModel):
|
||||
"""Used for Onyx ingestion api, the ID is inferred before use if not provided"""
|
||||
|
||||
id: str | None = None
|
||||
sections: list[Section]
|
||||
sections: list[TextSection | ImageSection]
|
||||
source: DocumentSource | None = None
|
||||
semantic_identifier: str # displayed in the UI as the main identifier for the doc
|
||||
metadata: dict[str, str | list[str]]
|
||||
@ -150,18 +166,10 @@ class DocumentBase(BaseModel):
|
||||
|
||||
|
||||
class Document(DocumentBase):
|
||||
id: str # This must be unique or during indexing/reindexing, chunks will be overwritten
|
||||
source: DocumentSource
|
||||
"""Used for Onyx ingestion api, the ID is required"""
|
||||
|
||||
def get_total_char_length(self) -> int:
|
||||
"""Calculate the total character length of the document including sections, metadata, and identifiers."""
|
||||
section_length = sum(len(section.text) for section in self.sections)
|
||||
identifier_length = len(self.semantic_identifier) + len(self.title or "")
|
||||
metadata_length = sum(
|
||||
len(k) + len(v) if isinstance(v, str) else len(k) + sum(len(x) for x in v)
|
||||
for k, v in self.metadata.items()
|
||||
)
|
||||
return section_length + identifier_length + metadata_length
|
||||
id: str
|
||||
source: DocumentSource
|
||||
|
||||
def to_short_descriptor(self) -> str:
|
||||
"""Used when logging the identity of a document"""
|
||||
@ -185,6 +193,32 @@ class Document(DocumentBase):
|
||||
)
|
||||
|
||||
|
||||
class IndexingDocument(Document):
|
||||
"""Document with processed sections for indexing"""
|
||||
|
||||
processed_sections: list[Section] = []
|
||||
|
||||
def get_total_char_length(self) -> int:
|
||||
"""Get the total character length of the document including processed sections"""
|
||||
title_len = len(self.title or self.semantic_identifier)
|
||||
|
||||
# Use processed_sections if available, otherwise fall back to original sections
|
||||
if self.processed_sections:
|
||||
section_len = sum(
|
||||
len(section.text) if section.text is not None else 0
|
||||
for section in self.processed_sections
|
||||
)
|
||||
else:
|
||||
section_len = sum(
|
||||
len(section.text)
|
||||
if isinstance(section, TextSection) and section.text is not None
|
||||
else 0
|
||||
for section in self.sections
|
||||
)
|
||||
|
||||
return title_len + section_len
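# Illustrative note (not part of the patch): for an IndexingDocument whose image
# sections were summarized during processing, e.g.
#   processed_sections=[Section(text="Chart of Q3 revenue by region...", image_file_name="img_1")]
# get_total_char_length() counts the summary text, whereas the raw ImageSection in
# `sections` would contribute 0 characters via the isinstance fallback above.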
class SlimDocument(BaseModel):
|
||||
id: str
|
||||
perm_sync_data: Any | None = None
|
||||
|
@ -25,7 +25,7 @@ from onyx.connectors.interfaces import PollConnector
|
||||
from onyx.connectors.interfaces import SecondsSinceUnixEpoch
|
||||
from onyx.connectors.models import ConnectorMissingCredentialError
|
||||
from onyx.connectors.models import Document
|
||||
from onyx.connectors.models import Section
|
||||
from onyx.connectors.models import TextSection
|
||||
from onyx.utils.batching import batch_generator
|
||||
from onyx.utils.logger import setup_logger
|
||||
|
||||
@ -475,7 +475,7 @@ class NotionConnector(LoadConnector, PollConnector):
|
||||
Document(
|
||||
id=page.id,
|
||||
sections=[
|
||||
Section(
|
||||
TextSection(
|
||||
link=f"{page.url}#{block.id.replace('-', '')}",
|
||||
text=block.prefix + block.text,
|
||||
)
|
||||
|
@ -23,8 +23,8 @@ from onyx.connectors.interfaces import SecondsSinceUnixEpoch
|
||||
from onyx.connectors.interfaces import SlimConnector
|
||||
from onyx.connectors.models import ConnectorMissingCredentialError
|
||||
from onyx.connectors.models import Document
|
||||
from onyx.connectors.models import Section
|
||||
from onyx.connectors.models import SlimDocument
|
||||
from onyx.connectors.models import TextSection
|
||||
from onyx.connectors.onyx_jira.utils import best_effort_basic_expert_info
|
||||
from onyx.connectors.onyx_jira.utils import best_effort_get_field_from_issue
|
||||
from onyx.connectors.onyx_jira.utils import build_jira_client
|
||||
@ -145,7 +145,7 @@ def fetch_jira_issues_batch(
|
||||
|
||||
yield Document(
|
||||
id=page_url,
|
||||
sections=[Section(link=page_url, text=ticket_content)],
|
||||
sections=[TextSection(link=page_url, text=ticket_content)],
|
||||
source=DocumentSource.JIRA,
|
||||
semantic_identifier=f"{issue.key}: {issue.fields.summary}",
|
||||
title=f"{issue.key} {issue.fields.summary}",
|
||||
|
@ -16,7 +16,7 @@ from onyx.connectors.interfaces import PollConnector
|
||||
from onyx.connectors.interfaces import SecondsSinceUnixEpoch
|
||||
from onyx.connectors.models import BasicExpertInfo
|
||||
from onyx.connectors.models import Document
|
||||
from onyx.connectors.models import Section
|
||||
from onyx.connectors.models import TextSection
|
||||
from onyx.utils.logger import setup_logger
|
||||
|
||||
|
||||
@ -110,7 +110,7 @@ class ProductboardConnector(PollConnector):
|
||||
yield Document(
|
||||
id=feature["id"],
|
||||
sections=[
|
||||
Section(
|
||||
TextSection(
|
||||
link=feature["links"]["html"],
|
||||
text=self._parse_description_html(feature["description"]),
|
||||
)
|
||||
@ -133,7 +133,7 @@ class ProductboardConnector(PollConnector):
|
||||
yield Document(
|
||||
id=component["id"],
|
||||
sections=[
|
||||
Section(
|
||||
TextSection(
|
||||
link=component["links"]["html"],
|
||||
text=self._parse_description_html(component["description"]),
|
||||
)
|
||||
@ -159,7 +159,7 @@ class ProductboardConnector(PollConnector):
|
||||
yield Document(
|
||||
id=product["id"],
|
||||
sections=[
|
||||
Section(
|
||||
TextSection(
|
||||
link=product["links"]["html"],
|
||||
text=self._parse_description_html(product["description"]),
|
||||
)
|
||||
@ -189,7 +189,7 @@ class ProductboardConnector(PollConnector):
|
||||
yield Document(
|
||||
id=objective["id"],
|
||||
sections=[
|
||||
Section(
|
||||
TextSection(
|
||||
link=objective["links"]["html"],
|
||||
text=self._parse_description_html(objective["description"]),
|
||||
)
|
||||
|
@ -13,6 +13,7 @@ from onyx.connectors.interfaces import SlimConnector
|
||||
from onyx.connectors.models import ConnectorMissingCredentialError
|
||||
from onyx.connectors.models import Document
|
||||
from onyx.connectors.models import SlimDocument
|
||||
from onyx.connectors.models import TextSection
|
||||
from onyx.connectors.salesforce.doc_conversion import convert_sf_object_to_doc
|
||||
from onyx.connectors.salesforce.doc_conversion import ID_PREFIX
|
||||
from onyx.connectors.salesforce.salesforce_calls import fetch_all_csvs_in_parallel
|
||||
@ -218,7 +219,8 @@ if __name__ == "__main__":
|
||||
for doc in doc_batch:
|
||||
section_count += len(doc.sections)
|
||||
for section in doc.sections:
|
||||
text_count += len(section.text)
|
||||
if isinstance(section, TextSection) and section.text is not None:
|
||||
text_count += len(section.text)
|
||||
end_time = time.time()
|
||||
|
||||
print(f"Doc count: {doc_count}")
|
||||
|
@ -1,10 +1,12 @@
|
||||
import re
|
||||
from typing import cast
|
||||
|
||||
from onyx.configs.constants import DocumentSource
|
||||
from onyx.connectors.cross_connector_utils.miscellaneous_utils import time_str_to_utc
|
||||
from onyx.connectors.models import BasicExpertInfo
|
||||
from onyx.connectors.models import Document
|
||||
from onyx.connectors.models import Section
|
||||
from onyx.connectors.models import ImageSection
|
||||
from onyx.connectors.models import TextSection
|
||||
from onyx.connectors.salesforce.sqlite_functions import get_child_ids
|
||||
from onyx.connectors.salesforce.sqlite_functions import get_record
|
||||
from onyx.connectors.salesforce.utils import SalesforceObject
|
||||
@ -114,8 +116,8 @@ def _extract_dict_text(raw_dict: dict) -> str:
|
||||
return natural_language_for_dict
|
||||
|
||||
|
||||
def _extract_section(salesforce_object: SalesforceObject, base_url: str) -> Section:
|
||||
return Section(
|
||||
def _extract_section(salesforce_object: SalesforceObject, base_url: str) -> TextSection:
|
||||
return TextSection(
|
||||
text=_extract_dict_text(salesforce_object.data),
|
||||
link=f"{base_url}/{salesforce_object.id}",
|
||||
)
|
||||
@ -175,7 +177,7 @@ def convert_sf_object_to_doc(
|
||||
|
||||
doc = Document(
|
||||
id=onyx_salesforce_id,
|
||||
sections=sections,
|
||||
sections=cast(list[TextSection | ImageSection], sections),
|
||||
source=DocumentSource.SALESFORCE,
|
||||
semantic_identifier=extracted_semantic_identifier,
|
||||
doc_updated_at=extracted_doc_updated_at,
|
||||
|
@ -19,7 +19,7 @@ from onyx.connectors.interfaces import SecondsSinceUnixEpoch
|
||||
from onyx.connectors.models import BasicExpertInfo
|
||||
from onyx.connectors.models import ConnectorMissingCredentialError
|
||||
from onyx.connectors.models import Document
|
||||
from onyx.connectors.models import Section
|
||||
from onyx.connectors.models import TextSection
|
||||
from onyx.file_processing.extract_file_text import extract_file_text
|
||||
from onyx.utils.logger import setup_logger
|
||||
|
||||
@ -55,7 +55,7 @@ def _convert_driveitem_to_document(
|
||||
|
||||
doc = Document(
|
||||
id=driveitem.id,
|
||||
sections=[Section(link=driveitem.web_url, text=file_text)],
|
||||
sections=[TextSection(link=driveitem.web_url, text=file_text)],
|
||||
source=DocumentSource.SHAREPOINT,
|
||||
semantic_identifier=driveitem.name,
|
||||
doc_updated_at=driveitem.last_modified_datetime.replace(tzinfo=timezone.utc),
|
||||
|
@ -19,8 +19,8 @@ from onyx.connectors.interfaces import SecondsSinceUnixEpoch
|
||||
from onyx.connectors.interfaces import SlimConnector
|
||||
from onyx.connectors.models import ConnectorMissingCredentialError
|
||||
from onyx.connectors.models import Document
|
||||
from onyx.connectors.models import Section
|
||||
from onyx.connectors.models import SlimDocument
|
||||
from onyx.connectors.models import TextSection
|
||||
from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface
|
||||
from onyx.utils.logger import setup_logger
|
||||
|
||||
@ -212,7 +212,7 @@ class SlabConnector(LoadConnector, PollConnector, SlimConnector):
|
||||
doc_batch.append(
|
||||
Document(
|
||||
id=post_id, # can't be url as this changes with the post title
|
||||
sections=[Section(link=page_url, text=content_text)],
|
||||
sections=[TextSection(link=page_url, text=content_text)],
|
||||
source=DocumentSource.SLAB,
|
||||
semantic_identifier=post["title"],
|
||||
metadata={},
|
||||
|
@ -34,8 +34,8 @@ from onyx.connectors.models import ConnectorMissingCredentialError
|
||||
from onyx.connectors.models import Document
|
||||
from onyx.connectors.models import DocumentFailure
|
||||
from onyx.connectors.models import EntityFailure
|
||||
from onyx.connectors.models import Section
|
||||
from onyx.connectors.models import SlimDocument
|
||||
from onyx.connectors.models import TextSection
|
||||
from onyx.connectors.slack.utils import expert_info_from_slack_id
|
||||
from onyx.connectors.slack.utils import get_message_link
|
||||
from onyx.connectors.slack.utils import make_paginated_slack_api_call_w_retries
|
||||
@ -211,7 +211,7 @@ def thread_to_doc(
|
||||
return Document(
|
||||
id=_build_doc_id(channel_id=channel_id, thread_ts=thread[0]["ts"]),
|
||||
sections=[
|
||||
Section(
|
||||
TextSection(
|
||||
link=get_message_link(event=m, client=client, channel_id=channel_id),
|
||||
text=slack_cleaner.index_clean(cast(str, m["text"])),
|
||||
)
|
||||
|
@ -24,7 +24,7 @@ from onyx.connectors.interfaces import SecondsSinceUnixEpoch
|
||||
from onyx.connectors.models import BasicExpertInfo
|
||||
from onyx.connectors.models import ConnectorMissingCredentialError
|
||||
from onyx.connectors.models import Document
|
||||
from onyx.connectors.models import Section
|
||||
from onyx.connectors.models import TextSection
|
||||
from onyx.file_processing.html_utils import parse_html_page_basic
|
||||
from onyx.utils.logger import setup_logger
|
||||
|
||||
@ -165,7 +165,7 @@ def _convert_thread_to_document(
|
||||
|
||||
doc = Document(
|
||||
id=post_id,
|
||||
sections=[Section(link=web_url, text=thread_text)],
|
||||
sections=[TextSection(link=web_url, text=thread_text)],
|
||||
source=DocumentSource.TEAMS,
|
||||
semantic_identifier=semantic_string,
|
||||
title="", # teams threads don't really have a "title"
|
||||
|
@ -1,46 +0,0 @@
|
||||
"""
|
||||
Mixin for connectors that need vision capabilities.
|
||||
"""
|
||||
from onyx.configs.llm_configs import get_image_extraction_and_analysis_enabled
|
||||
from onyx.llm.factory import get_default_llm_with_vision
|
||||
from onyx.llm.interfaces import LLM
|
||||
from onyx.utils.logger import setup_logger
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
|
||||
class VisionEnabledConnector:
|
||||
"""
|
||||
Mixin for connectors that need vision capabilities.
|
||||
|
||||
This mixin provides a standard way to initialize a vision-capable LLM
|
||||
for image analysis during indexing.
|
||||
|
||||
Usage:
|
||||
class MyConnector(LoadConnector, VisionEnabledConnector):
|
||||
def __init__(self, ...):
|
||||
super().__init__(...)
|
||||
self.initialize_vision_llm()
|
||||
"""
|
||||
|
||||
def initialize_vision_llm(self) -> None:
|
||||
"""
|
||||
Initialize a vision-capable LLM if enabled by configuration.
|
||||
|
||||
Sets self.image_analysis_llm to the LLM instance or None if disabled.
|
||||
"""
|
||||
self.image_analysis_llm: LLM | None = None
|
||||
|
||||
if get_image_extraction_and_analysis_enabled():
|
||||
try:
|
||||
self.image_analysis_llm = get_default_llm_with_vision()
|
||||
if self.image_analysis_llm is None:
|
||||
logger.warning(
|
||||
"No LLM with vision found; image summarization will be disabled"
|
||||
)
|
||||
except Exception as e:
|
||||
logger.warning(
|
||||
f"Failed to initialize vision LLM due to an error: {str(e)}. "
|
||||
"Image summarization will be disabled."
|
||||
)
|
||||
self.image_analysis_llm = None
|
@ -32,7 +32,7 @@ from onyx.connectors.exceptions import UnexpectedValidationError
|
||||
from onyx.connectors.interfaces import GenerateDocumentsOutput
|
||||
from onyx.connectors.interfaces import LoadConnector
|
||||
from onyx.connectors.models import Document
|
||||
from onyx.connectors.models import Section
|
||||
from onyx.connectors.models import TextSection
|
||||
from onyx.file_processing.extract_file_text import read_pdf_file
|
||||
from onyx.file_processing.html_utils import web_html_cleanup
|
||||
from onyx.utils.logger import setup_logger
|
||||
@ -341,7 +341,7 @@ class WebConnector(LoadConnector):
|
||||
doc_batch.append(
|
||||
Document(
|
||||
id=initial_url,
|
||||
sections=[Section(link=initial_url, text=page_text)],
|
||||
sections=[TextSection(link=initial_url, text=page_text)],
|
||||
source=DocumentSource.WEB,
|
||||
semantic_identifier=initial_url.split("/")[-1],
|
||||
metadata=metadata,
|
||||
@ -443,7 +443,7 @@ class WebConnector(LoadConnector):
|
||||
Document(
|
||||
id=initial_url,
|
||||
sections=[
|
||||
Section(link=initial_url, text=parsed_html.cleaned_text)
|
||||
TextSection(link=initial_url, text=parsed_html.cleaned_text)
|
||||
],
|
||||
source=DocumentSource.WEB,
|
||||
semantic_identifier=parsed_html.title or initial_url,
|
||||
|
@ -28,7 +28,7 @@ from onyx.connectors.interfaces import GenerateDocumentsOutput
|
||||
from onyx.connectors.interfaces import LoadConnector
|
||||
from onyx.connectors.models import BasicExpertInfo
|
||||
from onyx.connectors.models import Document
|
||||
from onyx.connectors.models import Section
|
||||
from onyx.connectors.models import TextSection
|
||||
from onyx.utils.logger import setup_logger
|
||||
|
||||
logger = setup_logger()
|
||||
@ -104,7 +104,7 @@ def scrape_page_posts(
|
||||
# id. We may want to de-dupe this stuff inside the indexing service.
|
||||
document = Document(
|
||||
id=f"{DocumentSource.XENFORO.value}_{title}_{page_index}_{formatted_time}",
|
||||
sections=[Section(link=url, text=post_text)],
|
||||
sections=[TextSection(link=url, text=post_text)],
|
||||
title=title,
|
||||
source=DocumentSource.XENFORO,
|
||||
semantic_identifier=title,
|
||||
|
@ -1,5 +1,6 @@
|
||||
from collections.abc import Iterator
|
||||
from typing import Any
|
||||
from typing import cast
|
||||
|
||||
import requests
|
||||
|
||||
@ -17,8 +18,8 @@ from onyx.connectors.interfaces import SecondsSinceUnixEpoch
|
||||
from onyx.connectors.interfaces import SlimConnector
|
||||
from onyx.connectors.models import BasicExpertInfo
|
||||
from onyx.connectors.models import Document
|
||||
from onyx.connectors.models import Section
|
||||
from onyx.connectors.models import SlimDocument
|
||||
from onyx.connectors.models import TextSection
|
||||
from onyx.file_processing.html_utils import parse_html_page_basic
|
||||
from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface
|
||||
from onyx.utils.retry_wrapper import retry_builder
|
||||
@ -168,8 +169,8 @@ def _article_to_document(
|
||||
return new_author_mapping, Document(
|
||||
id=f"article:{article['id']}",
|
||||
sections=[
|
||||
Section(
|
||||
link=article.get("html_url"),
|
||||
TextSection(
|
||||
link=cast(str, article.get("html_url")),
|
||||
text=parse_html_page_basic(article["body"]),
|
||||
)
|
||||
],
|
||||
@ -268,7 +269,7 @@ def _ticket_to_document(
|
||||
|
||||
return new_author_mapping, Document(
|
||||
id=f"zendesk_ticket_{ticket['id']}",
|
||||
sections=[Section(link=ticket_display_url, text=full_text)],
|
||||
sections=[TextSection(link=ticket_display_url, text=full_text)],
|
||||
source=DocumentSource.ZENDESK,
|
||||
semantic_identifier=f"Ticket #{ticket['id']}: {subject or 'No Subject'}",
|
||||
doc_updated_at=update_time,
|
||||
|
@ -20,7 +20,7 @@ from onyx.connectors.interfaces import PollConnector
|
||||
from onyx.connectors.interfaces import SecondsSinceUnixEpoch
|
||||
from onyx.connectors.models import ConnectorMissingCredentialError
|
||||
from onyx.connectors.models import Document
|
||||
from onyx.connectors.models import Section
|
||||
from onyx.connectors.models import TextSection
|
||||
from onyx.connectors.zulip.schemas import GetMessagesResponse
|
||||
from onyx.connectors.zulip.schemas import Message
|
||||
from onyx.connectors.zulip.utils import build_search_narrow
|
||||
@ -161,7 +161,7 @@ class ZulipConnector(LoadConnector, PollConnector):
|
||||
return Document(
|
||||
id=f"{message.stream_id}__{message.id}",
|
||||
sections=[
|
||||
Section(
|
||||
TextSection(
|
||||
link=self._message_to_narrow_link(message),
|
||||
text=text,
|
||||
)
|
||||
|
@ -67,6 +67,9 @@ def read_lobj(
|
||||
use_tempfile: bool = False,
|
||||
) -> IO:
|
||||
pg_conn = get_pg_conn_from_session(db_session)
|
||||
# Ensure we're using binary mode by default for large objects
|
||||
if mode is None:
|
||||
mode = "rb"
|
||||
large_object = (
|
||||
pg_conn.lobject(lobj_oid, mode=mode) if mode else pg_conn.lobject(lobj_oid)
|
||||
)
|
||||
@ -81,6 +84,7 @@ def read_lobj(
|
||||
temp_file.seek(0)
|
||||
return temp_file
|
||||
else:
|
||||
# Ensure we're getting raw bytes without text decoding
|
||||
return BytesIO(large_object.read())
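# Minimal usage sketch (not part of the patch), assuming a pgfilestore row exists
# for a stored image; read_lobj now defaults to "rb", so raw bytes round-trip intact:
#
#   with get_session_with_current_tenant() as db_session:
#       pgfilestore = get_pgfilestore_by_file_name(file_name=name, db_session=db_session)
#       image_bytes = read_lobj(pgfilestore.lobj_oid, db_session).read()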
|
||||
|
||||
|
@ -2,12 +2,9 @@ from typing import Tuple
|
||||
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from onyx.configs.app_configs import CONTINUE_ON_CONNECTOR_FAILURE
|
||||
from onyx.configs.constants import FileOrigin
|
||||
from onyx.connectors.models import Section
|
||||
from onyx.connectors.models import ImageSection
|
||||
from onyx.db.pg_file_store import save_bytes_to_pgfilestore
|
||||
from onyx.file_processing.image_summarization import summarize_image_with_error_handling
|
||||
from onyx.llm.interfaces import LLM
|
||||
from onyx.utils.logger import setup_logger
|
||||
|
||||
logger = setup_logger()
|
||||
@ -18,12 +15,12 @@ def store_image_and_create_section(
|
||||
image_data: bytes,
|
||||
file_name: str,
|
||||
display_name: str,
|
||||
media_type: str = "image/unknown",
|
||||
llm: LLM | None = None,
|
||||
link: str | None = None,
|
||||
media_type: str = "application/octet-stream",
|
||||
file_origin: FileOrigin = FileOrigin.OTHER,
|
||||
) -> Tuple[Section, str | None]:
|
||||
) -> Tuple[ImageSection, str | None]:
|
||||
"""
|
||||
Stores an image in PGFileStore and creates a Section object with optional summarization.
|
||||
Stores an image in PGFileStore and creates an ImageSection object without summarization.
|
||||
|
||||
Args:
|
||||
db_session: Database session
|
||||
@ -31,12 +28,11 @@ def store_image_and_create_section(
|
||||
file_name: Base identifier for the file
|
||||
display_name: Human-readable name for the image
|
||||
media_type: MIME type of the image
|
||||
llm: Optional LLM with vision capabilities for summarization
|
||||
file_origin: Origin of the file (e.g., CONFLUENCE, GOOGLE_DRIVE, etc.)
|
||||
|
||||
Returns:
|
||||
Tuple containing:
|
||||
- Section object with image reference and optional summary text
|
||||
- ImageSection object with image reference
|
||||
- The file_name in PGFileStore or None if storage failed
|
||||
"""
|
||||
# Storage logic
|
||||
@ -53,18 +49,10 @@ def store_image_and_create_section(
|
||||
stored_file_name = pgfilestore.file_name
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to store image: {e}")
|
||||
if not CONTINUE_ON_CONNECTOR_FAILURE:
|
||||
raise
|
||||
return Section(text=""), None
|
||||
|
||||
# Summarization logic
|
||||
summary_text = ""
|
||||
if llm:
|
||||
summary_text = (
|
||||
summarize_image_with_error_handling(llm, image_data, display_name) or ""
|
||||
)
|
||||
raise e
|
||||
|
||||
# Create an ImageSection with empty text (will be filled by LLM later in the pipeline)
|
||||
return (
|
||||
Section(text=summary_text, image_file_name=stored_file_name),
|
||||
ImageSection(image_file_name=stored_file_name, link=link),
|
||||
stored_file_name,
|
||||
)
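# Minimal usage sketch (not part of the patch) of the new connector-side flow: the
# connector only stores the raw bytes and emits an ImageSection; no LLM is involved
# here anymore, since summarization now happens during indexing. Names are hypothetical.
#
#   with get_session_with_current_tenant() as db_session:
#       section, stored_name = store_image_and_create_section(
#           db_session=db_session,
#           image_data=image_bytes,
#           file_name=f"{doc_id}_img_1",
#           display_name="diagram.png",
#           media_type="image/png",
#           file_origin=FileOrigin.CONNECTOR,
#           link=source_url,
#       )
#       sections.append(section)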
|
||||
|
@ -9,7 +9,8 @@ from onyx.configs.model_configs import DOC_EMBEDDING_CONTEXT_SIZE
|
||||
from onyx.connectors.cross_connector_utils.miscellaneous_utils import (
|
||||
get_metadata_keys_to_ignore,
|
||||
)
|
||||
from onyx.connectors.models import Document
|
||||
from onyx.connectors.models import IndexingDocument
|
||||
from onyx.connectors.models import Section
|
||||
from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface
|
||||
from onyx.indexing.models import DocAwareChunk
|
||||
from onyx.natural_language_processing.utils import BaseTokenizer
|
||||
@ -64,7 +65,7 @@ def _get_metadata_suffix_for_document_index(
|
||||
|
||||
def _combine_chunks(chunks: list[DocAwareChunk], large_chunk_id: int) -> DocAwareChunk:
|
||||
"""
|
||||
Combines multiple DocAwareChunks into one large chunk (for “multipass” mode),
|
||||
Combines multiple DocAwareChunks into one large chunk (for "multipass" mode),
|
||||
appending the content and adjusting source_links accordingly.
|
||||
"""
|
||||
merged_chunk = DocAwareChunk(
|
||||
@ -98,7 +99,7 @@ def _combine_chunks(chunks: list[DocAwareChunk], large_chunk_id: int) -> DocAwar
|
||||
|
||||
def generate_large_chunks(chunks: list[DocAwareChunk]) -> list[DocAwareChunk]:
|
||||
"""
|
||||
Generates larger “grouped” chunks by combining sets of smaller chunks.
|
||||
Generates larger "grouped" chunks by combining sets of smaller chunks.
|
||||
"""
|
||||
large_chunks = []
|
||||
for idx, i in enumerate(range(0, len(chunks), LARGE_CHUNK_RATIO)):
|
||||
@ -185,7 +186,7 @@ class Chunker:
|
||||
|
||||
def _get_mini_chunk_texts(self, chunk_text: str) -> list[str] | None:
|
||||
"""
|
||||
For “multipass” mode: additional sub-chunks (mini-chunks) for use in certain embeddings.
|
||||
For "multipass" mode: additional sub-chunks (mini-chunks) for use in certain embeddings.
|
||||
"""
|
||||
if self.mini_chunk_splitter and chunk_text.strip():
|
||||
return self.mini_chunk_splitter.split_text(chunk_text)
|
||||
@ -194,7 +195,7 @@ class Chunker:
|
||||
# ADDED: extra param image_url to store in the chunk
|
||||
def _create_chunk(
|
||||
self,
|
||||
document: Document,
|
||||
document: IndexingDocument,
|
||||
chunks_list: list[DocAwareChunk],
|
||||
text: str,
|
||||
links: dict[int, str],
|
||||
@ -225,7 +226,29 @@ class Chunker:
|
||||
|
||||
def _chunk_document(
|
||||
self,
|
||||
document: Document,
|
||||
document: IndexingDocument,
|
||||
title_prefix: str,
|
||||
metadata_suffix_semantic: str,
|
||||
metadata_suffix_keyword: str,
|
||||
content_token_limit: int,
|
||||
) -> list[DocAwareChunk]:
|
||||
"""
|
||||
Legacy method for backward compatibility.
|
||||
Calls _chunk_document_with_sections with document.processed_sections.
|
||||
"""
|
||||
return self._chunk_document_with_sections(
|
||||
document,
|
||||
document.processed_sections,
|
||||
title_prefix,
|
||||
metadata_suffix_semantic,
|
||||
metadata_suffix_keyword,
|
||||
content_token_limit,
|
||||
)
|
||||
|
||||
def _chunk_document_with_sections(
|
||||
self,
|
||||
document: IndexingDocument,
|
||||
sections: list[Section],
|
||||
title_prefix: str,
|
||||
metadata_suffix_semantic: str,
|
||||
metadata_suffix_keyword: str,
|
||||
@ -233,17 +256,16 @@ class Chunker:
|
||||
) -> list[DocAwareChunk]:
|
||||
"""
|
||||
Loops through sections of the document, converting them into one or more chunks.
|
||||
If a section has an image_link, we treat it as a dedicated chunk.
|
||||
Works with processed sections that are base Section objects.
|
||||
"""
|
||||
|
||||
chunks: list[DocAwareChunk] = []
|
||||
link_offsets: dict[int, str] = {}
|
||||
chunk_text = ""
|
||||
|
||||
for section_idx, section in enumerate(document.sections):
|
||||
section_text = clean_text(section.text)
|
||||
for section_idx, section in enumerate(sections):
|
||||
# Get section text and other attributes
|
||||
section_text = clean_text(section.text or "")
|
||||
section_link_text = section.link or ""
|
||||
# ADDED: if the Section has an image link
|
||||
image_url = section.image_file_name
|
||||
|
||||
# If there is no useful content, skip
|
||||
@ -254,7 +276,7 @@ class Chunker:
|
||||
)
|
||||
continue
|
||||
|
||||
# CASE 1: If this is an image section, force a separate chunk
|
||||
# CASE 1: If this section has an image, force a separate chunk
|
||||
if image_url:
|
||||
# First, if we have any partially built text chunk, finalize it
|
||||
if chunk_text.strip():
|
||||
@ -271,15 +293,13 @@ class Chunker:
|
||||
chunk_text = ""
|
||||
link_offsets = {}
|
||||
|
||||
# Create a chunk specifically for this image
|
||||
# (If the section has text describing the image, use that as content)
|
||||
# Create a chunk specifically for this image section
|
||||
# (Using the text summary that was generated during processing)
|
||||
self._create_chunk(
|
||||
document,
|
||||
chunks,
|
||||
section_text,
|
||||
links={0: section_link_text}
|
||||
if section_link_text
|
||||
else {}, # No text offsets needed for images
|
||||
links={0: section_link_text} if section_link_text else {},
|
||||
image_file_name=image_url,
|
||||
title_prefix=title_prefix,
|
||||
metadata_suffix_semantic=metadata_suffix_semantic,
|
||||
@ -384,7 +404,9 @@ class Chunker:
|
||||
)
|
||||
return chunks
|
||||
|
||||
def _handle_single_document(self, document: Document) -> list[DocAwareChunk]:
|
||||
def _handle_single_document(
|
||||
self, document: IndexingDocument
|
||||
) -> list[DocAwareChunk]:
|
||||
# Specifically for reproducing an issue with gmail
|
||||
if document.source == DocumentSource.GMAIL:
|
||||
logger.debug(f"Chunking {document.semantic_identifier}")
|
||||
@ -420,26 +442,31 @@ class Chunker:
|
||||
title_prefix = ""
|
||||
metadata_suffix_semantic = ""
|
||||
|
||||
# Chunk the document
|
||||
normal_chunks = self._chunk_document(
|
||||
# Use processed_sections if available (IndexingDocument), otherwise use original sections
|
||||
sections_to_chunk = document.processed_sections
|
||||
|
||||
normal_chunks = self._chunk_document_with_sections(
|
||||
document,
|
||||
sections_to_chunk,
|
||||
title_prefix,
|
||||
metadata_suffix_semantic,
|
||||
metadata_suffix_keyword,
|
||||
content_token_limit,
|
||||
)
|
||||
|
||||
# Optional “multipass” large chunk creation
|
||||
# Optional "multipass" large chunk creation
|
||||
if self.enable_multipass and self.enable_large_chunks:
|
||||
large_chunks = generate_large_chunks(normal_chunks)
|
||||
normal_chunks.extend(large_chunks)
|
||||
|
||||
return normal_chunks
|
||||
|
||||
def chunk(self, documents: list[Document]) -> list[DocAwareChunk]:
|
||||
def chunk(self, documents: list[IndexingDocument]) -> list[DocAwareChunk]:
|
||||
"""
|
||||
Takes in a list of documents and chunks them into smaller chunks for indexing
|
||||
while persisting the document metadata.
|
||||
|
||||
Works with both standard Document objects and IndexingDocument objects with processed_sections.
|
||||
"""
|
||||
final_chunks: list[DocAwareChunk] = []
|
||||
for document in documents:
|
||||
|
@ -10,13 +10,18 @@ from onyx.access.access import get_access_for_documents
|
||||
from onyx.access.models import DocumentAccess
|
||||
from onyx.configs.app_configs import MAX_DOCUMENT_CHARS
|
||||
from onyx.configs.constants import DEFAULT_BOOST
|
||||
from onyx.configs.llm_configs import get_image_extraction_and_analysis_enabled
|
||||
from onyx.connectors.cross_connector_utils.miscellaneous_utils import (
|
||||
get_experts_stores_representations,
|
||||
)
|
||||
from onyx.connectors.models import ConnectorFailure
|
||||
from onyx.connectors.models import Document
|
||||
from onyx.connectors.models import DocumentFailure
|
||||
from onyx.connectors.models import ImageSection
|
||||
from onyx.connectors.models import IndexAttemptMetadata
|
||||
from onyx.connectors.models import IndexingDocument
|
||||
from onyx.connectors.models import Section
|
||||
from onyx.connectors.models import TextSection
|
||||
from onyx.db.document import fetch_chunk_counts_for_documents
|
||||
from onyx.db.document import get_documents_by_ids
|
||||
from onyx.db.document import mark_document_as_indexed_for_cc_pair__no_commit
|
||||
@ -27,7 +32,10 @@ from onyx.db.document import update_docs_updated_at__no_commit
|
||||
from onyx.db.document import upsert_document_by_connector_credential_pair
|
||||
from onyx.db.document import upsert_documents
|
||||
from onyx.db.document_set import fetch_document_sets_for_documents
|
||||
from onyx.db.engine import get_session_with_current_tenant
|
||||
from onyx.db.models import Document as DBDocument
|
||||
from onyx.db.pg_file_store import get_pgfilestore_by_file_name
|
||||
from onyx.db.pg_file_store import read_lobj
|
||||
from onyx.db.search_settings import get_current_search_settings
|
||||
from onyx.db.tag import create_or_add_document_tag
|
||||
from onyx.db.tag import create_or_add_document_tag_list
|
||||
@ -37,6 +45,7 @@ from onyx.document_index.document_index_utils import (
|
||||
from onyx.document_index.interfaces import DocumentIndex
|
||||
from onyx.document_index.interfaces import DocumentMetadata
|
||||
from onyx.document_index.interfaces import IndexBatchParams
|
||||
from onyx.file_processing.image_summarization import summarize_image_with_error_handling
|
||||
from onyx.indexing.chunker import Chunker
|
||||
from onyx.indexing.embedder import embed_chunks_with_failure_handling
|
||||
from onyx.indexing.embedder import IndexingEmbedder
|
||||
@ -44,6 +53,7 @@ from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface
|
||||
from onyx.indexing.models import DocAwareChunk
|
||||
from onyx.indexing.models import DocMetadataAwareIndexChunk
|
||||
from onyx.indexing.vector_db_insertion import write_chunks_to_vector_db_with_backoff
|
||||
from onyx.llm.factory import get_default_llm_with_vision
|
||||
from onyx.utils.logger import setup_logger
|
||||
from onyx.utils.timing import log_function_time
|
||||
|
||||
@ -53,6 +63,7 @@ logger = setup_logger()
|
||||
class DocumentBatchPrepareContext(BaseModel):
|
||||
updatable_docs: list[Document]
|
||||
id_to_db_doc_map: dict[str, DBDocument]
|
||||
indexable_docs: list[IndexingDocument] = []
|
||||
model_config = ConfigDict(arbitrary_types_allowed=True)
|
||||
|
||||
|
||||
@ -265,7 +276,12 @@ def index_doc_batch_prepare(
|
||||
def filter_documents(document_batch: list[Document]) -> list[Document]:
|
||||
documents: list[Document] = []
|
||||
for document in document_batch:
|
||||
empty_contents = not any(section.text.strip() for section in document.sections)
|
||||
empty_contents = not any(
|
||||
isinstance(section, TextSection)
|
||||
and section.text is not None
|
||||
and section.text.strip()
|
||||
for section in document.sections
|
||||
)
|
||||
if (
|
||||
(not document.title or not document.title.strip())
|
||||
and not document.semantic_identifier.strip()
|
||||
@ -288,7 +304,12 @@ def filter_documents(document_batch: list[Document]) -> list[Document]:
|
||||
)
|
||||
continue
|
||||
|
||||
section_chars = sum(len(section.text) for section in document.sections)
|
||||
section_chars = sum(
|
||||
len(section.text)
|
||||
if isinstance(section, TextSection) and section.text is not None
|
||||
else 0
|
||||
for section in document.sections
|
||||
)
|
||||
if (
|
||||
MAX_DOCUMENT_CHARS
|
||||
and len(document.title or document.semantic_identifier) + section_chars
|
||||
@ -308,6 +329,121 @@ def filter_documents(document_batch: list[Document]) -> list[Document]:
|
||||
return documents
|
||||

def process_image_sections(documents: list[Document]) -> list[IndexingDocument]:
"""
Process all sections in documents by:
1. Converting both TextSection and ImageSection objects to base Section objects
2. Processing ImageSections to generate text summaries using a vision-capable LLM
3. Returning IndexingDocument objects with both original and processed sections

Args:
documents: List of documents with TextSection | ImageSection objects

Returns:
List of IndexingDocument objects with processed_sections as list[Section]
"""
# Check if image extraction and analysis is enabled before trying to get a vision LLM
if not get_image_extraction_and_analysis_enabled():
llm = None
else:
# Only get the vision LLM if image processing is enabled
llm = get_default_llm_with_vision()

if not llm:
logger.warning(
"No vision-capable LLM available. Image sections will not be processed."
)

# Even without LLM, we still convert to IndexingDocument with base Sections
return [
IndexingDocument(
**document.dict(),
processed_sections=[
Section(
text=section.text if isinstance(section, TextSection) else None,
link=section.link,
image_file_name=section.image_file_name
if isinstance(section, ImageSection)
else None,
)
for section in document.sections
],
)
for document in documents
]

indexed_documents: list[IndexingDocument] = []

for document in documents:
processed_sections: list[Section] = []

for section in document.sections:
# For ImageSection, process and create base Section with both text and image_file_name
if isinstance(section, ImageSection):
# Default section with image path preserved
processed_section = Section(
link=section.link,
image_file_name=section.image_file_name,
text=None,  # Will be populated if summarization succeeds
)

# Try to get image summary
try:
with get_session_with_current_tenant() as db_session:
pgfilestore = get_pgfilestore_by_file_name(
file_name=section.image_file_name, db_session=db_session
)

if not pgfilestore:
logger.warning(
f"Image file {section.image_file_name} not found in PGFileStore"
)

processed_section.text = "[Image could not be processed]"
else:
# Get the image data
image_data_io = read_lobj(
pgfilestore.lobj_oid, db_session, mode="rb"
)
pgfilestore_data = image_data_io.read()
summary = summarize_image_with_error_handling(
llm=llm,
image_data=pgfilestore_data,
context_name=pgfilestore.display_name or "Image",
)

if summary:
processed_section.text = summary
else:
processed_section.text = (
"[Image could not be summarized]"
)
except Exception as e:
logger.error(f"Error processing image section: {e}")
processed_section.text = "[Error processing image]"

processed_sections.append(processed_section)

# For TextSection, create a base Section with text and link
elif isinstance(section, TextSection):
processed_section = Section(
text=section.text, link=section.link, image_file_name=None
)
processed_sections.append(processed_section)

# If it's already a base Section (unlikely), just append it
else:
processed_sections.append(section)

# Create IndexingDocument with original sections and processed_sections
indexed_document = IndexingDocument(
**document.dict(), processed_sections=processed_sections
)
indexed_documents.append(indexed_document)

return indexed_documents
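For orientation, a usage sketch of the new phase (illustrative only: the document below is made up, and summarizing the ImageSection assumes this runs inside the application where a vision LLM, a DB session, and the referenced stored image file are actually available; otherwise the image section's text falls back to a placeholder or stays None):

from onyx.configs.constants import DocumentSource
from onyx.connectors.models import Document, ImageSection, TextSection
from onyx.indexing.indexing_pipeline import process_image_sections

doc = Document(
    id="example-doc",
    source=DocumentSource.FILE,
    semantic_identifier="Example",
    metadata={},
    sections=[
        TextSection(text="Plain text is carried over unchanged.", link="https://example.com"),
        ImageSection(image_file_name="stored/diagram.png", link="https://example.com"),
    ],
)

[indexing_doc] = process_image_sections([doc])
for section in indexing_doc.processed_sections:
    # Text sections keep their text; image sections get an LLM summary
    # (or a placeholder such as "[Image could not be processed]").
    print(section.image_file_name, section.text)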

@log_function_time(debug_only=True)
def index_doc_batch(
*,
@ -362,19 +498,23 @@ def index_doc_batch(
failures=[],
)

# Convert documents to IndexingDocument objects with processed sections
# logger.debug("Processing image sections")
ctx.indexable_docs = process_image_sections(ctx.updatable_docs)

doc_descriptors = [
{
"doc_id": doc.id,
"doc_length": doc.get_total_char_length(),
}
for doc in ctx.updatable_docs
for doc in ctx.indexable_docs
]
logger.debug(f"Starting indexing process for documents: {doc_descriptors}")

logger.debug("Starting chunking")
# NOTE: no special handling for failures here, since the chunker is not
# a common source of failure for the indexing pipeline
chunks: list[DocAwareChunk] = chunker.chunk(ctx.updatable_docs)
chunks: list[DocAwareChunk] = chunker.chunk(ctx.indexable_docs)

logger.debug("Starting embedding")
chunks_with_embeddings, embedding_failures = (
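Read together with the hunk above, the net effect on index_doc_batch is that image analysis now happens once, up front, and every later stage consumes the processed documents. A schematic paraphrase of the new ordering (not a verbatim excerpt):

# New phase: resolve ImageSections into text before anything else touches the docs.
ctx.indexable_docs = process_image_sections(ctx.updatable_docs)

# Downstream stages switch from ctx.updatable_docs to ctx.indexable_docs.
doc_descriptors = [
    {"doc_id": doc.id, "doc_length": doc.get_total_char_length()}
    for doc in ctx.indexable_docs
]
chunks = chunker.chunk(ctx.indexable_docs)  # was: chunker.chunk(ctx.updatable_docs)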
@ -15,7 +15,7 @@ from onyx.configs.model_configs import DEFAULT_DOCUMENT_ENCODER_MODEL
from onyx.connectors.models import Document
from onyx.connectors.models import IndexAttemptMetadata
from onyx.connectors.models import InputType
from onyx.connectors.models import Section
from onyx.connectors.models import TextSection
from onyx.db.connector import check_connectors_exist
from onyx.db.connector import create_connector
from onyx.db.connector_credential_pair import add_credential_to_connector
@ -55,7 +55,7 @@ def _create_indexable_chunks(
# The section is not really used past this point since we have already done the other processing
# for the chunking and embedding.
sections=[
Section(
TextSection(
text=preprocessed_doc["content"],
link=preprocessed_doc["url"],
image_file_name=None,
@ -1,4 +1,5 @@
import os
from typing import cast
from unittest.mock import MagicMock

import pytest
@ -7,7 +8,8 @@ from pydantic import BaseModel
from onyx.configs.constants import DocumentSource
from onyx.connectors.airtable.airtable_connector import AirtableConnector
from onyx.connectors.models import Document
from onyx.connectors.models import Section
from onyx.connectors.models import ImageSection
from onyx.connectors.models import TextSection

BASE_VIEW_ID = "viwVUEJjWPd8XYjh8"

@ -76,11 +78,11 @@ def create_test_document(
if not all_fields_as_metadata:
sections.extend(
[
Section(
TextSection(
text=f"Title:\n------------------------\n{title}\n------------------------",
link=f"{link_base}/{id}",
),
Section(
TextSection(
text=f"Description:\n------------------------\n{description}\n------------------------",
link=f"{link_base}/{id}",
),
@ -90,7 +92,7 @@ def create_test_document(
if attachments:
for attachment_text, attachment_link in attachments:
sections.append(
Section(
TextSection(
text=f"Attachment:\n------------------------\n{attachment_text}\n------------------------",
link=attachment_link,
),
@ -122,7 +124,7 @@ def create_test_document(

return Document(
id=f"airtable__{id}",
sections=sections,
sections=cast(list[TextSection | ImageSection], sections),
source=DocumentSource.AIRTABLE,
semantic_identifier=f"{os.environ.get('AIRTABLE_TEST_TABLE_NAME', '')}: {title}",
metadata=metadata,
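The cast when building the Document is presumably needed because list is invariant for type checkers: a list[TextSection] is not accepted where the model expects list[TextSection | ImageSection], even though every element is compatible. A minimal sketch of the pattern (names illustrative):

from typing import cast

from onyx.connectors.models import ImageSection, TextSection

sections: list[TextSection] = [
    TextSection(text="Title: Example record", link="https://example.com/rec"),
]

# mypy rejects passing list[TextSection] where list[TextSection | ImageSection]
# is expected, so the tests cast to the wider element type.
mixed: list[TextSection | ImageSection] = cast(list[TextSection | ImageSection], sections)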
@ -49,6 +49,7 @@ def test_gitbook_connector_basic(gitbook_connector: GitbookConnector) -> None:

# Content specific checks
content = section.text
assert content is not None, "Section text should not be None"

# Check for specific content elements
assert "* Fruit Shopping List:" in content
@ -82,6 +83,7 @@ def test_gitbook_connector_basic(gitbook_connector: GitbookConnector) -> None:
assert nested1.semantic_identifier == "Nested1"
assert len(nested1.sections) == 1
# extra newlines at the end, remove them to make test easier
assert nested1.sections[0].text is not None
assert nested1.sections[0].text.strip() == "nested1"
assert nested1.source == DocumentSource.GITBOOK

@ -89,6 +91,7 @@ def test_gitbook_connector_basic(gitbook_connector: GitbookConnector) -> None:
assert nested2.id.startswith("gitbook-")
assert nested2.semantic_identifier == "Nested2"
assert len(nested2.sections) == 1
assert nested2.sections[0].text is not None
assert nested2.sections[0].text.strip() == "nested2"
assert nested2.source == DocumentSource.GITBOOK
@ -1,6 +1,7 @@
from collections.abc import Sequence

from onyx.connectors.models import Document
from onyx.connectors.models import TextSection

ALL_FILES = list(range(0, 60))
SHARED_DRIVE_FILES = list(range(20, 25))
@ -177,7 +178,13 @@ def assert_retrieved_docs_match_expected(
)
valid_retrieved_texts = set(
[
" - ".join([section.text for section in doc.sections])
" - ".join(
[
section.text
for section in doc.sections
if isinstance(section, TextSection) and section.text is not None
]
)
for doc in valid_retrieved_docs
]
)
@ -97,6 +97,7 @@ def test_notion_connector_basic(notion_connector: NotionConnector) -> None:
assert child2_section.link.startswith("https://www.notion.so/")

# Database section checks for child2
assert child2_db_section.text is not None
assert child2_db_section.text.strip() != "" # Should contain some database content
assert child2_db_section.link is not None
assert child2_db_section.link.startswith("https://www.notion.so/")
@ -60,6 +60,7 @@ def verify_document_content(doc: Document, expected: ExpectedDocument) -> None:
"""Verify a document matches its expected content."""
assert doc.semantic_identifier == expected.semantic_identifier
assert len(doc.sections) == 1
assert doc.sections[0].text is not None
assert expected.content in doc.sections[0].text
verify_document_metadata(doc)
@ -63,6 +63,7 @@ def test_slab_connector_basic(slab_connector: SlabConnector) -> None:
assert len(target_test_doc.sections) == 1
section = target_test_doc.sections[0]
# Need to replace the weird apostrophe with a normal one
assert section.text is not None
assert section.text.replace("\u2019", "'") == desired_test_data["section_text"]
assert section.link == desired_test_data["link"]
@ -32,6 +32,7 @@ def test_web_connector_scroll(web_connector: WebConnector) -> None:

assert len(all_docs) == 1
doc = all_docs[0]
assert doc.sections[0].text is not None
assert EXPECTED_QUOTE in doc.sections[0].text


@ -45,4 +46,5 @@ def test_web_connector_no_scroll(web_connector: WebConnector) -> None:

assert len(all_docs) == 1
doc = all_docs[0]
assert doc.sections[0].text is not None
assert EXPECTED_QUOTE not in doc.sections[0].text
@ -6,7 +6,7 @@ from onyx.configs.constants import DocumentSource
from onyx.connectors.models import ConnectorFailure
from onyx.connectors.models import Document
from onyx.connectors.models import DocumentFailure
from onyx.connectors.models import Section
from onyx.connectors.models import TextSection


def create_test_document(
@ -28,7 +28,7 @@ def create_test_document(
doc_id = doc_id or f"test-doc-{uuid.uuid4()}"
return Document(
id=doc_id,
sections=[Section(text=text, link=link)],
sections=[TextSection(text=text, link=link)],
source=source,
semantic_identifier=doc_id,
doc_updated_at=datetime.now(timezone.utc),
@ -83,6 +83,7 @@ def test_fetch_jira_issues_batch_small_ticket(

assert len(docs) == 1
assert docs[0].id.endswith("/SMALL-1")
assert docs[0].sections[0].text is not None
assert "Small description" in docs[0].sections[0].text
assert "Small comment 1" in docs[0].sections[0].text
assert "Small comment 2" in docs[0].sections[0].text
@ -99,7 +99,8 @@ def test_get_doc_from_page(site: pywikibot.Site) -> None:
doc.sections, test_page._sections_helper + [test_page.header]
):
assert (
section.text.strip() == expected_section.strip()
section.text is not None
and section.text.strip() == expected_section.strip()
) # Extra whitespace before/after is okay
assert section.link and section.link.startswith(test_page.full_url())
assert doc.semantic_identifier == test_page.title()
@ -2,9 +2,10 @@ import pytest

from onyx.configs.constants import DocumentSource
from onyx.connectors.models import Document
from onyx.connectors.models import Section
from onyx.connectors.models import TextSection
from onyx.indexing.chunker import Chunker
from onyx.indexing.embedder import DefaultIndexingEmbedder
from onyx.indexing.indexing_pipeline import process_image_sections
from tests.unit.onyx.indexing.conftest import MockHeartbeat


@ -35,19 +36,20 @@ def test_chunk_document(embedder: DefaultIndexingEmbedder) -> None:
metadata={"tags": ["tag1", "tag2"]},
doc_updated_at=None,
sections=[
Section(text=short_section_1, link="link1"),
Section(text=short_section_2, link="link2"),
Section(text=long_section, link="link3"),
Section(text=short_section_3, link="link4"),
Section(text=short_section_4, link="link5"),
TextSection(text=short_section_1, link="link1"),
TextSection(text=short_section_2, link="link2"),
TextSection(text=long_section, link="link3"),
TextSection(text=short_section_3, link="link4"),
TextSection(text=short_section_4, link="link5"),
],
)
indexing_documents = process_image_sections([document])

chunker = Chunker(
tokenizer=embedder.embedding_model.tokenizer,
enable_multipass=False,
)
chunks = chunker.chunk([document])
chunks = chunker.chunk(indexing_documents)

assert len(chunks) == 5
assert short_section_1 in chunks[0].content
@ -67,9 +69,10 @@ def test_chunker_heartbeat(
metadata={"tags": ["tag1", "tag2"]},
doc_updated_at=None,
sections=[
Section(text="This is a short section.", link="link1"),
TextSection(text="This is a short section.", link="link1"),
],
)
indexing_documents = process_image_sections([document])

chunker = Chunker(
tokenizer=embedder.embedding_model.tokenizer,
@ -77,7 +80,7 @@ def test_chunker_heartbeat(
callback=mock_heartbeat,
)

chunks = chunker.chunk([document])
chunks = chunker.chunk(indexing_documents)

assert mock_heartbeat.call_count == 1
assert len(chunks) > 0
@ -6,7 +6,7 @@ import pytest

from onyx.configs.constants import DocumentSource
from onyx.connectors.models import Document
from onyx.connectors.models import Section
from onyx.connectors.models import TextSection
from onyx.indexing.embedder import DefaultIndexingEmbedder
from onyx.indexing.models import ChunkEmbedding
from onyx.indexing.models import DocAwareChunk
@ -45,7 +45,7 @@ def test_default_indexing_embedder_embed_chunks(mock_embedding_model: Mock) -> N
metadata={"tags": ["tag1", "tag2"]},
doc_updated_at=None,
sections=[
Section(text="This is a short section.", link="link1"),
TextSection(text="This is a short section.", link="link1"),
],
)
chunks: list[DocAwareChunk] = [
@ -1,9 +1,11 @@
from typing import cast
from typing import List

from onyx.configs.app_configs import MAX_DOCUMENT_CHARS
from onyx.connectors.models import Document
from onyx.connectors.models import DocumentSource
from onyx.connectors.models import Section
from onyx.connectors.models import ImageSection
from onyx.connectors.models import TextSection
from onyx.indexing.indexing_pipeline import filter_documents


@ -11,15 +13,15 @@ def create_test_document(
doc_id: str = "test_id",
title: str | None = "Test Title",
semantic_id: str = "test_semantic_id",
sections: List[Section] | None = None,
sections: List[TextSection] | None = None,
) -> Document:
if sections is None:
sections = [Section(text="Test content", link="test_link")]
sections = [TextSection(text="Test content", link="test_link")]
return Document(
id=doc_id,
title=title,
semantic_identifier=semantic_id,
sections=sections,
sections=cast(list[TextSection | ImageSection], sections),
source=DocumentSource.FILE,
metadata={},
)
@ -27,7 +29,7 @@ def create_test_document(

def test_filter_documents_empty_title_and_content() -> None:
doc = create_test_document(
title="", semantic_id="", sections=[Section(text="", link="test_link")]
title="", semantic_id="", sections=[TextSection(text="", link="test_link")]
)
result = filter_documents([doc])
assert len(result) == 0
@ -35,7 +37,7 @@ def test_filter_documents_empty_title_and_content() -> None:

def test_filter_documents_empty_title_with_content() -> None:
doc = create_test_document(
title="", sections=[Section(text="Valid content", link="test_link")]
title="", sections=[TextSection(text="Valid content", link="test_link")]
)
result = filter_documents([doc])
assert len(result) == 1
@ -44,7 +46,7 @@ def test_filter_documents_empty_title_with_content() -> None:

def test_filter_documents_empty_content_with_title() -> None:
doc = create_test_document(
title="Valid Title", sections=[Section(text="", link="test_link")]
title="Valid Title", sections=[TextSection(text="", link="test_link")]
)
result = filter_documents([doc])
assert len(result) == 1
@ -55,14 +57,15 @@ def test_filter_documents_exceeding_max_chars() -> None:
if not MAX_DOCUMENT_CHARS: # Skip if no max chars configured
return
long_text = "a" * (MAX_DOCUMENT_CHARS + 1)
doc = create_test_document(sections=[Section(text=long_text, link="test_link")])
doc = create_test_document(sections=[TextSection(text=long_text, link="test_link")])
result = filter_documents([doc])
assert len(result) == 0


def test_filter_documents_valid_document() -> None:
doc = create_test_document(
title="Valid Title", sections=[Section(text="Valid content", link="test_link")]
title="Valid Title",
sections=[TextSection(text="Valid content", link="test_link")],
)
result = filter_documents([doc])
assert len(result) == 1
@ -72,7 +75,9 @@ def test_filter_documents_valid_document() -> None:

def test_filter_documents_whitespace_only() -> None:
doc = create_test_document(
title=" ", semantic_id=" ", sections=[Section(text=" ", link="test_link")]
title=" ",
semantic_id=" ",
sections=[TextSection(text=" ", link="test_link")],
)
result = filter_documents([doc])
assert len(result) == 0
@ -82,7 +87,7 @@ def test_filter_documents_semantic_id_no_title() -> None:
doc = create_test_document(
title=None,
semantic_id="Valid Semantic ID",
sections=[Section(text="Valid content", link="test_link")],
sections=[TextSection(text="Valid content", link="test_link")],
)
result = filter_documents([doc])
assert len(result) == 1
@ -92,9 +97,9 @@ def test_filter_documents_semantic_id_no_title() -> None:
def test_filter_documents_multiple_sections() -> None:
doc = create_test_document(
sections=[
Section(text="Content 1", link="test_link"),
Section(text="Content 2", link="test_link"),
Section(text="Content 3", link="test_link"),
TextSection(text="Content 1", link="test_link"),
TextSection(text="Content 2", link="test_link"),
TextSection(text="Content 3", link="test_link"),
]
)
result = filter_documents([doc])
@ -106,7 +111,7 @@ def test_filter_documents_multiple_documents() -> None:
docs = [
create_test_document(doc_id="1", title="Title 1"),
create_test_document(
doc_id="2", title="", sections=[Section(text="", link="test_link")]
doc_id="2", title="", sections=[TextSection(text="", link="test_link")]
), # Should be filtered
create_test_document(doc_id="3", title="Title 3"),
]
@ -12,4 +12,4 @@
}
],
"origins": []
}
}