Notion improvement (#4306)

* Notion connector improvements

* Enable recursive index by default

* Small tweak
This commit is contained in:
Chris Weaver 2025-03-19 16:16:05 -07:00 committed by GitHub
parent 72bf427cc2
commit 5dda53eec3
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 53 additions and 37 deletions

View File

@ -347,8 +347,8 @@ HTML_BASED_CONNECTOR_TRANSFORM_LINKS_STRATEGY = os.environ.get(
HtmlBasedConnectorTransformLinksStrategy.STRIP, HtmlBasedConnectorTransformLinksStrategy.STRIP,
) )
NOTION_CONNECTOR_ENABLE_RECURSIVE_PAGE_LOOKUP = ( NOTION_CONNECTOR_DISABLE_RECURSIVE_PAGE_LOOKUP = (
os.environ.get("NOTION_CONNECTOR_ENABLE_RECURSIVE_PAGE_LOOKUP", "").lower() os.environ.get("NOTION_CONNECTOR_DISABLE_RECURSIVE_PAGE_LOOKUP", "").lower()
== "true" == "true"
) )

View File

@ -1,16 +1,16 @@
from collections.abc import Generator from collections.abc import Generator
from dataclasses import dataclass
from dataclasses import fields
from datetime import datetime from datetime import datetime
from datetime import timezone from datetime import timezone
from typing import Any from typing import Any
from typing import cast
from typing import Optional from typing import Optional
import requests import requests
from pydantic import BaseModel
from retry import retry from retry import retry
from onyx.configs.app_configs import INDEX_BATCH_SIZE from onyx.configs.app_configs import INDEX_BATCH_SIZE
from onyx.configs.app_configs import NOTION_CONNECTOR_ENABLE_RECURSIVE_PAGE_LOOKUP from onyx.configs.app_configs import NOTION_CONNECTOR_DISABLE_RECURSIVE_PAGE_LOOKUP
from onyx.configs.constants import DocumentSource from onyx.configs.constants import DocumentSource
from onyx.connectors.cross_connector_utils.rate_limit_wrapper import ( from onyx.connectors.cross_connector_utils.rate_limit_wrapper import (
rl_requests, rl_requests,
@ -25,6 +25,7 @@ from onyx.connectors.interfaces import PollConnector
from onyx.connectors.interfaces import SecondsSinceUnixEpoch from onyx.connectors.interfaces import SecondsSinceUnixEpoch
from onyx.connectors.models import ConnectorMissingCredentialError from onyx.connectors.models import ConnectorMissingCredentialError
from onyx.connectors.models import Document from onyx.connectors.models import Document
from onyx.connectors.models import ImageSection
from onyx.connectors.models import TextSection from onyx.connectors.models import TextSection
from onyx.utils.batching import batch_generator from onyx.utils.batching import batch_generator
from onyx.utils.logger import setup_logger from onyx.utils.logger import setup_logger
@ -38,8 +39,7 @@ _NOTION_CALL_TIMEOUT = 30 # 30 seconds
# TODO: Tables need to be ingested, Pages need to have their metadata ingested # TODO: Tables need to be ingested, Pages need to have their metadata ingested
@dataclass class NotionPage(BaseModel):
class NotionPage:
"""Represents a Notion Page object""" """Represents a Notion Page object"""
id: str id: str
@ -49,17 +49,10 @@ class NotionPage:
properties: dict[str, Any] properties: dict[str, Any]
url: str url: str
database_name: str | None # Only applicable to the database type page (wiki) database_name: str | None = None # Only applicable to the database type page (wiki)
def __init__(self, **kwargs: dict[str, Any]) -> None:
names = set([f.name for f in fields(self)])
for k, v in kwargs.items():
if k in names:
setattr(self, k, v)
@dataclass class NotionBlock(BaseModel):
class NotionBlock:
"""Represents a Notion Block object""" """Represents a Notion Block object"""
id: str # Used for the URL id: str # Used for the URL
@ -69,20 +62,13 @@ class NotionBlock:
prefix: str prefix: str
@dataclass class NotionSearchResponse(BaseModel):
class NotionSearchResponse:
"""Represents the response from the Notion Search API""" """Represents the response from the Notion Search API"""
results: list[dict[str, Any]] results: list[dict[str, Any]]
next_cursor: Optional[str] next_cursor: Optional[str]
has_more: bool = False has_more: bool = False
def __init__(self, **kwargs: dict[str, Any]) -> None:
names = set([f.name for f in fields(self)])
for k, v in kwargs.items():
if k in names:
setattr(self, k, v)
class NotionConnector(LoadConnector, PollConnector): class NotionConnector(LoadConnector, PollConnector):
"""Notion Page connector that reads all Notion pages """Notion Page connector that reads all Notion pages
@ -95,7 +81,7 @@ class NotionConnector(LoadConnector, PollConnector):
def __init__( def __init__(
self, self,
batch_size: int = INDEX_BATCH_SIZE, batch_size: int = INDEX_BATCH_SIZE,
recursive_index_enabled: bool = NOTION_CONNECTOR_ENABLE_RECURSIVE_PAGE_LOOKUP, recursive_index_enabled: bool = not NOTION_CONNECTOR_DISABLE_RECURSIVE_PAGE_LOOKUP,
root_page_id: str | None = None, root_page_id: str | None = None,
) -> None: ) -> None:
"""Initialize with parameters.""" """Initialize with parameters."""
@ -464,23 +450,53 @@ class NotionConnector(LoadConnector, PollConnector):
page_blocks, child_page_ids = self._read_blocks(page.id) page_blocks, child_page_ids = self._read_blocks(page.id)
all_child_page_ids.extend(child_page_ids) all_child_page_ids.extend(child_page_ids)
# safe to mark the page as indexed here: from this point on, processing
# can only fail because of a critical failure
self.indexed_pages.add(page.id)
raw_page_title = self._read_page_title(page)
page_title = raw_page_title or f"Untitled Page with ID {page.id}"
if not page_blocks: if not page_blocks:
if not raw_page_title:
logger.warning(
f"No blocks OR title found for page with ID '{page.id}'. Skipping."
)
continue continue
page_title = ( logger.debug(f"No blocks found for page with ID '{page.id}'")
self._read_page_title(page) or f"Untitled Page with ID {page.id}" """
) Something like:
yield ( TITLE
Document(
id=page.id, PROP1: PROP1_VALUE
sections=[ PROP2: PROP2_VALUE
"""
text = page_title
if page.properties:
text += "\n\n" + "\n".join(
[f"{key}: {value}" for key, value in page.properties.items()]
)
sections = [
TextSection(
link=f"{page.url}",
text=text,
)
]
else:
sections = [
TextSection( TextSection(
link=f"{page.url}#{block.id.replace('-', '')}", link=f"{page.url}#{block.id.replace('-', '')}",
text=block.prefix + block.text, text=block.prefix + block.text,
) )
for block in page_blocks for block in page_blocks
], ]
yield (
Document(
id=page.id,
sections=cast(list[TextSection | ImageSection], sections),
source=DocumentSource.NOTION, source=DocumentSource.NOTION,
semantic_identifier=page_title, semantic_identifier=page_title,
doc_updated_at=datetime.fromisoformat( doc_updated_at=datetime.fromisoformat(