import time
from collections.abc import Generator
from dataclasses import dataclass
from dataclasses import fields
from datetime import datetime
from datetime import timezone
from typing import Any
from typing import Optional

from retry import retry

from onyx.configs.app_configs import INDEX_BATCH_SIZE
from onyx.configs.app_configs import NOTION_CONNECTOR_ENABLE_RECURSIVE_PAGE_LOOKUP
from onyx.configs.constants import DocumentSource
from onyx.connectors.cross_connector_utils.rate_limit_wrapper import (
    rl_requests,
)
from onyx.connectors.interfaces import GenerateDocumentsOutput
from onyx.connectors.interfaces import LoadConnector
from onyx.connectors.interfaces import PollConnector
from onyx.connectors.interfaces import SecondsSinceUnixEpoch
from onyx.connectors.models import Document
from onyx.connectors.models import Section
from onyx.utils.batching import batch_generator
from onyx.utils.logger import setup_logger

logger = setup_logger()

_NOTION_CALL_TIMEOUT = 30  # 30 seconds


# TODO: Tables need to be ingested, Pages need to have their metadata ingested


def _parse_notion_timestamp(timestamp: str) -> datetime:
    """Parse an ISO 8601 timestamp string into a timezone-aware UTC datetime.

    A trailing "Z" is rewritten to "+00:00" because `datetime.fromisoformat`
    only accepts the "Z" suffix from Python 3.11 onwards. A timestamp without
    any offset is treated as UTC (assumption: Notion reports times in UTC).
    """
    if timestamp.endswith("Z"):
        timestamp = timestamp[:-1] + "+00:00"
    parsed = datetime.fromisoformat(timestamp)
    if parsed.tzinfo is None:
        parsed = parsed.replace(tzinfo=timezone.utc)
    return parsed.astimezone(timezone.utc)


@dataclass
class NotionPage:
    """Represents a Notion Page object

    Built from a raw API response dict: keys that match a field are copied
    onto the instance, every other key is ignored.
    """

    id: str
    created_time: str
    last_edited_time: str
    archived: bool
    properties: dict[str, Any]
    url: str

    # Only applicable to the database type page (wiki). Defaults to None so the
    # attribute always exists, even for plain pages (otherwise the generated
    # __repr__/__eq__ raise AttributeError).
    database_name: str | None = None

    def __init__(self, **kwargs: Any) -> None:
        names = {f.name for f in fields(self)}
        for k, v in kwargs.items():
            if k in names:
                setattr(self, k, v)


@dataclass
class NotionBlock:
    """Represents a Notion Block object"""

    id: str  # Used for the URL
    text: str
    # In a plaintext representation of the page, how this block should be joined
    # with the existing text up to this point, separated out from text for clarity
    prefix: str


@dataclass
class NotionSearchResponse:
    """Represents the response from the Notion Search API

    Built from a raw API response dict: keys that match a field are copied
    onto the instance, every other key is ignored.
    """

    results: list[dict[str, Any]]
    next_cursor: Optional[str]
    has_more: bool = False

    def __init__(self, **kwargs: Any) -> None:
        names = {f.name for f in fields(self)}
        for k, v in kwargs.items():
            if k in names:
                setattr(self, k, v)


class NotionConnector(LoadConnector, PollConnector):
    """Notion Page connector that reads all Notion pages
    this integration has been granted access to.

    Arguments:
        batch_size (int): Number of objects to index in a batch
        recursive_index_enabled (bool): Follow child pages found while reading
            a page instead of relying only on the `search` API
        root_page_id (str | None): If set, indexing starts from this page and
            recursive indexing is forced on
    """

    def __init__(
        self,
        batch_size: int = INDEX_BATCH_SIZE,
        recursive_index_enabled: bool = NOTION_CONNECTOR_ENABLE_RECURSIVE_PAGE_LOOKUP,
        root_page_id: str | None = None,
    ) -> None:
        """Initialize with parameters."""
        self.batch_size = batch_size
        self.headers = {
            "Content-Type": "application/json",
            "Notion-Version": "2022-06-28",
        }
        # IDs of pages already turned into Documents (or found to be empty), used
        # to avoid reading the same page twice
        self.indexed_pages: set[str] = set()
        self.root_page_id = root_page_id
        # if enabled, will recursively index child pages as they are found rather
        # than relying entirely on the `search` API. We have received reports that
        # the `search` API misses many pages - in those cases, this might need to be
        # turned on. It's not currently known why/when this is required.
        # NOTE: this also removes all benefits of polling, since we need to traverse
        # all pages regardless of if they are updated. If the notion workspace is
        # very large, this may not be practical.
        # Coerced to bool so the attribute never ends up holding the page ID string.
        self.recursive_index_enabled = bool(recursive_index_enabled or root_page_id)

    @retry(tries=3, delay=1, backoff=2)
    def _fetch_child_blocks(
        self, block_id: str, cursor: str | None = None
    ) -> dict[str, Any] | None:
        """Fetch one page of child blocks for a block via the Notion API.

        Arguments:
            block_id (str): ID of the block (or page) whose children are fetched
            cursor (str | None): pagination cursor from a previous response

        Returns:
            The decoded JSON response, or None if the API answered with an HTTP
            error status (the error is logged, not raised).
        """
        logger.debug(f"Fetching children of block with ID '{block_id}'")
        block_url = f"https://api.notion.com/v1/blocks/{block_id}/children"
        query_params = None if not cursor else {"start_cursor": cursor}
        res = rl_requests.get(
            block_url,
            headers=self.headers,
            params=query_params,
            timeout=_NOTION_CALL_TIMEOUT,
        )
        try:
            res.raise_for_status()
        except Exception as e:
            if res.status_code == 404:
                # this happens when a page is not shared with the integration
                # in this case, we should just ignore the page
                logger.error(
                    f"Unable to access block with ID '{block_id}'. "
                    f"This is likely due to the block not being shared "
                    f"with the Onyx integration. Exact exception:\n\n{e}"
                )
            else:
                logger.exception(
                    f"Error fetching blocks with status code {res.status_code}: {res.json()}"
                )

            # This can occasionally happen, the reason is unknown and cannot be reproduced on our internal Notion
            # Assuming this will not be a critical loss of data
            return None
        return res.json()

    @retry(tries=3, delay=1, backoff=2)
    def _fetch_page(self, page_id: str) -> NotionPage:
        """Fetch a page from its ID via the Notion API, retry with database if page fetch fails."""
        logger.debug(f"Fetching page for ID '{page_id}'")
        page_url = f"https://api.notion.com/v1/pages/{page_id}"
        res = rl_requests.get(
            page_url,
            headers=self.headers,
            timeout=_NOTION_CALL_TIMEOUT,
        )
        try:
            res.raise_for_status()
        except Exception as e:
            logger.warning(
                f"Failed to fetch page, trying database for ID '{page_id}'. Exception: {e}"
            )
            # Try fetching as a database if page fetch fails, this happens if the page is set to a wiki
            # it becomes a database from the notion perspective
            return self._fetch_database_as_page(page_id)
        return NotionPage(**res.json())

    @retry(tries=3, delay=1, backoff=2)
    def _fetch_database_as_page(self, database_id: str) -> NotionPage:
        """Attempt to fetch a database as a page.

        The database title (first title element's text content, if any) is
        stored on the returned page as `database_name`.

        Raises:
            Whatever `raise_for_status` raises when the request fails.
        """
        logger.debug(f"Fetching database for ID '{database_id}' as a page")
        database_url = f"https://api.notion.com/v1/databases/{database_id}"
        res = rl_requests.get(
            database_url,
            headers=self.headers,
            timeout=_NOTION_CALL_TIMEOUT,
        )
        try:
            res.raise_for_status()
        except Exception:
            logger.exception(f"Error fetching database as page - {res.json()}")
            raise
        database_name = res.json().get("title")
        database_name = (
            database_name[0].get("text", {}).get("content") if database_name else None
        )

        return NotionPage(**res.json(), database_name=database_name)

    @retry(tries=3, delay=1, backoff=2)
    def _fetch_database(
        self, database_id: str, cursor: str | None = None
    ) -> dict[str, Any]:
        """Query one page of rows of a database from its ID via the Notion API.

        Returns:
            The decoded JSON response. If the API reports `object_not_found`,
            an empty result (`{"results": [], "next_cursor": None}`) is returned
            instead of raising.

        Raises:
            Whatever `raise_for_status` raises for any other failure.
        """
        logger.debug(f"Fetching database for ID '{database_id}'")
        block_url = f"https://api.notion.com/v1/databases/{database_id}/query"
        body = None if not cursor else {"start_cursor": cursor}
        res = rl_requests.post(
            block_url,
            headers=self.headers,
            json=body,
            timeout=_NOTION_CALL_TIMEOUT,
        )
        try:
            res.raise_for_status()
        except Exception as e:
            if res.json().get("code") == "object_not_found":
                # this happens when a database is not shared with the integration
                # in this case, we should just ignore the database
                logger.error(
                    f"Unable to access database with ID '{database_id}'. "
                    f"This is likely due to the database not being shared "
                    f"with the Onyx integration. Exact exception:\n{e}"
                )
                return {"results": [], "next_cursor": None}
            logger.exception(f"Error fetching database - {res.json()}")
            raise
        return res.json()

    @staticmethod
    def _properties_to_str(properties: dict[str, Any]) -> str:
        """Converts Notion properties to a string

        Each readable property is rendered as "<name>: <value>" followed by a
        tab; properties that are empty or cannot be read are skipped.
        """

        def _recurse_properties(inner_dict: dict[str, Any]) -> str | None:
            inner_value: Any = inner_dict
            # Only dicts carry a "type" discriminator. Without the isinstance
            # guard, `"type" in <str>` is a substring test (so any string value
            # containing "type" blew up on indexing) and `"type" in <number>`
            # raises TypeError.
            while isinstance(inner_value, dict) and "type" in inner_value:
                type_name = inner_value["type"]
                inner_value = inner_value[type_name]

                # If the innermost layer is None, the value is not set
                if not inner_value:
                    return None

            if isinstance(inner_value, list):
                list_properties = [
                    _recurse_properties(item) for item in inner_value if item
                ]
                return (
                    ", ".join(
                        [
                            list_property
                            for list_property in list_properties
                            if list_property
                        ]
                    )
                    or None
                )

            # TODO there may be more types to handle here
            if isinstance(inner_value, str):
                # For some objects the innermost value could just be a string, not sure what causes this
                return inner_value

            elif isinstance(inner_value, dict):
                if "name" in inner_value:
                    return inner_value["name"]
                if "content" in inner_value:
                    return inner_value["content"]
                start = inner_value.get("start")
                end = inner_value.get("end")
                if start is not None:
                    if end is not None:
                        return f"{start} - {end}"
                    return start
                elif end is not None:
                    return f"Until {end}"

                if "id" in inner_value:
                    # This is not useful to index, it's a reference to another Notion object
                    # and this ID value in plaintext is useless outside of the Notion context
                    logger.debug("Skipping Notion object id field property")
                    return None

            logger.debug(f"Unreadable property from innermost prop: {inner_value}")
            return None

        parts: list[str] = []
        for prop_name, prop in properties.items():
            if not prop:
                continue

            try:
                inner_value = _recurse_properties(prop)
            except Exception as e:
                # This is not a critical failure, these properties are not the actual contents of the page
                # more similar to metadata
                logger.warning(f"Error recursing properties for {prop_name}: {e}")
                continue
            # Not a perfect way to format Notion database tables but there's no perfect representation
            # since this must be represented as plaintext
            if inner_value:
                parts.append(f"{prop_name}: {inner_value}\t")

        return "".join(parts)

    def _read_pages_from_database(
        self, database_id: str
    ) -> tuple[list[NotionBlock], list[str]]:
        """Returns a list of top level blocks and all page IDs in the database

        Page IDs are only collected when recursive indexing is enabled.
        """
        result_blocks: list[NotionBlock] = []
        result_pages: list[str] = []
        cursor = None
        while True:
            data = self._fetch_database(database_id, cursor)

            for result in data["results"]:
                obj_id = result["id"]
                obj_type = result["object"]
                text = self._properties_to_str(result.get("properties", {}))
                if text:
                    result_blocks.append(NotionBlock(id=obj_id, text=text, prefix="\n"))

                if self.recursive_index_enabled:
                    if obj_type == "page":
                        logger.debug(
                            f"Found page with ID '{obj_id}' in database '{database_id}'"
                        )
                        result_pages.append(result["id"])
                    elif obj_type == "database":
                        logger.debug(
                            f"Found database with ID '{obj_id}' in database '{database_id}'"
                        )
                        # The inner contents are ignored at this level
                        _, child_pages = self._read_pages_from_database(obj_id)
                        result_pages.extend(child_pages)

            if data["next_cursor"] is None:
                break

            cursor = data["next_cursor"]

        return result_blocks, result_pages

    def _read_blocks(self, base_block_id: str) -> tuple[list[NotionBlock], list[str]]:
        """Reads all child blocks for the specified block, returns a list of blocks and child page ids"""
        result_blocks: list[NotionBlock] = []
        child_pages: list[str] = []
        cursor = None
        while True:
            data = self._fetch_child_blocks(base_block_id, cursor)

            # this happens when a block is not shared with the integration
            if data is None:
                return result_blocks, child_pages

            for result in data["results"]:
                logger.debug(
                    f"Found child block for block with ID '{base_block_id}': {result}"
                )
                result_block_id = result["id"]
                result_type = result["type"]
                result_obj = result[result_type]

                if result_type == "ai_block":
                    logger.warning(
                        f"Skipping 'ai_block' ('{result_block_id}') for base block '{base_block_id}': "
                        f"Notion API does not currently support reading AI blocks (as of 24/02/09) "
                        f"(discussion: https://github.com/onyx-dot-app/onyx/issues/1053)"
                    )
                    continue

                if result_type == "unsupported":
                    logger.warning(
                        f"Skipping unsupported block type '{result_type}' "
                        f"('{result_block_id}') for base block '{base_block_id}': "
                        f"(discussion: https://github.com/onyx-dot-app/onyx/issues/1230)"
                    )
                    continue

                if result_type == "external_object_instance_page":
                    logger.warning(
                        f"Skipping 'external_object_instance_page' ('{result_block_id}') for base block '{base_block_id}': "
                        f"Notion API does not currently support reading external blocks (as of 24/07/03) "
                        f"(discussion: https://github.com/onyx-dot-app/onyx/issues/1761)"
                    )
                    continue

                cur_result_text_arr = []
                if "rich_text" in result_obj:
                    for rich_text in result_obj["rich_text"]:
                        # skip if doesn't have text object
                        if "text" in rich_text:
                            text = rich_text["text"]["content"]
                            cur_result_text_arr.append(text)

                if result["has_children"]:
                    if result_type == "child_page":
                        # Child pages will not be included at this top level, it will be a separate document
                        child_pages.append(result_block_id)
                    else:
                        logger.debug(f"Entering sub-block: {result_block_id}")
                        subblocks, subblock_child_pages = self._read_blocks(
                            result_block_id
                        )
                        logger.debug(f"Finished sub-block: {result_block_id}")
                        result_blocks.extend(subblocks)
                        child_pages.extend(subblock_child_pages)

                if result_type == "child_database":
                    inner_blocks, inner_child_pages = self._read_pages_from_database(
                        result_block_id
                    )
                    # A database on a page often looks like a table, we need to include it for the contents
                    # of the page but the children (cells) should be processed as other Documents
                    result_blocks.extend(inner_blocks)

                    if self.recursive_index_enabled:
                        child_pages.extend(inner_child_pages)

                if cur_result_text_arr:
                    new_block = NotionBlock(
                        id=result_block_id,
                        text="\n".join(cur_result_text_arr),
                        prefix="\n",
                    )
                    result_blocks.append(new_block)

            if data["next_cursor"] is None:
                break

            cursor = data["next_cursor"]

        return result_blocks, child_pages

    def _read_page_title(self, page: NotionPage) -> str | None:
        """Extracts the title from a Notion page

        The database name wins when set; otherwise the plain text of the first
        non-empty "title" property is used. Returns None if neither exists.
        """
        database_name = getattr(page, "database_name", None)
        if database_name:
            return database_name
        for prop in page.properties.values():
            if prop["type"] == "title" and len(prop["title"]) > 0:
                return " ".join([t["plain_text"] for t in prop["title"]]).strip()
        return None

    def _read_pages(
        self,
        pages: list[NotionPage],
    ) -> Generator[Document, None, None]:
        """Reads pages for rich text content and generates Documents

        Note that a page which is turned into a "wiki" becomes a database but both top level pages and top level
        databases do not seem to have any properties associated with them.

        Pages that are part of a database can have properties which are like the values of the row in the "database"
        table in which they exist

        This is not clearly outlined in the Notion API docs but it is observable empirically.
        https://developers.notion.com/docs/working-with-page-content
        """
        all_child_page_ids: list[str] = []
        for page in pages:
            if page.id in self.indexed_pages:
                logger.debug(f"Already indexed page with ID '{page.id}'. Skipping.")
                continue

            logger.info(f"Reading page with ID '{page.id}', with url {page.url}")
            page_blocks, child_page_ids = self._read_blocks(page.id)
            all_child_page_ids.extend(child_page_ids)

            if not page_blocks:
                # Nothing to index, but remember the page so it is not fetched and
                # read again if it shows up later (e.g. as a child of another page)
                self.indexed_pages.add(page.id)
                continue

            page_title = (
                self._read_page_title(page) or f"Untitled Page with ID {page.id}"
            )

            yield (
                Document(
                    id=page.id,
                    sections=[
                        Section(
                            link=f"{page.url}#{block.id.replace('-', '')}",
                            text=block.prefix + block.text,
                        )
                        for block in page_blocks
                    ],
                    source=DocumentSource.NOTION,
                    semantic_identifier=page_title,
                    doc_updated_at=_parse_notion_timestamp(page.last_edited_time),
                    metadata={},
                )
            )
            self.indexed_pages.add(page.id)

        if self.recursive_index_enabled and all_child_page_ids:
            # NOTE: checking if page_id is in self.indexed_pages to prevent extra
            # calls to `_fetch_page` for pages we've already indexed
            for child_page_batch_ids in batch_generator(
                all_child_page_ids, batch_size=INDEX_BATCH_SIZE
            ):
                child_page_batch = [
                    self._fetch_page(page_id)
                    for page_id in child_page_batch_ids
                    if page_id not in self.indexed_pages
                ]
                yield from self._read_pages(child_page_batch)

    @retry(tries=3, delay=1, backoff=2)
    def _search_notion(self, query_dict: dict[str, Any]) -> NotionSearchResponse:
        """Search for pages from a Notion database. Includes some small number of
        retries to handle misc, flakey failures."""
        logger.debug(f"Searching for pages in Notion with query_dict: {query_dict}")
        res = rl_requests.post(
            "https://api.notion.com/v1/search",
            headers=self.headers,
            json=query_dict,
            timeout=_NOTION_CALL_TIMEOUT,
        )
        res.raise_for_status()
        return NotionSearchResponse(**res.json())

    def _filter_pages_by_time(
        self,
        pages: list[dict[str, Any]],
        start: SecondsSinceUnixEpoch,
        end: SecondsSinceUnixEpoch,
        filter_field: str = "last_edited_time",
    ) -> list[NotionPage]:
        """A helper function to filter out pages outside of a time range. This functionality doesn't yet exist in the
        Notion Search API, but when it does, this approach can be deprecated.

        Arguments:
            pages (list[dict]) - Pages to filter
            start (float) - start epoch time to filter from (exclusive)
            end (float) - end epoch time to filter to (inclusive)
            filter_field (str) - the attribute on the page to apply the filter

        Returns:
            The pages whose `filter_field` falls in (start, end], as NotionPage objects
        """
        filtered_pages: list[NotionPage] = []
        for page in pages:
            # The timestamp is parsed as timezone-aware UTC. The previous
            # time.mktime(time.strptime(...)) interpreted it as *local* time,
            # skewing the comparison by the host's UTC offset.
            compare_time = _parse_notion_timestamp(page[filter_field]).timestamp()
            if start < compare_time <= end:
                filtered_pages.append(NotionPage(**page))
        return filtered_pages

    def _recursive_load(self) -> Generator[list[Document], None, None]:
        """Yield batches of Documents for the root page and everything reachable from it.

        Raises:
            RuntimeError: if no root page is configured or recursive indexing is off
        """
        if self.root_page_id is None or not self.recursive_index_enabled:
            raise RuntimeError(
                "Recursive page lookup is not enabled, but we are trying to "
                "recursively load pages. This should never happen."
            )

        logger.info(
            "Recursively loading pages from Notion based on root page with "
            f"ID: {self.root_page_id}"
        )
        pages = [self._fetch_page(page_id=self.root_page_id)]
        yield from batch_generator(self._read_pages(pages), self.batch_size)

    def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
        """Applies integration token to headers"""
        self.headers[
            "Authorization"
        ] = f'Bearer {credentials["notion_integration_token"]}'
        return None

    def load_from_state(self) -> GenerateDocumentsOutput:
        """Loads all page data from a Notion workspace.

        Returns:
            list[Document]: list of documents.
        """
        # TODO: remove once Notion search issue is discovered
        if self.recursive_index_enabled and self.root_page_id:
            yield from self._recursive_load()
            return

        query_dict: dict[str, Any] = {
            "filter": {"property": "object", "value": "page"},
            "page_size": self.batch_size,
        }
        while True:
            db_res = self._search_notion(query_dict)
            pages = [NotionPage(**page) for page in db_res.results]
            yield from batch_generator(self._read_pages(pages), self.batch_size)
            if db_res.has_more:
                query_dict["start_cursor"] = db_res.next_cursor
            else:
                break

    def poll_source(
        self, start: SecondsSinceUnixEpoch, end: SecondsSinceUnixEpoch
    ) -> GenerateDocumentsOutput:
        """Uses the Notion search API to fetch updated pages
        within a time period.
        Unfortunately the search API doesn't yet support filtering by times,
        so until they add that, we're just going to page through results until,
        we reach ones that are older than our search criteria.
        """
        # TODO: remove once Notion search issue is discovered
        if self.recursive_index_enabled and self.root_page_id:
            yield from self._recursive_load()
            return

        query_dict: dict[str, Any] = {
            "page_size": self.batch_size,
            "sort": {"timestamp": "last_edited_time", "direction": "descending"},
            "filter": {"property": "object", "value": "page"},
        }
        while True:
            db_res = self._search_notion(query_dict)
            pages = self._filter_pages_by_time(
                db_res.results, start, end, filter_field="last_edited_time"
            )
            if pages:
                yield from batch_generator(self._read_pages(pages), self.batch_size)
            elif not any(
                _parse_notion_timestamp(page["last_edited_time"]).timestamp() > end
                for page in db_res.results
            ):
                # Results are requested newest-first. A batch with nothing in range
                # and nothing newer than `end` means everything left is too old.
                # A batch made up only of pages newer than `end` must NOT stop the
                # scan: in-range pages may still follow on later batches.
                break

            if not db_res.has_more:
                break
            query_dict["start_cursor"] = db_res.next_cursor


if __name__ == "__main__":
    import os

    root_page_id = os.environ.get("NOTION_ROOT_PAGE_ID")
    connector = NotionConnector(root_page_id=root_page_id)
    connector.load_credentials(
        {"notion_integration_token": os.environ.get("NOTION_INTEGRATION_TOKEN")}
    )
    document_batches = connector.load_from_state()
    for doc_batch in document_batches:
        for doc in doc_batch:
            print(doc)