mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-06-15 10:30:53 +02:00
632 lines
25 KiB
Python
632 lines
25 KiB
Python
import time
|
|
from collections.abc import Generator
|
|
from dataclasses import dataclass
|
|
from dataclasses import fields
|
|
from datetime import datetime
|
|
from datetime import timezone
|
|
from typing import Any
|
|
from typing import Optional
|
|
|
|
from retry import retry
|
|
|
|
from onyx.configs.app_configs import INDEX_BATCH_SIZE
|
|
from onyx.configs.app_configs import NOTION_CONNECTOR_ENABLE_RECURSIVE_PAGE_LOOKUP
|
|
from onyx.configs.constants import DocumentSource
|
|
from onyx.connectors.cross_connector_utils.rate_limit_wrapper import (
|
|
rl_requests,
|
|
)
|
|
from onyx.connectors.interfaces import GenerateDocumentsOutput
|
|
from onyx.connectors.interfaces import LoadConnector
|
|
from onyx.connectors.interfaces import PollConnector
|
|
from onyx.connectors.interfaces import SecondsSinceUnixEpoch
|
|
from onyx.connectors.models import Document
|
|
from onyx.connectors.models import Section
|
|
from onyx.utils.batching import batch_generator
|
|
from onyx.utils.logger import setup_logger
|
|
|
|
logger = setup_logger()
|
|
|
|
_NOTION_CALL_TIMEOUT = 30 # 30 seconds
|
|
|
|
|
|
# TODO: Tables need to be ingested, Pages need to have their metadata ingested
|
|
|
|
|
|
@dataclass
|
|
class NotionPage:
|
|
"""Represents a Notion Page object"""
|
|
|
|
id: str
|
|
created_time: str
|
|
last_edited_time: str
|
|
archived: bool
|
|
properties: dict[str, Any]
|
|
url: str
|
|
|
|
database_name: str | None # Only applicable to the database type page (wiki)
|
|
|
|
def __init__(self, **kwargs: dict[str, Any]) -> None:
|
|
names = set([f.name for f in fields(self)])
|
|
for k, v in kwargs.items():
|
|
if k in names:
|
|
setattr(self, k, v)
|
|
|
|
|
|
@dataclass
class NotionBlock:
    """A single block of content extracted from a Notion page."""

    # Block ID; used to construct the per-block anchor URL.
    id: str
    # Plaintext contents of the block.
    text: str
    # In a plaintext representation of the page, how this block should be joined
    # with the existing text up to this point, separated out from text for clarity
    prefix: str
|
|
|
|
|
|
@dataclass
class NotionSearchResponse:
    """Represents the response from the Notion Search API.

    Only the fields declared below are kept from the raw API payload; any
    extra keys in the response are silently dropped by ``__init__``.
    """

    results: list[dict[str, Any]]
    next_cursor: Optional[str]
    has_more: bool = False

    def __init__(self, **kwargs: Any) -> None:
        # Retain only declared fields so unknown response keys are ignored.
        names = {f.name for f in fields(self)}
        for k, v in kwargs.items():
            if k in names:
                setattr(self, k, v)
|
|
|
|
|
|
class NotionConnector(LoadConnector, PollConnector):
    """Notion Page connector that reads all Notion pages
    this integration has been granted access to.

    Arguments:
        batch_size (int): Number of objects to index in a batch
    """

    def __init__(
        self,
        batch_size: int = INDEX_BATCH_SIZE,
        recursive_index_enabled: bool = NOTION_CONNECTOR_ENABLE_RECURSIVE_PAGE_LOOKUP,
        root_page_id: str | None = None,
    ) -> None:
        """Initialize with parameters."""
        self.batch_size = batch_size
        self.headers = {
            "Content-Type": "application/json",
            "Notion-Version": "2022-06-28",
        }
        # Page IDs already yielded as Documents; prevents duplicate emission
        # (and duplicate fetches) during recursive traversal.
        self.indexed_pages: set[str] = set()
        self.root_page_id = root_page_id
        # if enabled, will recursively index child pages as they are found rather
        # relying entirely on the `search` API. We have received reports that the
        # `search` API misses many pages - in those cases, this might need to be
        # turned on. It's not currently known why/when this is required.
        # NOTE: this also removes all benefits polling, since we need to traverse
        # all pages regardless of if they are updated. If the notion workspace is
        # very large, this may not be practical.
        # Coerced to bool so a truthy root_page_id string never leaks into
        # this bool-typed attribute.
        self.recursive_index_enabled = bool(recursive_index_enabled or root_page_id)

    @retry(tries=3, delay=1, backoff=2)
    def _fetch_child_blocks(
        self, block_id: str, cursor: str | None = None
    ) -> dict[str, Any] | None:
        """Fetch all child blocks via the Notion API.

        Returns None when the block is inaccessible (404, typically not shared
        with the integration) or the fetch fails for an unknown reason; callers
        treat None as "skip this block".
        """
        logger.debug(f"Fetching children of block with ID '{block_id}'")
        block_url = f"https://api.notion.com/v1/blocks/{block_id}/children"
        query_params = None if not cursor else {"start_cursor": cursor}
        res = rl_requests.get(
            block_url,
            headers=self.headers,
            params=query_params,
            timeout=_NOTION_CALL_TIMEOUT,
        )
        try:
            res.raise_for_status()
        except Exception as e:
            if res.status_code == 404:
                # this happens when a page is not shared with the integration
                # in this case, we should just ignore the page
                logger.error(
                    f"Unable to access block with ID '{block_id}'. "
                    f"This is likely due to the block not being shared "
                    f"with the Onyx integration. Exact exception:\n\n{e}"
                )
            else:
                logger.exception(
                    f"Error fetching blocks with status code {res.status_code}: {res.json()}"
                )

            # This can occasionally happen, the reason is unknown and cannot be reproduced on our internal Notion
            # Assuming this will not be a critical loss of data
            return None
        return res.json()

    @retry(tries=3, delay=1, backoff=2)
    def _fetch_page(self, page_id: str) -> NotionPage:
        """Fetch a page from its ID via the Notion API, retry with database if page fetch fails."""
        logger.debug(f"Fetching page for ID '{page_id}'")
        page_url = f"https://api.notion.com/v1/pages/{page_id}"
        res = rl_requests.get(
            page_url,
            headers=self.headers,
            timeout=_NOTION_CALL_TIMEOUT,
        )
        try:
            res.raise_for_status()
        except Exception as e:
            logger.warning(
                f"Failed to fetch page, trying database for ID '{page_id}'. Exception: {e}"
            )
            # Try fetching as a database if page fetch fails, this happens if the page is set to a wiki
            # it becomes a database from the notion perspective
            return self._fetch_database_as_page(page_id)
        return NotionPage(**res.json())

    @retry(tries=3, delay=1, backoff=2)
    def _fetch_database_as_page(self, database_id: str) -> NotionPage:
        """Attempt to fetch a database as a page.

        Raises on any HTTP error (there is no page-level fallback here).
        """
        logger.debug(f"Fetching database for ID '{database_id}' as a page")
        database_url = f"https://api.notion.com/v1/databases/{database_id}"
        res = rl_requests.get(
            database_url,
            headers=self.headers,
            timeout=_NOTION_CALL_TIMEOUT,
        )
        try:
            res.raise_for_status()
        except Exception as e:
            logger.exception(f"Error fetching database as page - {res.json()}")
            raise e
        # The database "title" is a rich-text array; extract its first
        # element's plain content as the page title, if present.
        database_name = res.json().get("title")
        database_name = (
            database_name[0].get("text", {}).get("content") if database_name else None
        )

        return NotionPage(**res.json(), database_name=database_name)

    @retry(tries=3, delay=1, backoff=2)
    def _fetch_database(
        self, database_id: str, cursor: str | None = None
    ) -> dict[str, Any]:
        """Fetch a database from it's ID via the Notion API.

        Returns an empty result set when the database is not shared with the
        integration; re-raises any other HTTP error.
        """
        logger.debug(f"Fetching database for ID '{database_id}'")
        block_url = f"https://api.notion.com/v1/databases/{database_id}/query"
        body = None if not cursor else {"start_cursor": cursor}
        res = rl_requests.post(
            block_url,
            headers=self.headers,
            json=body,
            timeout=_NOTION_CALL_TIMEOUT,
        )
        try:
            res.raise_for_status()
        except Exception as e:
            if res.json().get("code") == "object_not_found":
                # this happens when a database is not shared with the integration
                # in this case, we should just ignore the database
                logger.error(
                    f"Unable to access database with ID '{database_id}'. "
                    f"This is likely due to the database not being shared "
                    f"with the Onyx integration. Exact exception:\n{e}"
                )
                return {"results": [], "next_cursor": None}
            logger.exception(f"Error fetching database - {res.json()}")
            raise e
        return res.json()

    @staticmethod
    def _properties_to_str(properties: dict[str, Any]) -> str:
        """Converts Notion properties to a string.

        Each readable property is rendered as "name: value" and the entries
        are tab-separated; unreadable/unset properties are skipped.
        """

        def _recurse_properties(inner_dict: dict[str, Any]) -> str | None:
            # Notion nests the actual value under a key named by "type";
            # unwrap until we reach the innermost payload.
            while "type" in inner_dict:
                type_name = inner_dict["type"]
                inner_dict = inner_dict[type_name]

            # If the innermost layer is None, the value is not set
            if not inner_dict:
                return None

            if isinstance(inner_dict, list):
                list_properties = [
                    _recurse_properties(item) for item in inner_dict if item
                ]
                return (
                    ", ".join(
                        [
                            list_property
                            for list_property in list_properties
                            if list_property
                        ]
                    )
                    or None
                )

            # TODO there may be more types to handle here
            if isinstance(inner_dict, str):
                # For some objects the innermost value could just be a string, not sure what causes this
                return inner_dict

            elif isinstance(inner_dict, dict):
                if "name" in inner_dict:
                    return inner_dict["name"]
                if "content" in inner_dict:
                    return inner_dict["content"]
                # Date-like properties carry "start"/"end" keys.
                start = inner_dict.get("start")
                end = inner_dict.get("end")
                if start is not None:
                    if end is not None:
                        return f"{start} - {end}"
                    return start
                elif end is not None:
                    return f"Until {end}"

                if "id" in inner_dict:
                    # This is not useful to index, it's a reference to another Notion object
                    # and this ID value in plaintext is useless outside of the Notion context
                    logger.debug("Skipping Notion object id field property")
                    return None

            logger.debug(f"Unreadable property from innermost prop: {inner_dict}")
            return None

        result = ""
        for prop_name, prop in properties.items():
            if not prop:
                continue

            try:
                inner_value = _recurse_properties(prop)
            except Exception as e:
                # This is not a critical failure, these properties are not the actual contents of the page
                # more similar to metadata
                logger.warning(f"Error recursing properties for {prop_name}: {e}")
                continue
            # Not a perfect way to format Notion database tables but there's no perfect representation
            # since this must be represented as plaintext
            if inner_value:
                result += f"{prop_name}: {inner_value}\t"

        return result

    def _read_pages_from_database(
        self, database_id: str
    ) -> tuple[list[NotionBlock], list[str]]:
        """Returns a list of top level blocks and all page IDs in the database"""
        result_blocks: list[NotionBlock] = []
        result_pages: list[str] = []
        cursor = None
        while True:
            data = self._fetch_database(database_id, cursor)

            for result in data["results"]:
                obj_id = result["id"]
                obj_type = result["object"]
                # Row properties render as the database's plaintext "table" view.
                text = self._properties_to_str(result.get("properties", {}))
                if text:
                    result_blocks.append(NotionBlock(id=obj_id, text=text, prefix="\n"))

                if self.recursive_index_enabled:
                    if obj_type == "page":
                        logger.debug(
                            f"Found page with ID '{obj_id}' in database '{database_id}'"
                        )
                        result_pages.append(result["id"])
                    elif obj_type == "database":
                        logger.debug(
                            f"Found database with ID '{obj_id}' in database '{database_id}'"
                        )
                        # The inner contents are ignored at this level
                        _, child_pages = self._read_pages_from_database(obj_id)
                        result_pages.extend(child_pages)

            if data["next_cursor"] is None:
                break

            cursor = data["next_cursor"]

        return result_blocks, result_pages

    def _read_blocks(self, base_block_id: str) -> tuple[list[NotionBlock], list[str]]:
        """Reads all child blocks for the specified block, returns a list of blocks and child page ids"""
        result_blocks: list[NotionBlock] = []
        child_pages: list[str] = []
        cursor = None
        while True:
            data = self._fetch_child_blocks(base_block_id, cursor)

            # this happens when a block is not shared with the integration
            if data is None:
                return result_blocks, child_pages

            for result in data["results"]:
                logger.debug(
                    f"Found child block for block with ID '{base_block_id}': {result}"
                )
                result_block_id = result["id"]
                result_type = result["type"]
                result_obj = result[result_type]

                if result_type == "ai_block":
                    logger.warning(
                        f"Skipping 'ai_block' ('{result_block_id}') for base block '{base_block_id}': "
                        f"Notion API does not currently support reading AI blocks (as of 24/02/09) "
                        f"(discussion: https://github.com/onyx-dot-app/onyx/issues/1053)"
                    )
                    continue

                if result_type == "unsupported":
                    logger.warning(
                        f"Skipping unsupported block type '{result_type}' "
                        f"('{result_block_id}') for base block '{base_block_id}': "
                        f"(discussion: https://github.com/onyx-dot-app/onyx/issues/1230)"
                    )
                    continue

                if result_type == "external_object_instance_page":
                    logger.warning(
                        f"Skipping 'external_object_instance_page' ('{result_block_id}') for base block '{base_block_id}': "
                        f"Notion API does not currently support reading external blocks (as of 24/07/03) "
                        f"(discussion: https://github.com/onyx-dot-app/onyx/issues/1761)"
                    )
                    continue

                cur_result_text_arr: list[str] = []
                if "rich_text" in result_obj:
                    for rich_text in result_obj["rich_text"]:
                        # skip if doesn't have text object
                        if "text" in rich_text:
                            text = rich_text["text"]["content"]
                            cur_result_text_arr.append(text)

                if result["has_children"]:
                    if result_type == "child_page":
                        # Child pages will not be included at this top level, it will be a separate document
                        child_pages.append(result_block_id)
                    else:
                        logger.debug(f"Entering sub-block: {result_block_id}")
                        subblocks, subblock_child_pages = self._read_blocks(
                            result_block_id
                        )
                        logger.debug(f"Finished sub-block: {result_block_id}")
                        result_blocks.extend(subblocks)
                        child_pages.extend(subblock_child_pages)

                if result_type == "child_database":
                    inner_blocks, inner_child_pages = self._read_pages_from_database(
                        result_block_id
                    )
                    # A database on a page often looks like a table, we need to include it for the contents
                    # of the page but the children (cells) should be processed as other Documents
                    result_blocks.extend(inner_blocks)

                    if self.recursive_index_enabled:
                        child_pages.extend(inner_child_pages)

                if cur_result_text_arr:
                    new_block = NotionBlock(
                        id=result_block_id,
                        text="\n".join(cur_result_text_arr),
                        prefix="\n",
                    )
                    result_blocks.append(new_block)

            if data["next_cursor"] is None:
                break

            cursor = data["next_cursor"]

        return result_blocks, child_pages

    def _read_page_title(self, page: NotionPage) -> str | None:
        """Extracts the title from a Notion page.

        Prefers the database name (wiki pages); otherwise joins the plain_text
        of the first non-empty "title" property. Returns None if no title.
        """
        page_title = None
        if hasattr(page, "database_name") and page.database_name:
            return page.database_name
        for _, prop in page.properties.items():
            if prop["type"] == "title" and len(prop["title"]) > 0:
                page_title = " ".join([t["plain_text"] for t in prop["title"]]).strip()
                break

        return page_title

    def _read_pages(
        self,
        pages: list[NotionPage],
    ) -> Generator[Document, None, None]:
        """Reads pages for rich text content and generates Documents

        Note that a page which is turned into a "wiki" becomes a database but both top level pages and top level databases
        do not seem to have any properties associated with them.

        Pages that are part of a database can have properties which are like the values of the row in the "database" table
        in which they exist

        This is not clearly outlined in the Notion API docs but it is observable empirically.
        https://developers.notion.com/docs/working-with-page-content
        """
        all_child_page_ids: list[str] = []
        for page in pages:
            if page.id in self.indexed_pages:
                logger.debug(f"Already indexed page with ID '{page.id}'. Skipping.")
                continue

            logger.info(f"Reading page with ID '{page.id}', with url {page.url}")
            page_blocks, child_page_ids = self._read_blocks(page.id)
            all_child_page_ids.extend(child_page_ids)

            # Pages with no readable blocks produce no Document, but their
            # children (collected above) are still traversed.
            if not page_blocks:
                continue

            page_title = (
                self._read_page_title(page) or f"Untitled Page with ID {page.id}"
            )

            yield (
                Document(
                    id=page.id,
                    sections=[
                        Section(
                            link=f"{page.url}#{block.id.replace('-', '')}",
                            text=block.prefix + block.text,
                        )
                        for block in page_blocks
                    ],
                    source=DocumentSource.NOTION,
                    semantic_identifier=page_title,
                    doc_updated_at=datetime.fromisoformat(
                        page.last_edited_time
                    ).astimezone(timezone.utc),
                    metadata={},
                )
            )
            self.indexed_pages.add(page.id)

        if self.recursive_index_enabled and all_child_page_ids:
            # NOTE: checking if page_id is in self.indexed_pages to prevent extra
            # calls to `_fetch_page` for pages we've already indexed
            for child_page_batch_ids in batch_generator(
                all_child_page_ids, batch_size=INDEX_BATCH_SIZE
            ):
                child_page_batch = [
                    self._fetch_page(page_id)
                    for page_id in child_page_batch_ids
                    if page_id not in self.indexed_pages
                ]
                yield from self._read_pages(child_page_batch)

    @retry(tries=3, delay=1, backoff=2)
    def _search_notion(self, query_dict: dict[str, Any]) -> NotionSearchResponse:
        """Search for pages from a Notion database. Includes some small number of
        retries to handle misc, flakey failures."""
        logger.debug(f"Searching for pages in Notion with query_dict: {query_dict}")
        res = rl_requests.post(
            "https://api.notion.com/v1/search",
            headers=self.headers,
            json=query_dict,
            timeout=_NOTION_CALL_TIMEOUT,
        )
        res.raise_for_status()
        return NotionSearchResponse(**res.json())

    def _filter_pages_by_time(
        self,
        pages: list[dict[str, Any]],
        start: SecondsSinceUnixEpoch,
        end: SecondsSinceUnixEpoch,
        filter_field: str = "last_edited_time",
    ) -> list[NotionPage]:
        """A helper function to filter out pages outside of a time
        range. This functionality doesn't yet exist in the Notion Search API,
        but when it does, this approach can be deprecated.

        Arguments:
            pages (list[dict]) - Pages to filter
            start (float) - start epoch time to filter from
            end (float) - end epoch time to filter to
            filter_field (str) - the attribute on the page to apply the filter
        """
        filtered_pages: list[NotionPage] = []
        for page in pages:
            # Notion timestamps are UTC (trailing "Z"); parse them as UTC.
            # time.mktime(time.strptime(...)) would interpret the parsed
            # struct_time in the host's *local* timezone and shift the window.
            compare_time = (
                datetime.strptime(page[filter_field], "%Y-%m-%dT%H:%M:%S.000Z")
                .replace(tzinfo=timezone.utc)
                .timestamp()
            )
            if start < compare_time <= end:
                filtered_pages.append(NotionPage(**page))
        return filtered_pages

    def _recursive_load(self) -> Generator[list[Document], None, None]:
        """Traverse the page tree starting from root_page_id and yield
        Document batches. Requires recursive indexing to be enabled."""
        if self.root_page_id is None or not self.recursive_index_enabled:
            raise RuntimeError(
                "Recursive page lookup is not enabled, but we are trying to "
                "recursively load pages. This should never happen."
            )

        logger.info(
            "Recursively loading pages from Notion based on root page with "
            f"ID: {self.root_page_id}"
        )
        pages = [self._fetch_page(page_id=self.root_page_id)]
        yield from batch_generator(self._read_pages(pages), self.batch_size)

    def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
        """Applies integration token to headers"""
        self.headers["Authorization"] = (
            f'Bearer {credentials["notion_integration_token"]}'
        )
        return None

    def load_from_state(self) -> GenerateDocumentsOutput:
        """Loads all page data from a Notion workspace.

        Returns:
            list[Document]: list of documents.
        """
        # TODO: remove once Notion search issue is discovered
        if self.recursive_index_enabled and self.root_page_id:
            yield from self._recursive_load()
            return

        query_dict = {
            "filter": {"property": "object", "value": "page"},
            "page_size": self.batch_size,
        }
        # Page through the search API until it reports no more results.
        while True:
            db_res = self._search_notion(query_dict)
            pages = [NotionPage(**page) for page in db_res.results]
            yield from batch_generator(self._read_pages(pages), self.batch_size)
            if db_res.has_more:
                query_dict["start_cursor"] = db_res.next_cursor
            else:
                break

    def poll_source(
        self, start: SecondsSinceUnixEpoch, end: SecondsSinceUnixEpoch
    ) -> GenerateDocumentsOutput:
        """Uses the Notion search API to fetch updated pages
        within a time period.
        Unfortunately the search API doesn't yet support filtering by times,
        so until they add that, we're just going to page through results until,
        we reach ones that are older than our search criteria.
        """
        # TODO: remove once Notion search issue is discovered
        if self.recursive_index_enabled and self.root_page_id:
            yield from self._recursive_load()
            return

        query_dict = {
            "page_size": self.batch_size,
            "sort": {"timestamp": "last_edited_time", "direction": "descending"},
            "filter": {"property": "object", "value": "page"},
        }
        while True:
            db_res = self._search_notion(query_dict)
            pages = self._filter_pages_by_time(
                db_res.results, start, end, filter_field="last_edited_time"
            )
            if len(pages) > 0:
                yield from batch_generator(self._read_pages(pages), self.batch_size)
                if db_res.has_more:
                    query_dict["start_cursor"] = db_res.next_cursor
                else:
                    break
            else:
                # Results are sorted newest-first; an empty filtered batch
                # means everything remaining is older than `start`.
                break
|
|
|
|
|
|
if __name__ == "__main__":
    import os

    # Smoke test: index starting from the configured root page (if any)
    # and print every document produced.
    connector = NotionConnector(root_page_id=os.environ.get("NOTION_ROOT_PAGE_ID"))
    connector.load_credentials(
        {"notion_integration_token": os.environ.get("NOTION_INTEGRATION_TOKEN")}
    )
    for batch in connector.load_from_state():
        for document in batch:
            print(document)
|