Add more logging for notion connector + add retries

This commit is contained in:
Weves
2023-09-06 11:32:34 -07:00
committed by Chris Weaver
parent 6a79ddce37
commit 78e1806688

View File

@@ -7,6 +7,7 @@ from typing import List
from typing import Optional from typing import Optional
import requests import requests
from retry import retry
from danswer.configs.app_configs import INDEX_BATCH_SIZE from danswer.configs.app_configs import INDEX_BATCH_SIZE
from danswer.configs.constants import DocumentSource from danswer.configs.constants import DocumentSource
@@ -16,6 +17,9 @@ from danswer.connectors.interfaces import PollConnector
from danswer.connectors.interfaces import SecondsSinceUnixEpoch from danswer.connectors.interfaces import SecondsSinceUnixEpoch
from danswer.connectors.models import Document from danswer.connectors.models import Document
from danswer.connectors.models import Section from danswer.connectors.models import Section
from danswer.utils.logger import setup_logger
logger = setup_logger()
@dataclass @dataclass
@@ -68,19 +72,23 @@ class NotionConnector(LoadConnector, PollConnector):
"Notion-Version": "2022-06-28", "Notion-Version": "2022-06-28",
} }
@retry(tries=3, delay=1, backoff=2)
def _fetch_block(self, block_id: str, timeout: float = 30.0) -> dict[str, Any]:
    """Fetch the children of a block via the Notion API.

    Issues a GET against ``/v1/blocks/{block_id}/children`` using
    ``self.headers`` and returns the decoded JSON response body.

    The ``@retry`` decorator re-invokes this method when it raises, so
    failures such as timeouts or error status codes are attempted again
    before the exception reaches the caller.

    Args:
        block_id: ID of the block whose children should be fetched.
        timeout: Seconds to wait for the server before giving up on the
            request. Without a timeout, a stalled connection would block
            indefinitely and the retry logic would never get a chance to run.

    Returns:
        The parsed JSON payload of the response.

    Raises:
        requests.HTTPError: If the response has an error status code
            (after the retries are exhausted).
        requests.Timeout: If the server does not respond within ``timeout``
            seconds (after the retries are exhausted).
    """
    logger.debug(f"Fetching block with ID '{block_id}'")
    block_url = f"https://api.notion.com/v1/blocks/{block_id}/children"
    query_dict: Dict[str, Any] = {}
    res = requests.get(
        block_url,
        headers=self.headers,
        json=query_dict,
        # Bound the wait so a hung connection surfaces as an exception that
        # the retry decorator can act on instead of hanging the indexing run.
        timeout=timeout,
    )
    # Turn 4xx/5xx responses into exceptions rather than parsing an error body.
    res.raise_for_status()
    return res.json()
def _read_blocks(self, block_id: str, num_tabs: int = 0) -> str: def _read_blocks(self, block_id: str, num_tabs: int = 0) -> str:
"""Reads blocks for a page""" """Reads blocks for a page"""
done = False done = False
result_lines_arr = [] result_lines_arr = []
cur_block_id = block_id cur_block_id = block_id
while not done: while not done:
block_url = f"https://api.notion.com/v1/blocks/{cur_block_id}/children" data = self._fetch_block(cur_block_id)
query_dict: Dict[str, Any] = {}
res = requests.request(
"GET", block_url, headers=self.headers, json=query_dict
)
data = res.json()
for result in data["results"]: for result in data["results"]:
result_type = result["type"] result_type = result["type"]
@@ -130,6 +138,7 @@ class NotionConnector(LoadConnector, PollConnector):
"""Reads pages for rich text content and generates Documents""" """Reads pages for rich text content and generates Documents"""
docs_batch = [] docs_batch = []
for page in pages: for page in pages:
logger.info(f"Reading page with ID '{page.id}', with url {page.url}")
page_text = self._read_blocks(page.id) page_text = self._read_blocks(page.id)
page_title = self._read_page_title(page) page_title = self._read_page_title(page)
docs_batch.append( docs_batch.append(
@@ -143,8 +152,11 @@ class NotionConnector(LoadConnector, PollConnector):
) )
return docs_batch return docs_batch
@retry(tries=3, delay=1, backoff=2)
def _search_notion(self, query_dict: Dict[str, Any]) -> NotionSearchResponse: def _search_notion(self, query_dict: Dict[str, Any]) -> NotionSearchResponse:
"""Search for pages from a Notion database.""" """Search for pages from a Notion database. Includes some small number of
retries to handle misc, flakey failures."""
logger.debug(f"Searching for pages in Notion with query_dict: {query_dict}")
res = requests.post( res = requests.post(
"https://api.notion.com/v1/search", "https://api.notion.com/v1/search",
headers=self.headers, headers=self.headers,