Mirror of https://github.com/danswer-ai/danswer.git (synced 2025-06-11 08:30:51 +02:00)

Commit 4c263b7130: Notion connector backend
Parent: 3b1a8274a9
@@ -27,3 +27,4 @@ class DocumentSource(str, Enum):
     SLAB = "slab"
     JIRA = "jira"
     FILE = "file"
+    NOTION = "notion"
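Because DocumentSource subclasses both str and Enum, the new member compares equal to its raw string value, which is what lets it round-trip through configs and API payloads. A quick illustrative check (not part of the commit):

    >>> from danswer.configs.constants import DocumentSource
    >>> DocumentSource.NOTION == "notion"
    True
    >>> DocumentSource("notion") is DocumentSource.NOTION
    True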
@@ -8,6 +8,7 @@ from danswer.connectors.danswer_jira.connector import JiraConnector
 from danswer.connectors.file.connector import LocalFileConnector
 from danswer.connectors.github.connector import GithubConnector
 from danswer.connectors.google_drive.connector import GoogleDriveConnector
+from danswer.connectors.notion.connector import NotionConnector
 from danswer.connectors.interfaces import BaseConnector
 from danswer.connectors.interfaces import EventConnector
 from danswer.connectors.interfaces import LoadConnector
@@ -42,6 +43,7 @@ def identify_connector_class(
         DocumentSource.CONFLUENCE: ConfluenceConnector,
         DocumentSource.JIRA: JiraConnector,
         DocumentSource.SLAB: SlabConnector,
+        DocumentSource.NOTION: NotionConnector,
     }
     connector_by_source = connector_map.get(source, {})
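For reference, a minimal sketch of how this registry is presumably consulted (the real identify_connector_class has more logic, e.g. the `.get(source, {})` default suggests per-input-type sub-maps, so treat this as illustrative only):

    from danswer.configs.constants import DocumentSource
    from danswer.connectors.notion.connector import NotionConnector

    connector_map = {
        DocumentSource.NOTION: NotionConnector,
    }

    connector_cls = connector_map.get(DocumentSource.NOTION)
    assert connector_cls is NotionConnector
    connector = connector_cls()  # falls back to INDEX_BATCH_SIZE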
0    backend/danswer/connectors/notion/__init__.py   (new file)
214  backend/danswer/connectors/notion/connector.py  (new file)
@ -0,0 +1,214 @@
|
|||||||
|
"""Notion reader."""
|
||||||
|
import time
|
||||||
|
from dataclasses import dataclass, fields
|
||||||
|
from typing import Any, Dict, List, Optional
|
||||||
|
|
||||||
|
import requests
|
||||||
|
|
||||||
|
from danswer.configs.app_configs import INDEX_BATCH_SIZE
|
||||||
|
from danswer.configs.constants import DocumentSource
|
||||||
|
from danswer.connectors.interfaces import GenerateDocumentsOutput
|
||||||
|
from danswer.connectors.interfaces import LoadConnector
|
||||||
|
from danswer.connectors.interfaces import PollConnector
|
||||||
|
from danswer.connectors.interfaces import SecondsSinceUnixEpoch
|
||||||
|
from danswer.connectors.models import Document
|
||||||
|
from danswer.connectors.models import Section
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class NotionPage:
|
||||||
|
id: str
|
||||||
|
created_time: str
|
||||||
|
last_edited_time: str
|
||||||
|
archived: bool
|
||||||
|
properties: Dict[str, Any]
|
||||||
|
url: str
|
||||||
|
|
||||||
|
def __init__(self, **kwargs):
|
||||||
|
names = set([f.name for f in fields(self)])
|
||||||
|
for k, v in kwargs.items():
|
||||||
|
if k in names:
|
||||||
|
setattr(self, k, v)
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class NotionSearchResponse:
|
||||||
|
results: List[Dict[str, Any]]
|
||||||
|
next_cursor: Optional[str]
|
||||||
|
has_more: bool = False
|
||||||
|
|
||||||
|
def __init__(self, **kwargs):
|
||||||
|
names = set([f.name for f in fields(self)])
|
||||||
|
for k, v in kwargs.items():
|
||||||
|
if k in names:
|
||||||
|
setattr(self, k, v)
|
||||||
|
|
||||||
|
|
||||||
|
# TODO - Add the ability to optionally limit to specific Notion databases
|
||||||
|
class NotionConnector(LoadConnector, PollConnector):
|
||||||
|
"""Notion Page connector that reads all Notion pages
|
||||||
|
this integration has been granted access to.
|
||||||
|
|
||||||
|
Arguments:
|
||||||
|
batch_size (int): Number of objects to index in a batch
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, batch_size: int = INDEX_BATCH_SIZE) -> None:
|
||||||
|
"""Initialize with parameters."""
|
||||||
|
self.batch_size = batch_size
|
||||||
|
self.headers = {
|
||||||
|
"Content-Type": "application/json",
|
||||||
|
"Notion-Version": "2022-06-28",
|
||||||
|
}
|
||||||
|
|
||||||
|
def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
|
||||||
|
self.headers[
|
||||||
|
"Authorization"
|
||||||
|
] = f'Bearer {credentials["notion_integration_token"]}'
|
||||||
|
return None
|
||||||
|
|
||||||
|
def _read_blocks(self, block_id: str, num_tabs: int = 0) -> str:
|
||||||
|
"""Read a block."""
|
||||||
|
done = False
|
||||||
|
result_lines_arr = []
|
||||||
|
cur_block_id = block_id
|
||||||
|
while not done:
|
||||||
|
block_url = f"https://api.notion.com/v1/blocks/{cur_block_id}/children"
|
||||||
|
query_dict: Dict[str, Any] = {}
|
||||||
|
|
||||||
|
res = requests.request(
|
||||||
|
"GET", block_url, headers=self.headers, json=query_dict
|
||||||
|
)
|
||||||
|
data = res.json()
|
||||||
|
|
||||||
|
for result in data["results"]:
|
||||||
|
result_type = result["type"]
|
||||||
|
result_obj = result[result_type]
|
||||||
|
|
||||||
|
cur_result_text_arr = []
|
||||||
|
if "rich_text" in result_obj:
|
||||||
|
for rich_text in result_obj["rich_text"]:
|
||||||
|
# skip if doesn't have text object
|
||||||
|
if "text" in rich_text:
|
||||||
|
text = rich_text["text"]["content"]
|
||||||
|
prefix = "\t" * num_tabs
|
||||||
|
cur_result_text_arr.append(prefix + text)
|
||||||
|
|
||||||
|
result_block_id = result["id"]
|
||||||
|
has_children = result["has_children"]
|
||||||
|
if has_children:
|
||||||
|
children_text = self._read_blocks(
|
||||||
|
result_block_id, num_tabs=num_tabs + 1
|
||||||
|
)
|
||||||
|
cur_result_text_arr.append(children_text)
|
||||||
|
|
||||||
|
cur_result_text = "\n".join(cur_result_text_arr)
|
||||||
|
result_lines_arr.append(cur_result_text)
|
||||||
|
|
||||||
|
if data["next_cursor"] is None:
|
||||||
|
done = True
|
||||||
|
break
|
||||||
|
else:
|
||||||
|
cur_block_id = data["next_cursor"]
|
||||||
|
|
||||||
|
result_lines = "\n".join(result_lines_arr)
|
||||||
|
return result_lines
|
||||||
|
|
||||||
|
def _read_pages(self, pages: List[NotionPage]) -> List[Document]:
|
||||||
|
"""Read a page."""
|
||||||
|
docs_batch = []
|
||||||
|
for page in pages:
|
||||||
|
page_text = self._read_blocks(page.id)
|
||||||
|
page_title = page.properties.get("Name", None) or page.properties.get(
|
||||||
|
"title", None
|
||||||
|
)
|
||||||
|
if page_title is not None:
|
||||||
|
page_title = " ".join([t["plain_text"] for t in page_title["title"]])
|
||||||
|
else:
|
||||||
|
page_title = f"Untitled Page [{page.id}]"
|
||||||
|
docs_batch.append(
|
||||||
|
Document(
|
||||||
|
id=page.id,
|
||||||
|
sections=[Section(link=page.url, text=page_text)],
|
||||||
|
source=DocumentSource.NOTION,
|
||||||
|
semantic_identifier=page_title,
|
||||||
|
metadata={},
|
||||||
|
)
|
||||||
|
)
|
||||||
|
return docs_batch
|
||||||
|
|
||||||
|
def _search_notion(self, query_dict: Dict[str, Any]) -> NotionSearchResponse:
|
||||||
|
"""Get all the pages from a Notion database."""
|
||||||
|
res = requests.post(
|
||||||
|
"https://api.notion.com/v1/search",
|
||||||
|
headers=self.headers,
|
||||||
|
json=query_dict,
|
||||||
|
)
|
||||||
|
res.raise_for_status()
|
||||||
|
return NotionSearchResponse(**res.json())
|
||||||
|
|
||||||
|
def load_from_state(self) -> GenerateDocumentsOutput:
|
||||||
|
"""Load data from the input directory.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
page_ids (List[str]): List of page ids to load.
|
||||||
|
database_id (str): Database_id from which to load page ids.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
List[Document]: List of documents.
|
||||||
|
|
||||||
|
"""
|
||||||
|
query_dict = {
|
||||||
|
"filter": {"property": "object", "value": "page"},
|
||||||
|
"page_size": self.batch_size,
|
||||||
|
}
|
||||||
|
while True:
|
||||||
|
db_res = self._search_notion(query_dict)
|
||||||
|
pages = [NotionPage(**page) for page in db_res.results]
|
||||||
|
yield self._read_pages(pages)
|
||||||
|
if db_res.has_more:
|
||||||
|
query_dict["start_cursor"] = db_res.next_cursor
|
||||||
|
else:
|
||||||
|
break
|
||||||
|
|
||||||
|
def _filter_pages_by_time(
|
||||||
|
self,
|
||||||
|
pages: List[Dict[str, Any]],
|
||||||
|
start: SecondsSinceUnixEpoch,
|
||||||
|
end: SecondsSinceUnixEpoch,
|
||||||
|
filter_field: str = "last_edited_time",
|
||||||
|
) -> List[NotionPage]:
|
||||||
|
filtered_pages = []
|
||||||
|
for page in pages:
|
||||||
|
compare_time = time.mktime(
|
||||||
|
time.strptime(page[filter_field], "%Y-%m-%dT%H:%M:%S.000Z")
|
||||||
|
)
|
||||||
|
if compare_time <= end or compare_time > start:
|
||||||
|
filtered_pages += [NotionPage(**page)]
|
||||||
|
return filtered_pages
|
||||||
|
|
||||||
|
def poll_source(
|
||||||
|
self, start: SecondsSinceUnixEpoch, end: SecondsSinceUnixEpoch
|
||||||
|
) -> GenerateDocumentsOutput:
|
||||||
|
"""Uses the Notion search API to fetch updated pages
|
||||||
|
within a time period.
|
||||||
|
Unfortunately the search API doesn't yet support filtering by times,
|
||||||
|
so until they add that, we're just going to page through results until,
|
||||||
|
we reach ones that are older than our search criteria.
|
||||||
|
"""
|
||||||
|
query_dict = {
|
||||||
|
"page_size": self.batch_size,
|
||||||
|
"sort": {"timestamp": "last_edited_time", "direction": "descending"},
|
||||||
|
"filter": {"property": "object", "value": "page"},
|
||||||
|
}
|
||||||
|
while True:
|
||||||
|
db_res = self._search_notion(query_dict)
|
||||||
|
pages = self._filter_pages_by_time(
|
||||||
|
db_res.results, start, end, filter_field="last_edited_time"
|
||||||
|
)
|
||||||
|
if len(pages) > 0:
|
||||||
|
yield self._read_pages(pages)
|
||||||
|
if db_res.has_more:
|
||||||
|
query_dict["start_cursor"] = db_res.next_cursor
|
||||||
|
else:
|
||||||
|
break
|