diff --git a/backend/danswer/configs/constants.py b/backend/danswer/configs/constants.py index 219f6688663..65004c4ded5 100644 --- a/backend/danswer/configs/constants.py +++ b/backend/danswer/configs/constants.py @@ -27,3 +27,4 @@ class DocumentSource(str, Enum): SLAB = "slab" JIRA = "jira" FILE = "file" + NOTION = "notion" diff --git a/backend/danswer/connectors/factory.py b/backend/danswer/connectors/factory.py index 5377f6f1c9b..5381b0de1f6 100644 --- a/backend/danswer/connectors/factory.py +++ b/backend/danswer/connectors/factory.py @@ -8,6 +8,7 @@ from danswer.connectors.danswer_jira.connector import JiraConnector from danswer.connectors.file.connector import LocalFileConnector from danswer.connectors.github.connector import GithubConnector from danswer.connectors.google_drive.connector import GoogleDriveConnector +from danswer.connectors.notion.connector import NotionConnector from danswer.connectors.interfaces import BaseConnector from danswer.connectors.interfaces import EventConnector from danswer.connectors.interfaces import LoadConnector @@ -42,6 +43,7 @@ def identify_connector_class( DocumentSource.CONFLUENCE: ConfluenceConnector, DocumentSource.JIRA: JiraConnector, DocumentSource.SLAB: SlabConnector, + DocumentSource.NOTION: NotionConnector, } connector_by_source = connector_map.get(source, {}) diff --git a/backend/danswer/connectors/notion/__init__.py b/backend/danswer/connectors/notion/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/backend/danswer/connectors/notion/connector.py b/backend/danswer/connectors/notion/connector.py new file mode 100644 index 00000000000..87929027954 --- /dev/null +++ b/backend/danswer/connectors/notion/connector.py @@ -0,0 +1,229 @@ +"""Notion reader.""" +import time +from dataclasses import dataclass, fields +from typing import Any, Dict, List, Optional + +import requests + +from danswer.configs.app_configs import INDEX_BATCH_SIZE +from danswer.configs.constants import DocumentSource +from danswer.connectors.interfaces import GenerateDocumentsOutput +from danswer.connectors.interfaces import LoadConnector +from danswer.connectors.interfaces import PollConnector +from danswer.connectors.interfaces import SecondsSinceUnixEpoch +from danswer.connectors.models import Document +from danswer.connectors.models import Section + + +@dataclass +class NotionPage: + """Represents a Notion Page object""" + + id: str + created_time: str + last_edited_time: str + archived: bool + properties: Dict[str, Any] + url: str + + def __init__(self, **kwargs): + names = set([f.name for f in fields(self)]) + for k, v in kwargs.items(): + if k in names: + setattr(self, k, v) + + +@dataclass +class NotionSearchResponse: + """Represents the response from the Notion Search API""" + + results: List[Dict[str, Any]] + next_cursor: Optional[str] + has_more: bool = False + + def __init__(self, **kwargs): + names = set([f.name for f in fields(self)]) + for k, v in kwargs.items(): + if k in names: + setattr(self, k, v) + + +# TODO - Add the ability to optionally limit to specific Notion databases +class NotionConnector(LoadConnector, PollConnector): + """Notion Page connector that reads all Notion pages + this integration has been granted access to. + + Arguments: + batch_size (int): Number of objects to index in a batch + """ + + def __init__(self, batch_size: int = INDEX_BATCH_SIZE) -> None: + """Initialize with parameters.""" + self.batch_size = batch_size + self.headers = { + "Content-Type": "application/json", + "Notion-Version": "2022-06-28", + } + + def _read_blocks(self, block_id: str, num_tabs: int = 0) -> str: + """Reads blocks for a page""" + done = False + result_lines_arr = [] + cur_block_id = block_id + while not done: + block_url = f"https://api.notion.com/v1/blocks/{cur_block_id}/children" + query_dict: Dict[str, Any] = {} + + res = requests.request( + "GET", block_url, headers=self.headers, json=query_dict + ) + data = res.json() + + for result in data["results"]: + result_type = result["type"] + result_obj = result[result_type] + + cur_result_text_arr = [] + if "rich_text" in result_obj: + for rich_text in result_obj["rich_text"]: + # skip if doesn't have text object + if "text" in rich_text: + text = rich_text["text"]["content"] + prefix = "\t" * num_tabs + cur_result_text_arr.append(prefix + text) + + result_block_id = result["id"] + has_children = result["has_children"] + if has_children: + children_text = self._read_blocks( + result_block_id, num_tabs=num_tabs + 1 + ) + cur_result_text_arr.append(children_text) + + cur_result_text = "\n".join(cur_result_text_arr) + result_lines_arr.append(cur_result_text) + + if data["next_cursor"] is None: + done = True + break + else: + cur_block_id = data["next_cursor"] + + result_lines = "\n".join(result_lines_arr) + return result_lines + + def _read_page_title(self, page: NotionPage) -> str: + """Extracts the title from a Notion page""" + page_title = None + for _, prop in page.properties.items(): + if prop["type"] == "title" and len(prop["title"]) > 0: + page_title = " ".join([t["plain_text"] for t in prop["title"]]).strip() + break + if page_title is None: + page_title = f"Untitled Page [{page.id}]" + return page_title + + def _read_pages(self, pages: List[NotionPage]) -> List[Document]: + """Reads pages for rich text content and generates Documents""" + docs_batch = [] + for page in pages: + page_text = self._read_blocks(page.id) + page_title = self._read_page_title(page) + docs_batch.append( + Document( + id=page.id, + sections=[Section(link=page.url, text=page_text)], + source=DocumentSource.NOTION, + semantic_identifier=page_title, + metadata={}, + ) + ) + return docs_batch + + def _search_notion(self, query_dict: Dict[str, Any]) -> NotionSearchResponse: + """Search for pages from a Notion database.""" + res = requests.post( + "https://api.notion.com/v1/search", + headers=self.headers, + json=query_dict, + ) + res.raise_for_status() + return NotionSearchResponse(**res.json()) + + def _filter_pages_by_time( + self, + pages: List[Dict[str, Any]], + start: SecondsSinceUnixEpoch, + end: SecondsSinceUnixEpoch, + filter_field: str = "last_edited_time", + ) -> List[NotionPage]: + """A helper function to filter out pages outside of a time + range. This functionality doesn't yet exist in the Notion Search API, + but when it does, this approach can be deprecated. + + Arguments: + pages (List[Dict]) - Pages to filter + start (float) - start epoch time to filter from + end (float) - end epoch time to filter to + filter_field (str) - the attribute on the page to apply the filter + """ + filtered_pages = [] + for page in pages: + compare_time = time.mktime( + time.strptime(page[filter_field], "%Y-%m-%dT%H:%M:%S.000Z") + ) + if compare_time <= end or compare_time > start: + filtered_pages += [NotionPage(**page)] + return filtered_pages + + def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None: + """Applies integration token to headers""" + self.headers[ + "Authorization" + ] = f'Bearer {credentials["notion_integration_token"]}' + return None + + def load_from_state(self) -> GenerateDocumentsOutput: + """Loads all page data from a Notion workspace. + + Returns: + List[Document]: List of documents. + """ + query_dict = { + "filter": {"property": "object", "value": "page"}, + "page_size": self.batch_size, + } + while True: + db_res = self._search_notion(query_dict) + pages = [NotionPage(**page) for page in db_res.results] + yield self._read_pages(pages) + if db_res.has_more: + query_dict["start_cursor"] = db_res.next_cursor + else: + break + + def poll_source( + self, start: SecondsSinceUnixEpoch, end: SecondsSinceUnixEpoch + ) -> GenerateDocumentsOutput: + """Uses the Notion search API to fetch updated pages + within a time period. + Unfortunately the search API doesn't yet support filtering by times, + so until they add that, we're just going to page through results until, + we reach ones that are older than our search criteria. + """ + query_dict = { + "page_size": self.batch_size, + "sort": {"timestamp": "last_edited_time", "direction": "descending"}, + "filter": {"property": "object", "value": "page"}, + } + while True: + db_res = self._search_notion(query_dict) + pages = self._filter_pages_by_time( + db_res.results, start, end, filter_field="last_edited_time" + ) + if len(pages) > 0: + yield self._read_pages(pages) + if db_res.has_more: + query_dict["start_cursor"] = db_res.next_cursor + else: + break diff --git a/web/src/app/admin/connectors/notion/page.tsx b/web/src/app/admin/connectors/notion/page.tsx new file mode 100644 index 00000000000..9dae184f350 --- /dev/null +++ b/web/src/app/admin/connectors/notion/page.tsx @@ -0,0 +1,229 @@ +"use client"; + +import * as Yup from "yup"; +import { NotionIcon, TrashIcon } from "@/components/icons/icons"; +import { TextFormField } from "@/components/admin/connectors/Field"; +import { HealthCheckBanner } from "@/components/health/healthcheck"; +import { CredentialForm } from "@/components/admin/connectors/CredentialForm"; +import { + NotionCredentialJson, + NotionConfig, + Credential, + ConnectorIndexingStatus, +} from "@/lib/types"; +import useSWR, { useSWRConfig } from "swr"; +import { fetcher } from "@/lib/fetcher"; +import { LoadingAnimation } from "@/components/Loading"; +import { deleteCredential, linkCredential } from "@/lib/credential"; +import { ConnectorForm } from "@/components/admin/connectors/ConnectorForm"; +import { ConnectorsTable } from "@/components/admin/connectors/table/ConnectorsTable"; +import { usePopup } from "@/components/admin/connectors/Popup"; + +const Main = () => { + const { popup, setPopup } = usePopup(); + + const { mutate } = useSWRConfig(); + const { + data: connectorIndexingStatuses, + isLoading: isConnectorIndexingStatusesLoading, + error: isConnectorIndexingStatusesError, + } = useSWR[]>( + "/api/manage/admin/connector/indexing-status", + fetcher + ); + const { + data: credentialsData, + isLoading: isCredentialsLoading, + error: isCredentialsError, + } = useSWR[]>( + "/api/manage/credential", + fetcher + ); + + if ( + (!connectorIndexingStatuses && isConnectorIndexingStatusesLoading) || + (!credentialsData && isCredentialsLoading) + ) { + return ; + } + + if (isConnectorIndexingStatusesError || !connectorIndexingStatuses) { + return
Failed to load connectors
; + } + + if (isCredentialsError || !credentialsData) { + return
Failed to load credentials
; + } + + const notionConnectorIndexingStatuses = connectorIndexingStatuses.filter( + (connectorIndexingStatus) => + connectorIndexingStatus.connector.source === "notion" + ); + const notionCredential = credentialsData.filter( + (credential) => credential.credential_json?.notion_integration_token + )[0]; + + return ( + <> + {popup} +

+ Step 1: Provide your authorization details +

+ + {notionCredential ? ( + <> +
+

Existing Integration Token:

+

+ {notionCredential.credential_json?.notion_integration_token} +

+ +
+ + ) : ( + <> +

+ To get started you'll need to create an internal integration in + Notion for Danswer. Follow the instructions in the  + + Notion Developer Documentation + +   on the Notion website, to create a new integration. Once + you've created an integration, copy the integration secret + token and paste it below. Follow the remaining instructions on the + Notion docs to allow Danswer to read Notion Databases and Pages + using the new integration. +

+
+ + formBody={ + + } + validationSchema={Yup.object().shape({ + notion_integration_token: Yup.string().required( + "Please enter the Notion Integration token for the Danswer integration." + ), + })} + initialValues={{ + notion_integration_token: "", + }} + onSubmit={(isSuccess) => { + if (isSuccess) { + mutate("/api/manage/credential"); + mutate("/api/manage/admin/connector/indexing-status"); + } + }} + /> +
+ + )} + + {notionConnectorIndexingStatuses.length > 0 && ( + <> +

+ Notion indexing status +

+

+ The latest page updates are fetched from Notion every 10 minutes. +

+
+ + connectorIndexingStatuses={notionConnectorIndexingStatuses} + liveCredential={notionCredential} + getCredential={(credential) => { + return ( +
+

{credential.credential_json.notion_integration_token}

+
+ ); + }} + onCredentialLink={async (connectorId) => { + if (notionCredential) { + await linkCredential(connectorId, notionCredential.id); + mutate("/api/manage/admin/connector/indexing-status"); + } + }} + onUpdate={() => + mutate("/api/manage/admin/connector/indexing-status") + } + /> +
+ + )} + + {notionCredential && notionConnectorIndexingStatuses.length === 0 && ( + <> +
+

Create Connection

+

+ Press connect below to start the connection to Notion. +

+ + nameBuilder={(values) => `NotionConnector`} + source="notion" + inputType="poll" + formBody={<>} + validationSchema={Yup.object().shape({})} + initialValues={{}} + refreshFreq={10 * 60} // 10 minutes + onSubmit={async (isSuccess, responseJson) => { + if (isSuccess && responseJson) { + await linkCredential(responseJson.id, notionCredential.id); + mutate("/api/manage/admin/connector/indexing-status"); + } + }} + /> +
+ + )} + + {!notionCredential && ( + <> +

+ Please provide your integration details in Step 1 first! Once done + with that, you'll be able to start the connection then see + indexing status. +

+ + )} + + ); +}; + +export default function Page() { + return ( +
+
+ +
+
+ +

Notion

+
+
+
+ ); +} diff --git a/web/src/app/admin/layout.tsx b/web/src/app/admin/layout.tsx index e8e89db62e6..f68410c8fcb 100644 --- a/web/src/app/admin/layout.tsx +++ b/web/src/app/admin/layout.tsx @@ -12,6 +12,7 @@ import { FileIcon, JiraIcon, SlabIcon, + NotionIcon, } from "@/components/icons/icons"; import { DISABLE_AUTH } from "@/lib/constants"; import { getCurrentUserSS } from "@/lib/userSS"; @@ -138,6 +139,15 @@ export default async function AdminLayout({ ), link: "/admin/connectors/file", }, + { + name: ( +
+ +
Notion
+
+ ), + link: "/admin/connectors/notion", + }, ], }, { diff --git a/web/src/components/icons/icons.tsx b/web/src/components/icons/icons.tsx index b8593b7f47f..70c7247cf5f 100644 --- a/web/src/components/icons/icons.tsx +++ b/web/src/components/icons/icons.tsx @@ -18,6 +18,7 @@ import { SiGithub, SiGoogledrive, SiJira, + SiNotion, SiSlack, } from "react-icons/si"; import { FaFile, FaGlobe } from "react-icons/fa"; @@ -160,3 +161,10 @@ export const BrainIcon = ({ }: IconProps) => { return ; }; + +export const NotionIcon = ({ + size = "16", + className = defaultTailwindCSS, +}: IconProps) => { + return ; +}; diff --git a/web/src/components/search/Filters.tsx b/web/src/components/search/Filters.tsx index 431e1a727c3..27dd1f6125d 100644 --- a/web/src/components/search/Filters.tsx +++ b/web/src/components/search/Filters.tsx @@ -14,6 +14,7 @@ const sources: Source[] = [ { displayName: "Github PRs", internalName: "github" }, { displayName: "Web", internalName: "web" }, { displayName: "File", internalName: "file" }, + { displayName: "Notion", internalName: "notion" }, ]; interface SourceSelectorProps { diff --git a/web/src/components/source.tsx b/web/src/components/source.tsx index 360ef6677ea..ab3fa276e36 100644 --- a/web/src/components/source.tsx +++ b/web/src/components/source.tsx @@ -7,6 +7,7 @@ import { GlobeIcon, GoogleDriveIcon, JiraIcon, + NotionIcon, SlabIcon, SlackIcon, } from "./icons/icons"; @@ -73,6 +74,12 @@ export const getSourceMetadata = (sourceType: ValidSources): SourceMetadata => { displayName: "Slab", adminPageLink: "/admin/connectors/slab", }; + case "notion": + return { + icon: NotionIcon, + displayName: "Notion", + adminPageLink: "/admin/connectors/notion", + }; default: throw new Error("Invalid source type"); } diff --git a/web/src/lib/types.ts b/web/src/lib/types.ts index 63de4d18306..5232032c92b 100644 --- a/web/src/lib/types.ts +++ b/web/src/lib/types.ts @@ -16,7 +16,8 @@ export type ValidSources = | "confluence" | "jira" | "slab" - | "file"; + | "file" + | "notion"; export type ValidInputTypes = "load_state" | "poll" | "event"; // CONNECTORS @@ -71,6 +72,8 @@ export interface FileConfig { file_locations: string[]; } +export interface NotionConfig {} + export interface ConnectorIndexingStatus { connector: Connector; public_doc: boolean; @@ -124,3 +127,7 @@ export interface GoogleDriveCredentialJson { export interface SlabCredentialJson { slab_bot_token: string; } + +export interface NotionCredentialJson { + notion_integration_token: string; +}