Merge pull request #195 from pkabra/notion-connector

Notion connector
This commit is contained in:
Yuhong Sun
2023-07-21 00:04:12 -07:00
committed by GitHub
10 changed files with 495 additions and 1 deletions

View File

@@ -27,3 +27,4 @@ class DocumentSource(str, Enum):
SLAB = "slab"
JIRA = "jira"
FILE = "file"
NOTION = "notion"

View File

@@ -8,6 +8,7 @@ from danswer.connectors.danswer_jira.connector import JiraConnector
from danswer.connectors.file.connector import LocalFileConnector
from danswer.connectors.github.connector import GithubConnector
from danswer.connectors.google_drive.connector import GoogleDriveConnector
from danswer.connectors.notion.connector import NotionConnector
from danswer.connectors.interfaces import BaseConnector
from danswer.connectors.interfaces import EventConnector
from danswer.connectors.interfaces import LoadConnector
@@ -42,6 +43,7 @@ def identify_connector_class(
DocumentSource.CONFLUENCE: ConfluenceConnector,
DocumentSource.JIRA: JiraConnector,
DocumentSource.SLAB: SlabConnector,
DocumentSource.NOTION: NotionConnector,
}
connector_by_source = connector_map.get(source, {})

View File

@@ -0,0 +1,229 @@
"""Notion reader."""
import time
from dataclasses import dataclass, fields
from datetime import datetime, timezone
from typing import Any, Dict, List, Optional

import requests

from danswer.configs.app_configs import INDEX_BATCH_SIZE
from danswer.configs.constants import DocumentSource
from danswer.connectors.interfaces import GenerateDocumentsOutput
from danswer.connectors.interfaces import LoadConnector
from danswer.connectors.interfaces import PollConnector
from danswer.connectors.interfaces import SecondsSinceUnixEpoch
from danswer.connectors.models import Document
from danswer.connectors.models import Section
@dataclass
class NotionPage:
    """A Notion page, reduced to the fields this connector uses.

    The constructor accepts an arbitrary keyword payload and keeps only
    the keys that match a declared field; everything else is dropped.
    """

    id: str
    created_time: str
    last_edited_time: str
    archived: bool
    properties: Dict[str, Any]
    url: str

    def __init__(self, **kwargs):
        # Only declared dataclass fields are copied onto the instance, so
        # extra keys in the payload are silently ignored.
        known_fields = {field.name for field in fields(self)}
        for key, value in kwargs.items():
            if key not in known_fields:
                continue
            setattr(self, key, value)
@dataclass
class NotionSearchResponse:
    """One page of results from the Notion Search API.

    The constructor accepts an arbitrary keyword payload and keeps only
    the keys that match a declared field; everything else is dropped.
    """

    results: List[Dict[str, Any]]
    next_cursor: Optional[str]
    has_more: bool = False

    def __init__(self, **kwargs):
        # Only declared dataclass fields are copied onto the instance;
        # `has_more` falls back to its class-level default when absent.
        known_fields = {field.name for field in fields(self)}
        for key, value in kwargs.items():
            if key not in known_fields:
                continue
            setattr(self, key, value)
# TODO - Add the ability to optionally limit to specific Notion databases
class NotionConnector(LoadConnector, PollConnector):
    """Notion Page connector that reads all Notion pages
    this integration has been granted access to.

    Arguments:
        batch_size (int): Number of objects to index in a batch
    """

    def __init__(self, batch_size: int = INDEX_BATCH_SIZE) -> None:
        """Store the batch size and the headers sent with every request.

        The ``Authorization`` header is added later by ``load_credentials``.
        """
        self.batch_size = batch_size
        self.headers = {
            "Content-Type": "application/json",
            "Notion-Version": "2022-06-28",
        }

    @staticmethod
    def _notion_time_to_epoch(timestamp: str) -> float:
        """Convert a Notion timestamp string to seconds since the Unix epoch.

        The trailing ``Z`` marks the value as UTC, so it is parsed as UTC
        rather than as local time. ``%f`` accepts any fractional-second
        value, not only ``.000``.
        """
        parsed = datetime.strptime(timestamp, "%Y-%m-%dT%H:%M:%S.%fZ")
        return parsed.replace(tzinfo=timezone.utc).timestamp()

    def _read_blocks(self, block_id: str, num_tabs: int = 0) -> str:
        """Return the text of all child blocks of ``block_id``.

        Nested blocks are read recursively and their text is prefixed with
        one extra tab per nesting level.

        Arguments:
            block_id (str): id of the page or block whose children to read
            num_tabs (int): number of tabs to prefix each text fragment with

        Raises:
            requests.HTTPError: if the Notion API returns an error status
        """
        block_url = f"https://api.notion.com/v1/blocks/{block_id}/children"
        prefix = "\t" * num_tabs
        result_lines_arr: List[str] = []
        cursor: Optional[str] = None

        while True:
            # `next_cursor` is a pagination token, not a block id: it is sent
            # as the `start_cursor` query parameter of the same endpoint.
            query_params: Dict[str, Any] = {}
            if cursor is not None:
                query_params["start_cursor"] = cursor

            res = requests.get(block_url, headers=self.headers, params=query_params)
            res.raise_for_status()
            data = res.json()

            for result in data["results"]:
                result_type = result["type"]
                result_obj = result[result_type]

                cur_result_text_arr = []
                if "rich_text" in result_obj:
                    for rich_text in result_obj["rich_text"]:
                        # skip if doesn't have text object
                        if "text" in rich_text:
                            text = rich_text["text"]["content"]
                            cur_result_text_arr.append(prefix + text)

                if result["has_children"]:
                    children_text = self._read_blocks(
                        result["id"], num_tabs=num_tabs + 1
                    )
                    cur_result_text_arr.append(children_text)

                result_lines_arr.append("\n".join(cur_result_text_arr))

            if data["next_cursor"] is None:
                break
            cursor = data["next_cursor"]

        return "\n".join(result_lines_arr)

    def _read_page_title(self, page: NotionPage) -> str:
        """Return the page title, or a placeholder containing the page id.

        The title is taken from the first property of type ``title`` that
        has at least one text fragment.
        """
        for prop in page.properties.values():
            if prop["type"] == "title" and len(prop["title"]) > 0:
                return " ".join(t["plain_text"] for t in prop["title"]).strip()
        return f"Untitled Page [{page.id}]"

    def _read_pages(self, pages: List[NotionPage]) -> List[Document]:
        """Read the block text of each page and wrap it in a Document."""
        docs_batch = []
        for page in pages:
            page_text = self._read_blocks(page.id)
            page_title = self._read_page_title(page)
            docs_batch.append(
                Document(
                    id=page.id,
                    sections=[Section(link=page.url, text=page_text)],
                    source=DocumentSource.NOTION,
                    semantic_identifier=page_title,
                    metadata={},
                )
            )
        return docs_batch

    def _search_notion(self, query_dict: Dict[str, Any]) -> NotionSearchResponse:
        """POST ``query_dict`` to the Notion search endpoint.

        Raises:
            requests.HTTPError: if the Notion API returns an error status
        """
        res = requests.post(
            "https://api.notion.com/v1/search",
            headers=self.headers,
            json=query_dict,
        )
        res.raise_for_status()
        return NotionSearchResponse(**res.json())

    def _filter_pages_by_time(
        self,
        pages: List[Dict[str, Any]],
        start: SecondsSinceUnixEpoch,
        end: SecondsSinceUnixEpoch,
        filter_field: str = "last_edited_time",
    ) -> List[NotionPage]:
        """A helper function to filter out pages outside of a time
        range. This functionality doesn't yet exist in the Notion Search API,
        but when it does, this approach can be deprecated.

        A page is kept when ``start < page[filter_field] <= end``.

        Arguments:
            pages (List[Dict]) - Pages to filter
            start (float) - start epoch time to filter from
            end (float) - end epoch time to filter to
            filter_field (str) - the attribute on the page to apply the filter
        """
        filtered_pages = []
        for page in pages:
            compare_time = self._notion_time_to_epoch(page[filter_field])
            # Both bounds must hold; joining them with `or` lets every page through.
            if start < compare_time <= end:
                filtered_pages.append(NotionPage(**page))
        return filtered_pages

    def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
        """Applies integration token to headers"""
        self.headers[
            "Authorization"
        ] = f'Bearer {credentials["notion_integration_token"]}'
        return None

    def load_from_state(self) -> GenerateDocumentsOutput:
        """Loads all page data from a Notion workspace.

        Yields one list of Documents per page of search results.
        """
        query_dict: Dict[str, Any] = {
            "filter": {"property": "object", "value": "page"},
            "page_size": self.batch_size,
        }
        while True:
            db_res = self._search_notion(query_dict)
            pages = [NotionPage(**page) for page in db_res.results]
            yield self._read_pages(pages)
            if not db_res.has_more:
                break
            query_dict["start_cursor"] = db_res.next_cursor

    def poll_source(
        self, start: SecondsSinceUnixEpoch, end: SecondsSinceUnixEpoch
    ) -> GenerateDocumentsOutput:
        """Uses the Notion search API to fetch updated pages
        within a time period.

        Unfortunately the search API doesn't yet support filtering by times,
        so until they add that, we're just going to page through results until
        we reach ones that are older than our search criteria.
        """
        query_dict: Dict[str, Any] = {
            "page_size": self.batch_size,
            "sort": {"timestamp": "last_edited_time", "direction": "descending"},
            "filter": {"property": "object", "value": "page"},
        }
        while True:
            db_res = self._search_notion(query_dict)
            pages = self._filter_pages_by_time(
                db_res.results, start, end, filter_field="last_edited_time"
            )
            if len(pages) > 0:
                yield self._read_pages(pages)
            if not db_res.has_more:
                break
            # Results are requested newest-first, so once the oldest entry of
            # this batch is at or before `start`, nothing further can match.
            if db_res.results:
                oldest_time = self._notion_time_to_epoch(
                    db_res.results[-1]["last_edited_time"]
                )
                if oldest_time <= start:
                    break
            query_dict["start_cursor"] = db_res.next_cursor

View File

@@ -0,0 +1,229 @@
"use client";
import * as Yup from "yup";
import { NotionIcon, TrashIcon } from "@/components/icons/icons";
import { TextFormField } from "@/components/admin/connectors/Field";
import { HealthCheckBanner } from "@/components/health/healthcheck";
import { CredentialForm } from "@/components/admin/connectors/CredentialForm";
import {
NotionCredentialJson,
NotionConfig,
Credential,
ConnectorIndexingStatus,
} from "@/lib/types";
import useSWR, { useSWRConfig } from "swr";
import { fetcher } from "@/lib/fetcher";
import { LoadingAnimation } from "@/components/Loading";
import { deleteCredential, linkCredential } from "@/lib/credential";
import { ConnectorForm } from "@/components/admin/connectors/ConnectorForm";
import { ConnectorsTable } from "@/components/admin/connectors/table/ConnectorsTable";
import { usePopup } from "@/components/admin/connectors/Popup";
// Body of the Notion admin page. Loads connector indexing statuses and
// credentials, then renders: the credential step (existing token or the
// credential form), the indexing status table when Notion connectors exist,
// and the "Create Connection" form when a credential exists but no connector.
const Main = () => {
  const { popup, setPopup } = usePopup();

  const { mutate } = useSWRConfig();
  const {
    data: connectorIndexingStatuses,
    isLoading: isConnectorIndexingStatusesLoading,
    error: isConnectorIndexingStatusesError,
  } = useSWR<ConnectorIndexingStatus<any>[]>(
    "/api/manage/admin/connector/indexing-status",
    fetcher
  );
  const {
    data: credentialsData,
    isLoading: isCredentialsLoading,
    error: isCredentialsError,
  } = useSWR<Credential<NotionCredentialJson>[]>(
    "/api/manage/credential",
    fetcher
  );

  // Show the spinner only while a request is in flight with no data yet.
  if (
    (!connectorIndexingStatuses && isConnectorIndexingStatusesLoading) ||
    (!credentialsData && isCredentialsLoading)
  ) {
    return <LoadingAnimation text="Loading" />;
  }

  if (isConnectorIndexingStatusesError || !connectorIndexingStatuses) {
    return <div>Failed to load connectors</div>;
  }

  if (isCredentialsError || !credentialsData) {
    return <div>Failed to load credentials</div>;
  }

  // Only the connectors whose source is Notion are shown on this page.
  const notionConnectorIndexingStatuses = connectorIndexingStatuses.filter(
    (connectorIndexingStatus) =>
      connectorIndexingStatus.connector.source === "notion"
  );
  // The first credential carrying a Notion integration token, if any.
  const notionCredential = credentialsData.filter(
    (credential) => credential.credential_json?.notion_integration_token
  )[0];

  return (
    <>
      {popup}
      <h2 className="font-bold mb-2 mt-6 ml-auto mr-auto">
        Step 1: Provide your authorization details
      </h2>

      {notionCredential ? (
        <>
          <div className="flex mb-1 text-sm">
            <p className="my-auto">Existing Integration Token: </p>
            <p className="ml-1 italic my-auto max-w-md">
              {notionCredential.credential_json?.notion_integration_token}
            </p>
            <button
              className="ml-1 hover:bg-gray-700 rounded-full p-1"
              onClick={async () => {
                // Refuse to delete the credential while connectors exist.
                if (notionConnectorIndexingStatuses.length > 0) {
                  setPopup({
                    type: "error",
                    message:
                      "Must delete all connectors before deleting credentials",
                  });
                  return;
                }
                await deleteCredential(notionCredential.id);
                mutate("/api/manage/credential");
              }}
            >
              <TrashIcon />
            </button>
          </div>
        </>
      ) : (
        <>
          <p className="text-sm">
            To get started you&apos;ll need to create an internal integration in
            Notion for Danswer. Follow the instructions in the&nbsp;
            <a
              href="https://developers.notion.com/docs/create-a-notion-integration"
              target="_blank"
              rel="noopener noreferrer"
            >
              Notion Developer Documentation
            </a>
            &nbsp; on the Notion website, to create a new integration. Once
            you&apos;ve created an integration, copy the integration secret
            token and paste it below. Follow the remaining instructions on the
            Notion docs to allow Danswer to read Notion Databases and Pages
            using the new integration.
          </p>
          <div className="border-solid border-gray-600 border rounded-md p-6 mt-2 mb-4">
            <CredentialForm<NotionCredentialJson>
              formBody={
                <TextFormField
                  name="notion_integration_token"
                  label="Integration Token:"
                  type="password"
                />
              }
              validationSchema={Yup.object().shape({
                notion_integration_token: Yup.string().required(
                  "Please enter the Notion Integration token for the Danswer integration."
                ),
              })}
              initialValues={{
                notion_integration_token: "",
              }}
              onSubmit={(isSuccess) => {
                if (isSuccess) {
                  mutate("/api/manage/credential");
                  mutate("/api/manage/admin/connector/indexing-status");
                }
              }}
            />
          </div>
        </>
      )}

      {notionConnectorIndexingStatuses.length > 0 && (
        <>
          <h2 className="font-bold mb-2 mt-6 ml-auto mr-auto">
            Notion indexing status
          </h2>
          <p className="text-sm mb-2">
            The latest page updates are fetched from Notion every 10 minutes.
          </p>
          <div className="mb-2">
            <ConnectorsTable<NotionConfig, NotionCredentialJson>
              connectorIndexingStatuses={notionConnectorIndexingStatuses}
              liveCredential={notionCredential}
              getCredential={(credential) => {
                return (
                  <div>
                    <p>{credential.credential_json.notion_integration_token}</p>
                  </div>
                );
              }}
              onCredentialLink={async (connectorId) => {
                if (notionCredential) {
                  await linkCredential(connectorId, notionCredential.id);
                  mutate("/api/manage/admin/connector/indexing-status");
                }
              }}
              onUpdate={() =>
                mutate("/api/manage/admin/connector/indexing-status")
              }
            />
          </div>
        </>
      )}

      {notionCredential && notionConnectorIndexingStatuses.length === 0 && (
        <>
          <div className="border-solid border-gray-600 border rounded-md p-6 mt-4">
            <h2 className="font-bold mb-3">Create Connection</h2>
            <p className="text-sm mb-4">
              Press connect below to start the connection to Notion.
            </p>
            <ConnectorForm<NotionConfig>
              nameBuilder={() => "NotionConnector"}
              source="notion"
              inputType="poll"
              formBody={<></>}
              validationSchema={Yup.object().shape({})}
              initialValues={{}}
              refreshFreq={10 * 60} // 10 minutes
              onSubmit={async (isSuccess, responseJson) => {
                // Link the new connector to the existing credential.
                if (isSuccess && responseJson) {
                  await linkCredential(responseJson.id, notionCredential.id);
                  mutate("/api/manage/admin/connector/indexing-status");
                }
              }}
            />
          </div>
        </>
      )}

      {!notionCredential && (
        <>
          <p className="text-sm mb-4">
            Please provide your integration details in Step 1 first! Once done
            with that, you&apos;ll be able to start the connection then see
            indexing status.
          </p>
        </>
      )}
    </>
  );
};
// Notion admin page: health banner, title row with the Notion logo, then
// the main connector/credential UI.
export default function Page() {
  const healthBanner = (
    <div className="mb-4">
      <HealthCheckBanner />
    </div>
  );

  const titleRow = (
    <div className="border-solid border-gray-600 border-b mb-4 pb-2 flex">
      <NotionIcon size="32" />
      <h1 className="text-3xl font-bold pl-2">Notion</h1>
    </div>
  );

  return (
    <div className="mx-auto container">
      {healthBanner}
      {titleRow}
      <Main />
    </div>
  );
}

View File

@@ -12,6 +12,7 @@ import {
FileIcon,
JiraIcon,
SlabIcon,
NotionIcon,
} from "@/components/icons/icons";
import { DISABLE_AUTH } from "@/lib/constants";
import { getCurrentUserSS } from "@/lib/userSS";
@@ -138,6 +139,15 @@ export default async function AdminLayout({
),
link: "/admin/connectors/file",
},
{
name: (
<div className="flex">
<NotionIcon size="16" />
<div className="ml-1">Notion</div>
</div>
),
link: "/admin/connectors/notion",
},
],
},
{

View File

@@ -18,6 +18,7 @@ import {
SiGithub,
SiGoogledrive,
SiJira,
SiNotion,
SiSlack,
} from "react-icons/si";
import { FaFile, FaGlobe } from "react-icons/fa";
@@ -160,3 +161,10 @@ export const BrainIcon = ({
}: IconProps) => {
return <Brain size={size} className={className} />;
};
// Notion logo icon; size defaults to "16" and className to the shared
// default Tailwind classes.
export const NotionIcon = (props: IconProps) => {
  const { size = "16", className = defaultTailwindCSS } = props;
  return <SiNotion className={className} size={size} />;
};

View File

@@ -14,6 +14,7 @@ const sources: Source[] = [
{ displayName: "Github PRs", internalName: "github" },
{ displayName: "Web", internalName: "web" },
{ displayName: "File", internalName: "file" },
{ displayName: "Notion", internalName: "notion" },
];
interface SourceSelectorProps {

View File

@@ -7,6 +7,7 @@ import {
GlobeIcon,
GoogleDriveIcon,
JiraIcon,
NotionIcon,
SlabIcon,
SlackIcon,
} from "./icons/icons";
@@ -73,6 +74,12 @@ export const getSourceMetadata = (sourceType: ValidSources): SourceMetadata => {
displayName: "Slab",
adminPageLink: "/admin/connectors/slab",
};
case "notion":
return {
icon: NotionIcon,
displayName: "Notion",
adminPageLink: "/admin/connectors/notion",
};
default:
throw new Error("Invalid source type");
}

View File

@@ -16,7 +16,8 @@ export type ValidSources =
| "confluence"
| "jira"
| "slab"
| "file";
| "file"
| "notion";
export type ValidInputTypes = "load_state" | "poll" | "event";
// CONNECTORS
@@ -71,6 +72,8 @@ export interface FileConfig {
file_locations: string[];
}
// Connector-specific configuration for the Notion source; it currently
// declares no fields.
export interface NotionConfig {}
export interface ConnectorIndexingStatus<T> {
connector: Connector<T>;
public_doc: boolean;
@@ -124,3 +127,7 @@ export interface GoogleDriveCredentialJson {
export interface SlabCredentialJson {
slab_bot_token: string;
}
// Shape of the credential JSON for a Notion credential.
export interface NotionCredentialJson {
// Notion integration token, as entered in the Notion credential form.
notion_integration_token: string;
}