mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-05-28 20:50:00 +02:00
Notion improvement (#4306)
* Notion connector improvements
* Enable recursive index by default
* Small tweak
This commit is contained in:
parent
72bf427cc2
commit
5dda53eec3
@ -347,8 +347,8 @@ HTML_BASED_CONNECTOR_TRANSFORM_LINKS_STRATEGY = os.environ.get(
|
|||||||
HtmlBasedConnectorTransformLinksStrategy.STRIP,
|
HtmlBasedConnectorTransformLinksStrategy.STRIP,
|
||||||
)
|
)
|
||||||
|
|
||||||
NOTION_CONNECTOR_ENABLE_RECURSIVE_PAGE_LOOKUP = (
|
NOTION_CONNECTOR_DISABLE_RECURSIVE_PAGE_LOOKUP = (
|
||||||
os.environ.get("NOTION_CONNECTOR_ENABLE_RECURSIVE_PAGE_LOOKUP", "").lower()
|
os.environ.get("NOTION_CONNECTOR_DISABLE_RECURSIVE_PAGE_LOOKUP", "").lower()
|
||||||
== "true"
|
== "true"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -1,16 +1,16 @@
|
|||||||
from collections.abc import Generator
|
from collections.abc import Generator
|
||||||
from dataclasses import dataclass
|
|
||||||
from dataclasses import fields
|
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from datetime import timezone
|
from datetime import timezone
|
||||||
from typing import Any
|
from typing import Any
|
||||||
|
from typing import cast
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
|
from pydantic import BaseModel
|
||||||
from retry import retry
|
from retry import retry
|
||||||
|
|
||||||
from onyx.configs.app_configs import INDEX_BATCH_SIZE
|
from onyx.configs.app_configs import INDEX_BATCH_SIZE
|
||||||
from onyx.configs.app_configs import NOTION_CONNECTOR_ENABLE_RECURSIVE_PAGE_LOOKUP
|
from onyx.configs.app_configs import NOTION_CONNECTOR_DISABLE_RECURSIVE_PAGE_LOOKUP
|
||||||
from onyx.configs.constants import DocumentSource
|
from onyx.configs.constants import DocumentSource
|
||||||
from onyx.connectors.cross_connector_utils.rate_limit_wrapper import (
|
from onyx.connectors.cross_connector_utils.rate_limit_wrapper import (
|
||||||
rl_requests,
|
rl_requests,
|
||||||
@ -25,6 +25,7 @@ from onyx.connectors.interfaces import PollConnector
|
|||||||
from onyx.connectors.interfaces import SecondsSinceUnixEpoch
|
from onyx.connectors.interfaces import SecondsSinceUnixEpoch
|
||||||
from onyx.connectors.models import ConnectorMissingCredentialError
|
from onyx.connectors.models import ConnectorMissingCredentialError
|
||||||
from onyx.connectors.models import Document
|
from onyx.connectors.models import Document
|
||||||
|
from onyx.connectors.models import ImageSection
|
||||||
from onyx.connectors.models import TextSection
|
from onyx.connectors.models import TextSection
|
||||||
from onyx.utils.batching import batch_generator
|
from onyx.utils.batching import batch_generator
|
||||||
from onyx.utils.logger import setup_logger
|
from onyx.utils.logger import setup_logger
|
||||||
@ -38,8 +39,7 @@ _NOTION_CALL_TIMEOUT = 30 # 30 seconds
|
|||||||
# TODO: Tables need to be ingested, Pages need to have their metadata ingested
|
# TODO: Tables need to be ingested, Pages need to have their metadata ingested
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
class NotionPage(BaseModel):
|
||||||
class NotionPage:
|
|
||||||
"""Represents a Notion Page object"""
|
"""Represents a Notion Page object"""
|
||||||
|
|
||||||
id: str
|
id: str
|
||||||
@ -49,17 +49,10 @@ class NotionPage:
|
|||||||
properties: dict[str, Any]
|
properties: dict[str, Any]
|
||||||
url: str
|
url: str
|
||||||
|
|
||||||
database_name: str | None # Only applicable to the database type page (wiki)
|
database_name: str | None = None # Only applicable to the database type page (wiki)
|
||||||
|
|
||||||
def __init__(self, **kwargs: dict[str, Any]) -> None:
|
|
||||||
names = set([f.name for f in fields(self)])
|
|
||||||
for k, v in kwargs.items():
|
|
||||||
if k in names:
|
|
||||||
setattr(self, k, v)
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
class NotionBlock(BaseModel):
|
||||||
class NotionBlock:
|
|
||||||
"""Represents a Notion Block object"""
|
"""Represents a Notion Block object"""
|
||||||
|
|
||||||
id: str # Used for the URL
|
id: str # Used for the URL
|
||||||
@ -69,20 +62,13 @@ class NotionBlock:
|
|||||||
prefix: str
|
prefix: str
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
class NotionSearchResponse(BaseModel):
|
||||||
class NotionSearchResponse:
|
|
||||||
"""Represents the response from the Notion Search API"""
|
"""Represents the response from the Notion Search API"""
|
||||||
|
|
||||||
results: list[dict[str, Any]]
|
results: list[dict[str, Any]]
|
||||||
next_cursor: Optional[str]
|
next_cursor: Optional[str]
|
||||||
has_more: bool = False
|
has_more: bool = False
|
||||||
|
|
||||||
def __init__(self, **kwargs: dict[str, Any]) -> None:
|
|
||||||
names = set([f.name for f in fields(self)])
|
|
||||||
for k, v in kwargs.items():
|
|
||||||
if k in names:
|
|
||||||
setattr(self, k, v)
|
|
||||||
|
|
||||||
|
|
||||||
class NotionConnector(LoadConnector, PollConnector):
|
class NotionConnector(LoadConnector, PollConnector):
|
||||||
"""Notion Page connector that reads all Notion pages
|
"""Notion Page connector that reads all Notion pages
|
||||||
@ -95,7 +81,7 @@ class NotionConnector(LoadConnector, PollConnector):
|
|||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
batch_size: int = INDEX_BATCH_SIZE,
|
batch_size: int = INDEX_BATCH_SIZE,
|
||||||
recursive_index_enabled: bool = NOTION_CONNECTOR_ENABLE_RECURSIVE_PAGE_LOOKUP,
|
recursive_index_enabled: bool = not NOTION_CONNECTOR_DISABLE_RECURSIVE_PAGE_LOOKUP,
|
||||||
root_page_id: str | None = None,
|
root_page_id: str | None = None,
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Initialize with parameters."""
|
"""Initialize with parameters."""
|
||||||
@ -464,23 +450,53 @@ class NotionConnector(LoadConnector, PollConnector):
|
|||||||
page_blocks, child_page_ids = self._read_blocks(page.id)
|
page_blocks, child_page_ids = self._read_blocks(page.id)
|
||||||
all_child_page_ids.extend(child_page_ids)
|
all_child_page_ids.extend(child_page_ids)
|
||||||
|
|
||||||
|
# okay to mark here since there's no way for this to not succeed
|
||||||
|
# without a critical failure
|
||||||
|
self.indexed_pages.add(page.id)
|
||||||
|
|
||||||
|
raw_page_title = self._read_page_title(page)
|
||||||
|
page_title = raw_page_title or f"Untitled Page with ID {page.id}"
|
||||||
|
|
||||||
if not page_blocks:
|
if not page_blocks:
|
||||||
|
if not raw_page_title:
|
||||||
|
logger.warning(
|
||||||
|
f"No blocks OR title found for page with ID '{page.id}'. Skipping."
|
||||||
|
)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
page_title = (
|
logger.debug(f"No blocks found for page with ID '{page.id}'")
|
||||||
self._read_page_title(page) or f"Untitled Page with ID {page.id}"
|
"""
|
||||||
)
|
Something like:
|
||||||
|
|
||||||
yield (
|
TITLE
|
||||||
Document(
|
|
||||||
id=page.id,
|
PROP1: PROP1_VALUE
|
||||||
sections=[
|
PROP2: PROP2_VALUE
|
||||||
|
"""
|
||||||
|
text = page_title
|
||||||
|
if page.properties:
|
||||||
|
text += "\n\n" + "\n".join(
|
||||||
|
[f"{key}: {value}" for key, value in page.properties.items()]
|
||||||
|
)
|
||||||
|
sections = [
|
||||||
|
TextSection(
|
||||||
|
link=f"{page.url}",
|
||||||
|
text=text,
|
||||||
|
)
|
||||||
|
]
|
||||||
|
else:
|
||||||
|
sections = [
|
||||||
TextSection(
|
TextSection(
|
||||||
link=f"{page.url}#{block.id.replace('-', '')}",
|
link=f"{page.url}#{block.id.replace('-', '')}",
|
||||||
text=block.prefix + block.text,
|
text=block.prefix + block.text,
|
||||||
)
|
)
|
||||||
for block in page_blocks
|
for block in page_blocks
|
||||||
],
|
]
|
||||||
|
|
||||||
|
yield (
|
||||||
|
Document(
|
||||||
|
id=page.id,
|
||||||
|
sections=cast(list[TextSection | ImageSection], sections),
|
||||||
source=DocumentSource.NOTION,
|
source=DocumentSource.NOTION,
|
||||||
semantic_identifier=page_title,
|
semantic_identifier=page_title,
|
||||||
doc_updated_at=datetime.fromisoformat(
|
doc_updated_at=datetime.fromisoformat(
|
||||||
|
Loading…
x
Reference in New Issue
Block a user