support confluence single page only indexing (#2008)

* added index recursively checkbox

* mypy fixes

* added migration to not break existing connectors
This commit is contained in:
hagen-danswer
2024-08-01 13:32:46 -07:00
committed by GitHub
parent a54ea9f9fa
commit e6a92aa936
3 changed files with 72 additions and 20 deletions

View File

@@ -0,0 +1,42 @@
"""Rename index_origin to index_recursively
Revision ID: 1d6ad76d1f37
Revises: e1392f05e840
Create Date: 2024-08-01 12:38:54.466081
"""
from alembic import op
# revision identifiers, used by Alembic.
revision = "1d6ad76d1f37"
down_revision = "e1392f05e840"
branch_labels = None
depends_on = None
def upgrade() -> None:
op.execute(
"""
UPDATE connector
SET connector_specific_config = jsonb_set(
connector_specific_config,
'{index_recursively}',
'true'::jsonb
) - 'index_origin'
WHERE connector_specific_config ? 'index_origin'
"""
)
def downgrade() -> None:
op.execute(
"""
UPDATE connector
SET connector_specific_config = jsonb_set(
connector_specific_config,
'{index_origin}',
connector_specific_config->'index_recursively'
) - 'index_recursively'
WHERE connector_specific_config ? 'index_recursively'
"""
)

View File

@@ -217,16 +217,19 @@ class RecursiveIndexer:
self,
batch_size: int,
confluence_client: Confluence,
index_origin: bool,
index_recursively: bool,
origin_page_id: str,
) -> None:
self.batch_size = 1
# batch_size
self.confluence_client = confluence_client
self.index_origin = index_origin
self.index_recursively = index_recursively
self.origin_page_id = origin_page_id
self.pages = self.recurse_children_pages(0, self.origin_page_id)
def get_origin_page(self) -> list[dict[str, Any]]:
return [self._fetch_origin_page()]
def get_pages(self, ind: int, size: int) -> list[dict]:
if ind * size > len(self.pages):
return []
@@ -282,12 +285,11 @@ class RecursiveIndexer:
current_level_pages = next_level_pages
next_level_pages = []
if self.index_origin:
try:
origin_page = self._fetch_origin_page()
pages.append(origin_page)
except Exception as e:
logger.warning(f"Appending origin page with id {page_id} failed: {e}")
try:
origin_page = self._fetch_origin_page()
pages.append(origin_page)
except Exception as e:
logger.warning(f"Appending origin page with id {page_id} failed: {e}")
return pages
@@ -340,7 +342,7 @@ class ConfluenceConnector(LoadConnector, PollConnector):
def __init__(
self,
wiki_page_url: str,
index_origin: bool = True,
index_recursively: bool = True,
batch_size: int = INDEX_BATCH_SIZE,
continue_on_failure: bool = CONTINUE_ON_CONNECTOR_FAILURE,
# if a page has one of the labels specified in this list, we will just
@@ -352,7 +354,7 @@ class ConfluenceConnector(LoadConnector, PollConnector):
self.continue_on_failure = continue_on_failure
self.labels_to_skip = set(labels_to_skip)
self.recursive_indexer: RecursiveIndexer | None = None
self.index_origin = index_origin
self.index_recursively = index_recursively
(
self.wiki_base,
self.space,
@@ -369,7 +371,7 @@ class ConfluenceConnector(LoadConnector, PollConnector):
logger.info(
f"wiki_base: {self.wiki_base}, space: {self.space}, page_id: {self.page_id},"
+ f" space_level_scan: {self.space_level_scan}, origin: {self.index_origin}"
+ f" space_level_scan: {self.space_level_scan}, index_recursively: {self.index_recursively}"
)
def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
@@ -453,10 +455,13 @@ class ConfluenceConnector(LoadConnector, PollConnector):
origin_page_id=self.page_id,
batch_size=self.batch_size,
confluence_client=self.confluence_client,
index_origin=self.index_origin,
index_recursively=self.index_recursively,
)
return self.recursive_indexer.get_pages(start_ind, batch_size)
if self.index_recursively:
return self.recursive_indexer.get_pages(start_ind, batch_size)
else:
return self.recursive_indexer.get_origin_page()
pages: list[dict[str, Any]] = []

View File

@@ -218,9 +218,13 @@ export const connectorConfigs: Record<ValidSources, ConnectionConfiguration> = {
},
confluence: {
description: "Configure Confluence connector",
subtext: `Specify any link to a Confluence page below and click "Index" to Index. Based on the provided link, we will index either the entire page and its subpages OR the entire space. For example, entering https://danswer.atlassian.net/wiki/spaces/Engineering/overview and clicking the Index button will index the whole Engineering Confluence space, but entering https://danswer.atlassian.net/wiki/spaces/Engineering/pages/164331/example+page will index that page's children (and optionally, itself). Use the checkbox below to determine whether or not to index the parent page in addition to its children.
subtext: `Specify any link to a Confluence page below and click "Index" to Index. If the provided link is for an entire space, we will index the entire space. However, if you want to index a specific page, you can do so by entering the page's URL.
For example, entering https://danswer.atlassian.net/wiki/spaces/Engineering/overview and clicking the Index button will index the whole Engineering Confluence space, but entering https://danswer.atlassian.net/wiki/spaces/Engineering/pages/164331/example+page will index that page (and optionally the page's children).
We pull the latest pages and comments from each space listed below every 10 minutes`,
Selecting the "Index Recursively" checkbox will index the single page's children in addition to itself.
We pull the latest pages and comments from each space every 10 minutes`,
values: [
{
type: "text",
@@ -232,10 +236,11 @@ We pull the latest pages and comments from each space listed below every 10 minu
},
{
type: "checkbox",
query: "(For pages) Index the page itself",
label: "(For pages) Index the page itself",
name: "index_origin",
optional: true,
query: "Should index pages recursively?",
label:
"Index Recursively (if this is set and the Wiki Page URL leads to a page, we will index the page and all of its children instead of just the page)",
name: "index_recursively",
optional: false,
},
],
},
@@ -811,7 +816,7 @@ export interface BookstackConfig {}
export interface ConfluenceConfig {
wiki_page_url: string;
index_origin?: boolean;
index_recursively?: boolean;
}
export interface JiraConfig {