diff --git a/backend/alembic/versions/43cbbb3f5e6a_rename_index_origin_to_index_recursively.py b/backend/alembic/versions/43cbbb3f5e6a_rename_index_origin_to_index_recursively.py
new file mode 100644
index 000000000000..aae6040a5ee0
--- /dev/null
+++ b/backend/alembic/versions/43cbbb3f5e6a_rename_index_origin_to_index_recursively.py
@@ -0,0 +1,52 @@
+"""Rename index_origin to index_recursively
+
+Revision ID: 1d6ad76d1f37
+Revises: e1392f05e840
+Create Date: 2024-08-01 12:38:54.466081
+
+"""
+from alembic import op
+
+# revision identifiers, used by Alembic.
+revision = "1d6ad76d1f37"
+down_revision = "e1392f05e840"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    """Swap `index_origin` for `index_recursively` in connector configs.
+
+    Every config that has `index_origin` gets `index_recursively` set to
+    true, whatever the old value was, and the old key is dropped.
+    """
+    op.execute(
+        """
+        UPDATE connector
+        SET connector_specific_config = jsonb_set(
+            connector_specific_config,
+            '{index_recursively}',
+            'true'::jsonb
+        ) - 'index_origin'
+        WHERE connector_specific_config ? 'index_origin'
+        """
+    )
+
+
+def downgrade() -> None:
+    """Swap `index_recursively` back to `index_origin` in connector configs.
+
+    The current `index_recursively` value is copied into `index_origin`, so
+    the pre-upgrade `index_origin` value is not restored.
+    """
+    op.execute(
+        """
+        UPDATE connector
+        SET connector_specific_config = jsonb_set(
+            connector_specific_config,
+            '{index_origin}',
+            connector_specific_config->'index_recursively'
+        ) - 'index_recursively'
+        WHERE connector_specific_config ? 'index_recursively'
+        """
+    )
diff --git a/backend/danswer/connectors/confluence/connector.py b/backend/danswer/connectors/confluence/connector.py
index 1c348df6a3b1..30a9032c9767 100644
--- a/backend/danswer/connectors/confluence/connector.py
+++ b/backend/danswer/connectors/confluence/connector.py
@@ -217,16 +217,26 @@ class RecursiveIndexer:
         self,
         batch_size: int,
         confluence_client: Confluence,
-        index_origin: bool,
+        index_recursively: bool,
         origin_page_id: str,
     ) -> None:
         self.batch_size = 1
         # batch_size
         self.confluence_client = confluence_client
-        self.index_origin = index_origin
+        self.index_recursively = index_recursively
         self.origin_page_id = origin_page_id
-        self.pages = self.recurse_children_pages(0, self.origin_page_id)
+        # Non-recursive runs are served by get_origin_page(), so only crawl
+        # the child pages when recursion was actually requested.
+        self.pages = (
+            self.recurse_children_pages(0, self.origin_page_id)
+            if index_recursively
+            else []
+        )
 
+    def get_origin_page(self) -> list[dict[str, Any]]:
+        """Fetch the origin page and return it as a one-element list."""
+        return [self._fetch_origin_page()]
+
     def get_pages(self, ind: int, size: int) -> list[dict]:
         if ind * size > len(self.pages):
             return []
@@ -282,12 +292,11 @@ class RecursiveIndexer:
             current_level_pages = next_level_pages
             next_level_pages = []
 
-        if self.index_origin:
-            try:
-                origin_page = self._fetch_origin_page()
-                pages.append(origin_page)
-            except Exception as e:
-                logger.warning(f"Appending origin page with id {page_id} failed: {e}")
+        try:
+            origin_page = self._fetch_origin_page()
+            pages.append(origin_page)
+        except Exception as e:
+            logger.warning(f"Appending origin page with id {page_id} failed: {e}")
 
         return pages
 
@@ -340,7 +349,7 @@ class ConfluenceConnector(LoadConnector, PollConnector):
     def __init__(
         self,
         wiki_page_url: str,
-        index_origin: bool = True,
+        index_recursively: bool = True,
         batch_size: int = INDEX_BATCH_SIZE,
         continue_on_failure: bool = CONTINUE_ON_CONNECTOR_FAILURE,
         # if a page has one of the labels specified in this list, we will just
@@ -352,7 +361,7 @@ class ConfluenceConnector(LoadConnector, PollConnector):
         self.continue_on_failure = continue_on_failure
         self.labels_to_skip = set(labels_to_skip)
         self.recursive_indexer: RecursiveIndexer | None = None
-        self.index_origin = index_origin
+        self.index_recursively = index_recursively
         (
             self.wiki_base,
             self.space,
@@ -369,7 +378,7 @@ class ConfluenceConnector(LoadConnector, PollConnector):
 
         logger.info(
             f"wiki_base: {self.wiki_base}, space: {self.space}, page_id: {self.page_id},"
-            + f" space_level_scan: {self.space_level_scan}, origin: {self.index_origin}"
+            + f" space_level_scan: {self.space_level_scan}, index_recursively: {self.index_recursively}"
         )
 
     def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
@@ -453,10 +462,15 @@ class ConfluenceConnector(LoadConnector, PollConnector):
                     origin_page_id=self.page_id,
                     batch_size=self.batch_size,
                     confluence_client=self.confluence_client,
-                    index_origin=self.index_origin,
+                    index_recursively=self.index_recursively,
                 )
 
-            return self.recursive_indexer.get_pages(start_ind, batch_size)
+            if self.index_recursively:
+                return self.recursive_indexer.get_pages(start_ind, batch_size)
+            # Non-recursive: the origin page is the whole result set, so hand
+            # it out for the first batch only instead of on every call.
+            if start_ind > 0:
+                return []
+            return self.recursive_indexer.get_origin_page()
 
         pages: list[dict[str, Any]] = []
 
diff --git a/web/src/lib/connectors/connectors.ts b/web/src/lib/connectors/connectors.ts
index e6f67027ab9d..efda7ae9dfb9 100644
--- a/web/src/lib/connectors/connectors.ts
+++ b/web/src/lib/connectors/connectors.ts
@@ -218,9 +218,13 @@ export const connectorConfigs: Record = {
   },
   confluence: {
     description: "Configure Confluence connector",
-    subtext: `Specify any link to a Confluence page below and click "Index" to Index. Based on the provided link, we will index either the entire page and its subpages OR the entire space. For example, entering https://danswer.atlassian.net/wiki/spaces/Engineering/overview and clicking the Index button will index the whole Engineering Confluence space, but entering https://danswer.atlassian.net/wiki/spaces/Engineering/pages/164331/example+page will index that page's children (and optionally, itself). Use the checkbox below to determine whether or not to index the parent page in addition to its children.
+    subtext: `Specify any link to a Confluence page below and click "Index" to Index. If the provided link is for an entire space, we will index the entire space. However, if you want to index a specific page, you can do so by entering the page's URL.
+
+For example, entering https://danswer.atlassian.net/wiki/spaces/Engineering/overview and clicking the Index button will index the whole Engineering Confluence space, but entering https://danswer.atlassian.net/wiki/spaces/Engineering/pages/164331/example+page will index that page (and optionally the page's children).
 
-We pull the latest pages and comments from each space listed below every 10 minutes`,
+Selecting the "Index Recursively" checkbox will index the single page's children in addition to itself.
+
+We pull the latest pages and comments from each space every 10 minutes`,
     values: [
       {
         type: "text",
@@ -232,10 +236,11 @@ We pull the latest pages and comments from each space listed below every 10 minu
       },
       {
         type: "checkbox",
-        query: "(For pages) Index the page itself",
-        label: "(For pages) Index the page itself",
-        name: "index_origin",
-        optional: true,
+        query: "Should index pages recursively?",
+        label:
+          "Index Recursively (if this is set and the Wiki Page URL leads to a page, we will index the page and all of its children instead of just the page)",
+        name: "index_recursively",
+        optional: false,
       },
     ],
   },
@@ -811,7 +816,7 @@ export interface BookstackConfig {}
 
 export interface ConfluenceConfig {
   wiki_page_url: string;
-  index_origin?: boolean;
+  index_recursively?: boolean;
 }
 
 export interface JiraConfig {