Add cql support for confluence connector (#2679)

* Added CQL support for Confluence * changed string substitutions for CQL * final cleanup * updated string fixes * remove print statements * Update description
2025-05-20 16:51:06 +02:00 · 2024-10-10 12:16:56 -07:00 · 2024-10-10 12:16:56 -07:00 · 1f4fe42f4b
commit 1f4fe42f4b
parent 101b010c5c
5 changed files with 339 additions and 198 deletions
--- a/backend/danswer/connectors/confluence/connector.py
+++ b/backend/danswer/connectors/confluence/connector.py
@ -1,5 +1,6 @@
 import io
 import os
+import re
 from collections.abc import Callable
 from collections.abc import Collection
 from datetime import datetime
@ -56,8 +57,101 @@ NO_PARENT_OR_NO_PERMISSIONS_ERROR_STR = (
 )


+class DanswerConfluence(Confluence):
+    """Confluence client whose CQL search passes `expand` through.
+
+    The stock client's CQL call does not properly support expansions.
+    """
+
+    def __init__(self, url: str, *args: Any, **kwargs: Any) -> None:
+        super().__init__(url, *args, **kwargs)
+
+    def danswer_cql(
+        self,
+        cql: str,
+        expand: str | None = None,
+        start: int = 0,
+        limit: int = 500,
+        include_archived_spaces: bool = False,
+    ) -> list[dict[str, Any]]:
+        """Return the `results` list of one page of a CQL content search."""
+        from urllib.parse import quote, urlencode
+
+        # Percent-encode values so `&`, `#` or `+` in the CQL cannot break the URL.
+        params: dict[str, Any] = {"cql": cql}
+        if expand:
+            params["expand"] = expand
+        params.update(start=start, limit=limit)
+        if include_archived_spaces:
+            params["includeArchivedSpaces"] = "true"
+        query = urlencode(params, quote_via=quote)
+        return self.get(f"rest/api/content/search?{query}").get("results", [])
+
+
+def _replace_cql_time_filter(
+    cql_query: str, start_time: datetime, end_time: datetime
+) -> str:
+    """Rewrite the `lastmodified` window of a CQL query.
+
+    AND-joined `lastmodified` / `updated` clauses are stripped and a fresh
+    `>=` / `<=` pair is appended. If the query already carried a lower or
+    upper `lastmodified` bound, the tighter of that bound and the requested
+    one wins. Date literals in the query are read as UTC, and the emitted
+    bounds are UTC at minute precision.
+
+    Args:
+        cql_query: CQL text, possibly containing time filters.
+        start_time: Requested lower bound.
+        end_time: Requested upper bound.
+
+    Returns:
+        The query with one `>=` and one `<=` lastmodified filter appended.
+    """
+    date_literal = r'["\']?(\d{4}-\d{2}-\d{2}(?:\s+\d{2}:\d{2})?)["\']?'
+
+    def _existing_bound(operator: str) -> datetime | None:
+        # `=?` also accepts strict `>` / `<`: those clauses get stripped
+        # below, so skipping them here would silently widen the query.
+        found = re.search(
+            rf"lastmodified\s*{operator}=?\s*{date_literal}",
+            cql_query,
+            flags=re.IGNORECASE,
+        )
+        if found is None:
+            return None
+        raw = found.group(1)
+        fmt = "%Y-%m-%d %H:%M" if len(raw) > 10 else "%Y-%m-%d"
+        # Make offset-aware so it compares with the UTC-converted arguments.
+        return datetime.strptime(raw, fmt).replace(tzinfo=timezone.utc)
+
+    # Read the old bounds first, while their clauses are still in the text.
+    lower_bound = _existing_bound(">")
+    upper_bound = _existing_bound("<")
+
+    # Remove all existing lastmodified and updated filters
+    cql_query = re.sub(
+        r'\s*AND\s+(lastmodified|updated)\s*[<>=]+\s*["\']?[\d-]+(?:\s+[\d:]+)?["\']?',
+        "",
+        cql_query,
+        flags=re.IGNORECASE,
+    )
+
+    start_to_use = start_time.astimezone(timezone.utc)
+    if lower_bound is not None:
+        start_to_use = max(start_to_use, lower_bound)
+    end_to_use = end_time.astimezone(timezone.utc)
+    if upper_bound is not None:
+        end_to_use = min(end_to_use, upper_bound)
+
+    # Add new time filters
+    stamp = "%Y-%m-%d %H:%M"
+    cql_query += f" and lastmodified >= '{start_to_use.strftime(stamp)}'"
+    cql_query += f" and lastmodified <= '{end_to_use.strftime(stamp)}'"
+    return cql_query.strip()
+
+
@lru_cache()
-def _get_user(user_id: str, confluence_client: Confluence) -> str:
+def _get_user(user_id: str, confluence_client: DanswerConfluence) -> str:
    """Get Confluence Display Name based on the account-id or userkey value

    Args:
@ -81,7 +175,7 @@ def _get_user(user_id: str, confluence_client: Confluence) -> str:
    return user_not_found


-def parse_html_page(text: str, confluence_client: Confluence) -> str:
+def parse_html_page(text: str, confluence_client: DanswerConfluence) -> str:
    """Parse a Confluence html page and replace the 'user Id' by the real
        User Display Name

@ -112,7 +206,7 @@ def parse_html_page(text: str, confluence_client: Confluence) -> str:
 def _comment_dfs(
    comments_str: str,
    comment_pages: Collection[dict[str, Any]],
-    confluence_client: Confluence,
+    confluence_client: DanswerConfluence,
 ) -> str:
    get_page_child_by_type = make_confluence_call_handle_rate_limit(
        confluence_client.get_page_child_by_type
@ -159,7 +253,7 @@ class RecursiveIndexer:
    def __init__(
        self,
        batch_size: int,
-        confluence_client: Confluence,
+        confluence_client: DanswerConfluence,
        index_recursively: bool,
        origin_page_id: str,
    ) -> None:
@ -285,8 +379,8 @@ class ConfluenceConnector(LoadConnector, PollConnector):
    def __init__(
        self,
        wiki_base: str,
-        space: str,
        is_cloud: bool,
+        space: str = "",
        page_id: str = "",
        index_recursively: bool = True,
        batch_size: int = INDEX_BATCH_SIZE,
@ -295,35 +389,44 @@ class ConfluenceConnector(LoadConnector, PollConnector):
        # skip it. This is generally used to avoid indexing extra sensitive
        # pages.
        labels_to_skip: list[str] = CONFLUENCE_CONNECTOR_LABELS_TO_SKIP,
+        cql_query: str | None = None,
    ) -> None:
        self.batch_size = batch_size
        self.continue_on_failure = continue_on_failure
        self.labels_to_skip = set(labels_to_skip)
        self.recursive_indexer: RecursiveIndexer | None = None
-        self.index_recursively = index_recursively
+        self.index_recursively = False if cql_query else index_recursively

        # Remove trailing slash from wiki_base if present
        self.wiki_base = wiki_base.rstrip("/")
        self.space = space
-        self.page_id = page_id
+        self.page_id = "" if cql_query else page_id
+        self.space_level_scan = bool(not self.page_id)

        self.is_cloud = is_cloud

-        self.space_level_scan = False
-        self.confluence_client: Confluence | None = None
+        self.confluence_client: DanswerConfluence | None = None

-        if self.page_id is None or self.page_id == "":
-            self.space_level_scan = True
+        # if a cql_query is provided, we will use it to fetch the pages
+        # if no cql_query is provided, we will use the space to fetch the pages
+        # if no space is provided, we will default to fetching all pages, regardless of space
+        if cql_query:
+            self.cql_query = cql_query
+        elif self.space:
+            self.cql_query = f"type=page and space={self.space}"
+        else:
+            self.cql_query = "type=page"

        logger.info(
            f"wiki_base: {self.wiki_base}, space: {self.space}, page_id: {self.page_id},"
-            + f" space_level_scan: {self.space_level_scan}, index_recursively: {self.index_recursively}"
+            + f" space_level_scan: {self.space_level_scan}, index_recursively: {self.index_recursively},"
+            + f" cql_query: {self.cql_query}"
        )

    def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
        username = credentials["confluence_username"]
        access_token = credentials["confluence_access_token"]
-        self.confluence_client = Confluence(
+        self.confluence_client = DanswerConfluence(
            url=self.wiki_base,
            # passing in username causes issues for Confluence data center
            username=username if self.is_cloud else None,
@ -334,26 +437,33 @@ class ConfluenceConnector(LoadConnector, PollConnector):

    def _fetch_pages(
        self,
-        confluence_client: Confluence,
        start_ind: int,
    ) -> list[dict[str, Any]]:
        def _fetch_space(start_ind: int, batch_size: int) -> list[dict[str, Any]]:
-            get_all_pages_from_space = make_confluence_call_handle_rate_limit(
-                confluence_client.get_all_pages_from_space
+            if self.confluence_client is None:
+                raise ConnectorMissingCredentialError("Confluence")
+
+            get_all_pages = make_confluence_call_handle_rate_limit(
+                self.confluence_client.danswer_cql
            )
+
+            include_archived_spaces = (
+                CONFLUENCE_CONNECTOR_INDEX_ARCHIVED_PAGES
+                if not self.is_cloud
+                else False
+            )
+
            try:
-                return get_all_pages_from_space(
-                    self.space,
+                return get_all_pages(
+                    cql=self.cql_query,
                    start=start_ind,
                    limit=batch_size,
-                    status=(
-                        None if CONFLUENCE_CONNECTOR_INDEX_ARCHIVED_PAGES else "current"
-                    ),
                    expand="body.storage.value,version",
+                    include_archived_spaces=include_archived_spaces,
                )
            except Exception:
                logger.warning(
-                    f"Batch failed with space {self.space} at offset {start_ind} "
+                    f"Batch failed with cql {self.cql_query} at offset {start_ind} "
                    f"with size {batch_size}, processing pages individually..."
                )

@ -363,27 +473,23 @@ class ConfluenceConnector(LoadConnector, PollConnector):
                        # Could be that one of the pages here failed due to this bug:
                        # https://jira.atlassian.com/browse/CONFCLOUD-76433
                        view_pages.extend(
-                            get_all_pages_from_space(
-                                self.space,
+                            get_all_pages(
+                                cql=self.cql_query,
                                start=start_ind + i,
                                limit=1,
-                                status=(
-                                    None
-                                    if CONFLUENCE_CONNECTOR_INDEX_ARCHIVED_PAGES
-                                    else "current"
-                                ),
                                expand="body.storage.value,version",
+                                include_archived_spaces=include_archived_spaces,
                            )
                        )
                    except HTTPError as e:
                        logger.warning(
-                            f"Page failed with space {self.space} at offset {start_ind + i}, "
+                            f"Page failed with cql {self.cql_query} at offset {start_ind + i}, "
                            f"trying alternative expand option: {e}"
                        )
                        # Use view instead, which captures most info but is less complete
                        view_pages.extend(
-                            get_all_pages_from_space(
-                                self.space,
+                            get_all_pages(
+                                cql=self.cql_query,
                                start=start_ind + i,
                                limit=1,
                                expand="body.view.value,version",
@ -393,6 +499,9 @@ class ConfluenceConnector(LoadConnector, PollConnector):
                return view_pages

        def _fetch_page(start_ind: int, batch_size: int) -> list[dict[str, Any]]:
+            if self.confluence_client is None:
+                raise ConnectorMissingCredentialError("Confluence")
+
            if self.recursive_indexer is None:
                self.recursive_indexer = RecursiveIndexer(
                    origin_page_id=self.page_id,
@ -421,7 +530,7 @@ class ConfluenceConnector(LoadConnector, PollConnector):
                raise e

        # error checking phase, only reachable if `self.continue_on_failure=True`
-        for i in range(self.batch_size):
+        for _ in range(self.batch_size):
            try:
                pages = (
                    _fetch_space(start_ind, self.batch_size)
@ -437,7 +546,9 @@ class ConfluenceConnector(LoadConnector, PollConnector):

        return pages

-    def _fetch_comments(self, confluence_client: Confluence, page_id: str) -> str:
+    def _fetch_comments(
+        self, confluence_client: DanswerConfluence, page_id: str
+    ) -> str:
        get_page_child_by_type = make_confluence_call_handle_rate_limit(
            confluence_client.get_page_child_by_type
        )
@ -463,7 +574,9 @@ class ConfluenceConnector(LoadConnector, PollConnector):
            )
            return ""

-    def _fetch_labels(self, confluence_client: Confluence, page_id: str) -> list[str]:
+    def _fetch_labels(
+        self, confluence_client: DanswerConfluence, page_id: str
+    ) -> list[str]:
        get_page_labels = make_confluence_call_handle_rate_limit(
            confluence_client.get_page_labels
        )
@ -577,22 +690,20 @@ class ConfluenceConnector(LoadConnector, PollConnector):
        return "\n".join(files_attachment_content), unused_attachments

    def _get_doc_batch(
-        self, start_ind: int, time_filter: Callable[[datetime], bool] | None = None
+        self, start_ind: int
    ) -> tuple[list[Document], list[dict[str, Any]], int]:
+        if self.confluence_client is None:
+            raise ConnectorMissingCredentialError("Confluence")
+
        doc_batch: list[Document] = []
        unused_attachments: list[dict[str, Any]] = []

-        if self.confluence_client is None:
-            raise ConnectorMissingCredentialError("Confluence")
-        batch = self._fetch_pages(self.confluence_client, start_ind)
+        batch = self._fetch_pages(start_ind)

        for page in batch:
            last_modified = _datetime_from_string(page["version"]["when"])
            author = cast(str | None, page["version"].get("by", {}).get("email"))

-            if time_filter and not time_filter(last_modified):
-                continue
-
            page_id = page["id"]

            if self.labels_to_skip or not CONFLUENCE_CONNECTOR_SKIP_LABEL_INDEXING:
@ -715,17 +826,12 @@ class ConfluenceConnector(LoadConnector, PollConnector):
        return doc_batch, end_ind - start_ind

    def load_from_state(self) -> GenerateDocumentsOutput:
-        unused_attachments = []
-
-        if self.confluence_client is None:
-            raise ConnectorMissingCredentialError("Confluence")
+        unused_attachments: list[dict[str, Any]] = []

        start_ind = 0
        while True:
-            doc_batch, unused_attachments_batch, num_pages = self._get_doc_batch(
-                start_ind
-            )
-            unused_attachments.extend(unused_attachments_batch)
+            doc_batch, unused_batch, num_pages = self._get_doc_batch(start_ind)
+            unused_attachments.extend(unused_batch)  # accumulate; never rebind the total
            start_ind += num_pages
            if doc_batch:
                yield doc_batch
@ -748,7 +854,7 @@ class ConfluenceConnector(LoadConnector, PollConnector):
    def poll_source(
        self, start: SecondsSinceUnixEpoch, end: SecondsSinceUnixEpoch
    ) -> GenerateDocumentsOutput:
-        unused_attachments = []
+        unused_attachments: list[dict[str, Any]] = []

        if self.confluence_client is None:
            raise ConnectorMissingCredentialError("Confluence")
@ -756,12 +862,12 @@ class ConfluenceConnector(LoadConnector, PollConnector):
        start_time = datetime.fromtimestamp(start, tz=timezone.utc)
        end_time = datetime.fromtimestamp(end, tz=timezone.utc)

+        self.cql_query = _replace_cql_time_filter(self.cql_query, start_time, end_time)  # NOTE(review): rewrites the stored query, so a later poll on this instance inherits these bounds - confirm instances are single-use
+
        start_ind = 0
        while True:
-            doc_batch, unused_attachments_batch, num_pages = self._get_doc_batch(
-                start_ind, time_filter=lambda t: start_time <= t <= end_time
-            )
-            unused_attachments.extend(unused_attachments_batch)
+            doc_batch, unused_batch, num_pages = self._get_doc_batch(start_ind)
+            unused_attachments.extend(unused_batch)  # accumulate; never rebind the total

            start_ind += num_pages
            if doc_batch:
--- a/web/src/app/admin/connectors/[connector]/AddConnectorPage.tsx
+++ b/web/src/app/admin/connectors/[connector]/AddConnectorPage.tsx
@ -1,6 +1,6 @@
 "use client";

-import { errorHandlingFetcher } from "@/lib/fetcher";
+import { FetchError, errorHandlingFetcher } from "@/lib/fetcher";
 import useSWR, { mutate } from "swr";
 import { HealthCheckBanner } from "@/components/health/healthcheck";

@ -209,7 +209,15 @@ export default function AddConnector({

  return (
    <Formik
-      initialValues={createConnectorInitialValues(connector)}
+      initialValues={{
+        ...createConnectorInitialValues(connector),
+        ...Object.fromEntries(
+          connectorConfigs[connector].advanced_values.map((field) => [
+            field.name,
+            field.default || "",
+          ])
+        ),
+      }}
      validationSchema={createConnectorValidationSchema(connector)}
      onSubmit={async (values) => {
        const {
--- a/web/src/app/admin/connectors/[connector]/pages/ConnectorInput/ListInput.tsx
+++ b/web/src/app/admin/connectors/[connector]/pages/ConnectorInput/ListInput.tsx
@ -1,74 +1,24 @@
-import CredentialSubText from "@/components/credentials/CredentialFields";
-import { TrashIcon } from "@/components/icons/icons";
+import React from "react";
 import { ListOption } from "@/lib/connectors/connectors";
-import { Field, FieldArray, useField } from "formik";
-import { FaPlus } from "react-icons/fa";
+import { TextArrayField } from "@/components/admin/connectors/Field";
+import { useFormikContext } from "formik";

-export default function ListInput({
-  field,
-  onUpdate,
-}: {
+interface ListInputProps {
  field: ListOption;
-  onUpdate?: (values: string[]) => void;
-}) {
-  const [fieldProps, , helpers] = useField(field.name);
+}
+
+const ListInput: React.FC<ListInputProps> = ({ field }) => {
+  const { values } = useFormikContext();

  return (
-    <FieldArray name={field.name}>
-      {({ push, remove }) => (
-        <div>
-          <label
-            htmlFor={field.name}
-            className="block text-sm font-medium text-text-700 mb-1"
-          >
-            {field.label}
-            {field.optional && (
-              <span className="text-text-500 ml-1">(optional)</span>
-            )}
-          </label>
-          {field.description && (
-            <CredentialSubText>{field.description}</CredentialSubText>
-          )}
-
-          {fieldProps.value.map((value: string, index: number) => (
-            <div key={index} className="w-full flex mb-4">
-              <Field
-                name={`${field.name}.${index}`}
-                className="w-full bg-input text-sm p-2 border border-border-medium rounded-md focus:ring-2 focus:ring-indigo-500 focus:border-indigo-500 mr-2"
-              />
-              <button
-                className="p-2 my-auto bg-input flex-none rounded-md bg-red-500 text-white hover:bg-red-600 focus:outline-none focus:ring-2 focus:ring-red-500 focus:ring-opacity-50"
-                type="button"
-                onClick={() => {
-                  remove(index);
-                  if (onUpdate) {
-                    const newValue = fieldProps.value.filter(
-                      (_: any, i: number) => i !== index
-                    );
-                    onUpdate(newValue);
-                  }
-                }}
-              >
-                <TrashIcon className="text-white my-auto" />
-              </button>
-            </div>
-          ))}
-
-          <button
-            type="button"
-            onClick={() => {
-              push("");
-              if (onUpdate) {
-                onUpdate([...fieldProps.value, ""]);
-              }
-            }}
-            className="mt-2 p-2 bg-rose-500 text-xs text-white rounded-md hover:bg-rose-600 focus:outline-none focus:ring-2 focus:ring-rose-500 focus:ring-opacity-50 flex items-center"
-          >
-            <FaPlus className="mr-2" />
-            Add {field.label}
-          </button>
-        </div>
-      )}
-    </FieldArray>
+    <TextArrayField
+      name={field.name}
+      label={field.label}
+      values={Array.isArray(values) ? values : []}
+      subtext={field.description}
+      placeholder={`Enter ${field.label.toLowerCase()}`}
+    />
  );
-}
+};
+
+export default ListInput;
--- a/web/src/app/admin/connectors/[connector]/pages/DynamicConnectorCreationForm.tsx
+++ b/web/src/app/admin/connectors/[connector]/pages/DynamicConnectorCreationForm.tsx
@ -1,4 +1,4 @@
-import React, { Dispatch, FC, SetStateAction } from "react";
+import React, { Dispatch, FC, SetStateAction, useState } from "react";
 import CredentialSubText, {
  AdminBooleanFormField,
 } from "@/components/credentials/CredentialFields";
@ -9,6 +9,7 @@ import NumberInput from "./ConnectorInput/NumberInput";
 import { TextFormField } from "@/components/admin/connectors/Field";
 import ListInput from "./ConnectorInput/ListInput";
 import FileInput from "./ConnectorInput/FileInput";
+import { AdvancedOptionsToggle } from "@/components/AdvancedOptionsToggle";

 export interface DynamicConnectionFormProps {
  config: ConnectionConfiguration;
@ -23,6 +24,61 @@ const DynamicConnectionForm: FC<DynamicConnectionFormProps> = ({
  setSelectedFiles,
  values,
 }) => {
+  const [showAdvancedOptions, setShowAdvancedOptions] = useState(false);
+
+  const renderField = (field: any) => (
+    <div key={field.name}>
+      {field.type === "file" ? (
+        <FileUpload
+          name={field.name}
+          selectedFiles={selectedFiles}
+          setSelectedFiles={setSelectedFiles}
+        />
+      ) : field.type === "zip" ? (
+        <FileInput
+          name={field.name}
+          label={field.label}
+          optional={field.optional}
+          description={field.description}
+          selectedFiles={selectedFiles}
+          setSelectedFiles={setSelectedFiles}
+        />
+      ) : field.type === "list" ? (
+        <ListInput field={field} />
+      ) : field.type === "select" ? (
+        <SelectInput
+          name={field.name}
+          optional={field.optional}
+          description={field.description}
+          options={field.options || []}
+          label={field.label}
+        />
+      ) : field.type === "number" ? (
+        <NumberInput
+          label={field.label}
+          optional={field.optional}
+          description={field.description}
+          name={field.name}
+        />
+      ) : field.type === "checkbox" ? (
+        <AdminBooleanFormField
+          checked={values[field.name]}
+          subtext={field.description}
+          name={field.name}
+          label={field.label}
+        />
+      ) : (
+        <TextFormField
+          subtext={field.description}
+          optional={field.optional}
+          type={field.type}
+          label={field.label}
+          name={field.name}
+        />
+      )}
+    </div>
+  );
+
  return (
    <>
      <h2 className="text-2xl font-bold text-text-800">{config.description}</h2>
@ -38,62 +94,17 @@ const DynamicConnectionForm: FC<DynamicConnectionFormProps> = ({
        name={"name"}
      />

-      {config.values.map((field) => {
-        if (!field.hidden) {
-          return (
-            <div key={field.name}>
-              {field.type == "file" ? (
-                <FileUpload
-                  name={field.name}
-                  selectedFiles={selectedFiles}
-                  setSelectedFiles={setSelectedFiles}
-                />
-              ) : field.type == "zip" ? (
-                <FileInput
-                  name={field.name}
-                  label={field.label}
-                  optional={field.optional}
-                  description={field.description}
-                  selectedFiles={selectedFiles}
-                  setSelectedFiles={setSelectedFiles}
-                />
-              ) : field.type === "list" ? (
-                <ListInput field={field} />
-              ) : field.type === "select" ? (
-                <SelectInput
-                  name={field.name}
-                  optional={field.optional}
-                  description={field.description}
-                  options={field.options || []}
-                  label={field.label}
-                />
-              ) : field.type === "number" ? (
-                <NumberInput
-                  label={field.label}
-                  optional={field.optional}
-                  description={field.description}
-                  name={field.name}
-                />
-              ) : field.type === "checkbox" ? (
-                <AdminBooleanFormField
-                  checked={values[field.name]}
-                  subtext={field.description}
-                  name={field.name}
-                  label={field.label}
-                />
-              ) : (
-                <TextFormField
-                  subtext={field.description}
-                  optional={field.optional}
-                  type={field.type}
-                  label={field.label}
-                  name={field.name}
-                />
-              )}
-            </div>
-          );
-        }
-      })}
+      {config.values.map((field) => !field.hidden && renderField(field))}
+
+      {config.advanced_values.length > 0 && (
+        <>
+          <AdvancedOptionsToggle
+            showAdvancedOptions={showAdvancedOptions}
+            setShowAdvancedOptions={setShowAdvancedOptions}
+          />
+          {showAdvancedOptions && config.advanced_values.map(renderField)}
+        </>
+      )}
    </>
  );
 };
--- a/web/src/lib/connectors/connectors.tsx
+++ b/web/src/lib/connectors/connectors.tsx
@ -86,6 +86,15 @@ export interface ConnectionConfiguration {
    | FileOption
    | ZipOption
  )[];
+  advanced_values: (
+    | BooleanOption
+    | ListOption
+    | TextOption
+    | NumberOption
+    | SelectOption
+    | FileOption
+    | ZipOption
+  )[];
  overrideDefaultFreq?: number;
 }

@ -116,6 +125,17 @@ export const connectorConfigs: Record<
        ],
      },
    ],
+    advanced_values: [
+      {
+        type: "number",
+        query: "Enter the maximum depth to crawl:",
+        label: "Max Depth",
+        name: "max_depth",
+        optional: true,
+        description:
+          "The maximum depth to crawl from the base URL. Default is 2.",
+      },
+    ],
    overrideDefaultFreq: 60 * 60 * 24,
  },
  github: {
@ -152,6 +172,7 @@ export const connectorConfigs: Record<
        optional: true,
      },
    ],
+    advanced_values: [],
  },
  gitlab: {
    description: "Configure GitLab connector",
@ -187,6 +208,7 @@ export const connectorConfigs: Record<
        hidden: true,
      },
    ],
+    advanced_values: [],
  },
  google_drive: {
    description: "Configure Google Drive connector",
@ -223,22 +245,21 @@ export const connectorConfigs: Record<
        default: false,
      },
    ],
+    advanced_values: [],
  },
  gmail: {
    description: "Configure Gmail connector",
    values: [],
+    advanced_values: [],
  },
  bookstack: {
    description: "Configure Bookstack connector",
    values: [],
+    advanced_values: [],
  },
  confluence: {
    description: "Configure Confluence connector",
-    subtext: `Specify the base URL of your Confluence instance, the space name, and optionally a specific page ID to index. If no page ID is provided, the entire space will be indexed.
-
-For example, entering "https://your-company.atlassian.net/wiki" as the Wiki Base URL, "KB" as the Space, and "164331" as the Page ID will index the specific page at https:///your-company.atlassian.net/wiki/spaces/KB/pages/164331/Page. If you leave the Page ID empty, it will index the entire KB space.
-
-Selecting the "Index Recursively" checkbox will index the specified page and all of its children.`,
+    subtext: `Specify the base URL of your Confluence instance, the space name, and optionally a specific page ID to index. If no page ID is provided, the entire space will be indexed. If no space is specified, all available Confluence spaces will be indexed.`,
    values: [
      {
        type: "text",
@ -254,9 +275,22 @@ Selecting the "Index Recursively" checkbox will index the specified page and all
        query: "Enter the space:",
        label: "Space",
        name: "space",
-        optional: false,
-        description: "The Confluence space name to index (e.g. `KB`)",
+        optional: true,
+        description:
+          "The Confluence space name to index (e.g. `KB`). If no space is specified, all available Confluence spaces will be indexed.",
      },
+      {
+        type: "checkbox",
+        query: "Is this a Confluence Cloud instance?",
+        label: "Is Cloud",
+        name: "is_cloud",
+        optional: false,
+        default: true,
+        description:
+          "Check if this is a Confluence Cloud instance, uncheck for Confluence Server/Data Center",
+      },
+    ],
+    advanced_values: [
      {
        type: "text",
        query: "Enter the page ID (optional):",
@ -276,14 +310,13 @@ Selecting the "Index Recursively" checkbox will index the specified page and all
        optional: false,
      },
      {
-        type: "checkbox",
-        query: "Is this a Confluence Cloud instance?",
-        label: "Is Cloud",
-        name: "is_cloud",
-        optional: false,
-        default: true,
+        type: "text",
+        query: "Enter the CQL query (optional):",
+        label: "CQL Query",
+        name: "cql_query",
+        optional: true,
        description:
-          "Check if this is a Confluence Cloud instance, uncheck for Confluence Server/Data Center",
+          "IMPORTANT: This will overwrite all other selected connector settings (besides Wiki Base URL). We currently only support CQL queries that return objects of type 'page'. This means all CQL queries must contain 'type=page' as the only type filter. We will still get all attachments and comments for the pages returned by the CQL query. Any 'lastmodified' filters will be overwritten. See https://developer.atlassian.com/server/confluence/advanced-searching-using-cql/ for more details.",
      },
    ],
  },
@ -308,6 +341,7 @@ Selecting the "Index Recursively" checkbox will index the specified page and all
        optional: true,
      },
    ],
+    advanced_values: [],
  },
  salesforce: {
    description: "Configure Salesforce connector",
@ -323,6 +357,7 @@ Selecting the "Index Recursively" checkbox will index the specified page and all
 Hint: Use the singular form of the object name (e.g., 'Opportunity' instead of 'Opportunities').`,
      },
    ],
+    advanced_values: [],
  },
  sharepoint: {
    description: "Configure SharePoint connector",
@ -339,6 +374,7 @@ Hint: Use the singular form of the object name (e.g., 'Opportunity' instead of '
 `,
      },
    ],
+    advanced_values: [],
  },
  teams: {
    description: "Configure Teams connector",
@ -352,6 +388,7 @@ Hint: Use the singular form of the object name (e.g., 'Opportunity' instead of '
        description: `Specify 0 or more Teams to index. For example, specifying the Team 'Support' for the 'danswerai' Org will cause us to only index messages sent in channels belonging to the 'Support' Team. If no Teams are specified, all Teams in your organization will be indexed.`,
      },
    ],
+    advanced_values: [],
  },
  discourse: {
    description: "Configure Discourse connector",
@ -371,6 +408,7 @@ Hint: Use the singular form of the object name (e.g., 'Opportunity' instead of '
        optional: true,
      },
    ],
+    advanced_values: [],
  },
  axero: {
    description: "Configure Axero connector",
@ -385,11 +423,13 @@ Hint: Use the singular form of the object name (e.g., 'Opportunity' instead of '
          "Specify zero or more Spaces to index (by the Space IDs). If no Space IDs are specified, all Spaces will be indexed.",
      },
    ],
+    advanced_values: [],
    overrideDefaultFreq: 60 * 60 * 24,
  },
  productboard: {
    description: "Configure Productboard connector",
    values: [],
+    advanced_values: [],
  },
  slack: {
    description: "Configure Slack connector",
@ -401,6 +441,8 @@ Hint: Use the singular form of the object name (e.g., 'Opportunity' instead of '
        name: "workspace",
        optional: false,
      },
+    ],
+    advanced_values: [
      {
        type: "list",
        query: "Enter channels to include:",
@ -434,10 +476,12 @@ For example, specifying .*-support.* as a "channel" will cause the connector to
        description: `Specify the base URL for your Slab team. This will look something like: https://danswer.slab.com/`,
      },
    ],
+    advanced_values: [],
  },
  guru: {
    description: "Configure Guru connector",
    values: [],
+    advanced_values: [],
  },
  gong: {
    description: "Configure Gong connector",
@ -452,6 +496,7 @@ For example, specifying .*-support.* as a "channel" will cause the connector to
          "Specify 0 or more workspaces to index. Provide the workspace ID or the EXACT workspace name from Gong. If no workspaces are specified, transcripts from all workspaces will be indexed.",
      },
    ],
+    advanced_values: [],
  },
  loopio: {
    description: "Configure Loopio connector",
@ -466,6 +511,7 @@ For example, specifying .*-support.* as a "channel" will cause the connector to
        optional: true,
      },
    ],
+    advanced_values: [],
    overrideDefaultFreq: 60 * 60 * 24,
  },
  file: {
@ -479,6 +525,7 @@ For example, specifying .*-support.* as a "channel" will cause the connector to
        optional: false,
      },
    ],
+    advanced_values: [],
  },
  zulip: {
    description: "Configure Zulip connector",
@ -498,6 +545,7 @@ For example, specifying .*-support.* as a "channel" will cause the connector to
        optional: false,
      },
    ],
+    advanced_values: [],
  },
  notion: {
    description: "Configure Notion connector",
@ -512,14 +560,17 @@ For example, specifying .*-support.* as a "channel" will cause the connector to
          "If specified, will only index the specified page + all of its child pages. If left blank, will index all pages the integration has been given access to.",
      },
    ],
+    advanced_values: [],
  },
  requesttracker: {
-    description: "Configure HubSpot connector",
+    description: "Configure Request Tracker connector",
    values: [],
+    advanced_values: [],
  },
  hubspot: {
    description: "Configure HubSpot connector",
    values: [],
+    advanced_values: [],
  },
  document360: {
    description: "Configure Document360 connector",
@ -541,6 +592,7 @@ For example, specifying .*-support.* as a "channel" will cause the connector to
          "Specify 0 or more categories to index. For instance, specifying the category 'Help' will cause us to only index all content within the 'Help' category. If no categories are specified, all categories in your workspace will be indexed.",
      },
    ],
+    advanced_values: [],
  },
  clickup: {
    description: "Configure ClickUp connector",
@ -576,6 +628,7 @@ For example, specifying .*-support.* as a "channel" will cause the connector to
        optional: false,
      },
    ],
+    advanced_values: [],
  },
  google_sites: {
    description: "Configure Google Sites connector",
@ -597,6 +650,7 @@ For example, specifying .*-support.* as a "channel" will cause the connector to
        optional: false,
      },
    ],
+    advanced_values: [],
  },
  zendesk: {
    description: "Configure Zendesk connector",
@ -614,14 +668,17 @@ For example, specifying .*-support.* as a "channel" will cause the connector to
        default: "articles",
      },
    ],
+    advanced_values: [],
  },
  linear: {
    description: "Configure Linear connector",
    values: [],
+    advanced_values: [],
  },
  dropbox: {
    description: "Configure Dropbox connector",
    values: [],
+    advanced_values: [],
  },
  s3: {
    description: "Configure S3 connector",
@ -649,6 +706,7 @@ For example, specifying .*-support.* as a "channel" will cause the connector to
        hidden: true,
      },
    ],
+    advanced_values: [],
    overrideDefaultFreq: 60 * 60 * 24,
  },
  r2: {
@ -677,6 +735,7 @@ For example, specifying .*-support.* as a "channel" will cause the connector to
        hidden: true,
      },
    ],
+    advanced_values: [],
    overrideDefaultFreq: 60 * 60 * 24,
  },
  google_cloud_storage: {
@ -706,6 +765,7 @@ For example, specifying .*-support.* as a "channel" will cause the connector to
        hidden: true,
      },
    ],
+    advanced_values: [],
    overrideDefaultFreq: 60 * 60 * 24,
  },
  oci_storage: {
@ -734,6 +794,7 @@ For example, specifying .*-support.* as a "channel" will cause the connector to
        hidden: true,
      },
    ],
+    advanced_values: [],
  },
  wikipedia: {
    description: "Configure Wikipedia connector",
@ -773,6 +834,7 @@ For example, specifying .*-support.* as a "channel" will cause the connector to
        optional: false,
      },
    ],
+    advanced_values: [],
  },
  xenforo: {
    description: "Configure Xenforo connector",
@ -787,6 +849,7 @@ For example, specifying .*-support.* as a "channel" will cause the connector to
          "The XenForo v2.2 forum URL to index. Can be board or thread.",
      },
    ],
+    advanced_values: [],
  },
  asana: {
    description: "Configure Asana connector",
@ -819,6 +882,7 @@ For example, specifying .*-support.* as a "channel" will cause the connector to
          "ID of a team to use for accessing team-visible tasks. This allows indexing of team-visible tasks in addition to public tasks. Leave empty if you don't want to use this feature.",
      },
    ],
+    advanced_values: [],
  },
  mediawiki: {
    description: "Configure MediaWiki connector",
@ -866,6 +930,7 @@ For example, specifying .*-support.* as a "channel" will cause the connector to
        optional: true,
      },
    ],
+    advanced_values: [],
  },
 };
 export function createConnectorInitialValues(
@ -987,10 +1052,11 @@ export interface BookstackConfig {}

 export interface ConfluenceConfig {
  wiki_base: string;
-  space: string;
+  space?: string;
  page_id?: string;
  is_cloud?: boolean;
  index_recursively?: boolean;
+  cql_query?: string;
 }

 export interface JiraConfig {