Added support for Sites.Selected permissions to the SharePoint Connector and enabled the selection of individual subfolders (#1583)

This commit is contained in:
Nils
2024-06-08 00:47:17 +02:00
committed by GitHub
parent 4274c114c5
commit 7a3c102c74
3 changed files with 66 additions and 45 deletions

View File

@@ -2,7 +2,8 @@ import io
import os import os
from datetime import datetime from datetime import datetime
from datetime import timezone from datetime import timezone
from typing import Any from typing import Any, Optional
from dataclasses import dataclass, field
import msal # type: ignore import msal # type: ignore
from office365.graph_client import GraphClient # type: ignore from office365.graph_client import GraphClient # type: ignore
@@ -58,6 +59,13 @@ def get_text_from_pptx_driveitem(driveitem_object: DriveItem) -> str:
file_content = driveitem_object.get_content().execute_query().value file_content = driveitem_object.get_content().execute_query().value
return pptx_to_text(file=io.BytesIO(file_content)) return pptx_to_text(file=io.BytesIO(file_content))
@dataclass
class SiteData:
    """State the SharePoint connector keeps for one configured site entry."""

    # URL of the site; None is used for the catch-all entry that is created
    # when no sites were configured.
    url: Optional[str]
    # Folder name used to narrow down the drive items of the site;
    # None means no folder restriction.
    folder: Optional[str]
    # Site objects fetched for this entry.
    siteobjects: list = field(default_factory=list)
    # Drive items collected for this entry.
    driveitems: list = field(default_factory=list)
class SharepointConnector(LoadConnector, PollConnector): class SharepointConnector(LoadConnector, PollConnector):
def __init__( def __init__(
@@ -67,7 +75,19 @@ class SharepointConnector(LoadConnector, PollConnector):
) -> None: ) -> None:
self.batch_size = batch_size self.batch_size = batch_size
self.graph_client: GraphClient | None = None self.graph_client: GraphClient | None = None
self.requested_site_list: list[str] = sites self.site_data = self.extract_site_and_folder(sites)
@staticmethod
def extract_site_and_folder(site_urls: list[str]) -> list[SiteData]:
    """Split each configured URL into a site URL and an optional folder.

    A URL is expected to contain a ``sites`` path segment followed by the
    site name, e.g. ``https://tenant.sharepoint.com/sites/support``. The
    first non-empty path segment after the site name, if any, is taken as
    the folder. URLs without a ``sites`` segment are skipped.

    Args:
        site_urls: URLs as entered in the connector configuration.

    Returns:
        One ``SiteData`` per URL that contains a ``sites`` segment, with
        ``folder`` set to ``None`` when no folder was given.
    """
    site_data_list = []
    for url in site_urls:
        parts = url.strip().split("/")
        if "sites" not in parts:
            continue
        sites_index = parts.index("sites")
        site_url = "/".join(parts[: sites_index + 2])
        # Ignore empty segments so a trailing or doubled slash yields
        # folder=None rather than an empty string.
        trailing_segments = [part for part in parts[sites_index + 2 :] if part]
        folder = trailing_segments[0] if trailing_segments else None
        site_data_list.append(SiteData(url=site_url, folder=folder))
    return site_data_list
def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None: def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
aad_client_id = credentials["aad_client_id"] aad_client_id = credentials["aad_client_id"]
@@ -94,7 +114,6 @@ class SharepointConnector(LoadConnector, PollConnector):
def get_all_driveitem_objects( def get_all_driveitem_objects(
self, self,
site_object_list: list[Site],
start: datetime | None = None, start: datetime | None = None,
end: datetime | None = None, end: datetime | None = None,
) -> list[DriveItem]: ) -> list[DriveItem]:
@@ -103,15 +122,24 @@ class SharepointConnector(LoadConnector, PollConnector):
filter_str = f"last_modified_datetime ge {start.isoformat()} and last_modified_datetime le {end.isoformat()}" filter_str = f"last_modified_datetime ge {start.isoformat()} and last_modified_datetime le {end.isoformat()}"
driveitem_list = [] driveitem_list = []
for site_object in site_object_list: for element in self.site_data:
site_list_objects = site_object.lists.get().execute_query() site_objects_list = []
for site_list_object in site_list_objects: for site_object in element.siteobjects:
site_objects_sublist = site_object.lists.get().execute_query()
site_objects_list.extend(site_objects_sublist)
for site_object in site_objects_list:
try: try:
query = site_list_object.drive.root.get_files(True, 1000) query = site_object.drive.root.get_files(True, 1000)
if filter_str: if filter_str:
query = query.filter(filter_str) query = query.filter(filter_str)
driveitems = query.execute_query() driveitems = query.execute_query()
driveitem_list.extend(driveitems) if element.folder:
filtered_driveitems = [item for item in driveitems if element.folder in item.parent_reference.path]
element.driveitems.extend(filtered_driveitems)
else:
element.driveitems.extend(driveitems)
except Exception: except Exception:
# Sites include things that do not contain .drive.root so this fails # Sites include things that do not contain .drive.root so this fails
# but this is fine, as there are no actual documents in those # but this is fine, as there are no actual documents in those
@@ -123,20 +151,12 @@ class SharepointConnector(LoadConnector, PollConnector):
if self.graph_client is None: if self.graph_client is None:
raise ConnectorMissingCredentialError("Sharepoint") raise ConnectorMissingCredentialError("Sharepoint")
site_object_list: list[Site] = [] if self.site_data:
for element in self.site_data:
sites_object = self.graph_client.sites.get().execute_query() element.siteobjects = [self.graph_client.sites.get_by_url(element.url).get().execute_query()]
if len(self.requested_site_list) > 0:
for requested_site in self.requested_site_list:
adjusted_string = "/" + requested_site.replace(" ", "")
for site_object in sites_object:
if site_object.web_url.endswith(adjusted_string):
site_object_list.append(site_object)
else: else:
site_object_list.extend(sites_object) site_objects = self.graph_client.sites.get().execute_query()
self.site_data = [SiteData(url=None, folder=None, siteobjects=site_objects, driveitems=[])]
return site_object_list
def _fetch_from_sharepoint( def _fetch_from_sharepoint(
self, start: datetime | None = None, end: datetime | None = None self, start: datetime | None = None, end: datetime | None = None
@@ -144,18 +164,14 @@ class SharepointConnector(LoadConnector, PollConnector):
if self.graph_client is None: if self.graph_client is None:
raise ConnectorMissingCredentialError("Sharepoint") raise ConnectorMissingCredentialError("Sharepoint")
site_object_list = self.get_all_site_objects() self.get_all_site_objects()
self.get_all_driveitem_objects(start=start, end=end)
driveitem_list = self.get_all_driveitem_objects(
site_object_list=site_object_list,
start=start,
end=end,
)
# goes over all urls, converts them into Document objects and then yields them in batches # goes over all urls, converts them into Document objects and then yields them in batches
doc_batch: list[Document] = [] doc_batch: list[Document] = []
batch_count = 0 batch_count = 0
for driveitem_object in driveitem_list: for element in self.site_data:
for driveitem_object in element.driveitems:
logger.debug(f"Processing: {driveitem_object.web_url}") logger.debug(f"Processing: {driveitem_object.web_url}")
doc_batch.append( doc_batch.append(
self.convert_driveitem_object_to_document(driveitem_object) self.convert_driveitem_object_to_document(driveitem_object)

View File

@@ -233,11 +233,16 @@ const MainSection = () => {
formBodyBuilder={TextArrayFieldBuilder({ formBodyBuilder={TextArrayFieldBuilder({
name: "sites", name: "sites",
label: "Sites:", label: "Sites:",
subtext: subtext: (
"Specify 0 or more sites to index. For example, specifying the site " + <>
"'support' for the 'danswerai' sharepoint will cause us to only index documents " + <br />
"within the 'https://danswerai.sharepoint.com/sites/support' site. " + <ul>
"If no sites are specified, all sites in your organization will be indexed.", <li> If no sites are specified, all sites in your organization will be indexed (Sites.Read.All permission required).</li>
<li> Specifying &apos;https://danswerai.sharepoint.com/sites/support&apos; for example will only index documents within this site.</li>
<li> Specifying &apos;https://danswerai.sharepoint.com/sites/support/subfolder&apos; for example will only index documents within this folder.</li>
</ul>
</>
),
})} })}
validationSchema={Yup.object().shape({ validationSchema={Yup.object().shape({
sites: Yup.array() sites: Yup.array()

View File

@@ -214,7 +214,7 @@ export function TextArrayField<T extends Yup.AnyObject>({
interface TextArrayFieldBuilderProps<T extends Yup.AnyObject> { interface TextArrayFieldBuilderProps<T extends Yup.AnyObject> {
name: string; name: string;
label: string; label: string;
subtext?: string; subtext?: string | JSX.Element;
type?: string; type?: string;
} }