Added support for Sites.Selected permissions to the SharePoint Connector and enabled the selection of individual subfolders (#1583)

This commit is contained in:
Nils
2024-06-08 00:47:17 +02:00
committed by GitHub
parent 4274c114c5
commit 7a3c102c74
3 changed files with 66 additions and 45 deletions

View File

@@ -2,7 +2,8 @@ import io
import os
from datetime import datetime
from datetime import timezone
from typing import Any
from typing import Any, Optional
from dataclasses import dataclass, field
import msal # type: ignore
from office365.graph_client import GraphClient # type: ignore
@@ -58,6 +59,13 @@ def get_text_from_pptx_driveitem(driveitem_object: DriveItem) -> str:
file_content = driveitem_object.get_content().execute_query().value
return pptx_to_text(file=io.BytesIO(file_content))
@dataclass
class SiteData:
url: str
folder: Optional[str]
siteobjects: list = field(default_factory=list)
driveitems: list = field(default_factory=list)
class SharepointConnector(LoadConnector, PollConnector):
def __init__(
@@ -67,7 +75,19 @@ class SharepointConnector(LoadConnector, PollConnector):
) -> None:
self.batch_size = batch_size
self.graph_client: GraphClient | None = None
self.requested_site_list: list[str] = sites
self.site_data = self.extract_site_and_folder(sites)
@staticmethod
def extract_site_and_folder(site_urls: list[str]) -> list[SiteData]:
site_data_list = []
for url in site_urls:
parts = url.strip().split("/")
if "sites" in parts:
sites_index = parts.index("sites")
site_url = "/".join(parts[:sites_index + 2])
folder = parts[sites_index + 2] if len(parts) > sites_index + 2 else None
site_data_list.append(SiteData(url=site_url, folder=folder, siteobjects=[], driveitems=[]))
return site_data_list
def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
aad_client_id = credentials["aad_client_id"]
@@ -94,7 +114,6 @@ class SharepointConnector(LoadConnector, PollConnector):
def get_all_driveitem_objects(
self,
site_object_list: list[Site],
start: datetime | None = None,
end: datetime | None = None,
) -> list[DriveItem]:
@@ -103,15 +122,24 @@ class SharepointConnector(LoadConnector, PollConnector):
filter_str = f"last_modified_datetime ge {start.isoformat()} and last_modified_datetime le {end.isoformat()}"
driveitem_list = []
for site_object in site_object_list:
site_list_objects = site_object.lists.get().execute_query()
for site_list_object in site_list_objects:
for element in self.site_data:
site_objects_list = []
for site_object in element.siteobjects:
site_objects_sublist = site_object.lists.get().execute_query()
site_objects_list.extend(site_objects_sublist)
for site_object in site_objects_list:
try:
query = site_list_object.drive.root.get_files(True, 1000)
query = site_object.drive.root.get_files(True, 1000)
if filter_str:
query = query.filter(filter_str)
driveitems = query.execute_query()
driveitem_list.extend(driveitems)
if element.folder:
filtered_driveitems = [item for item in driveitems if element.folder in item.parent_reference.path]
element.driveitems.extend(filtered_driveitems)
else:
element.driveitems.extend(driveitems)
except Exception:
# Sites include things that do not contain .drive.root so this fails
# but this is fine, as there are no actually documents in those
@@ -123,20 +151,12 @@ class SharepointConnector(LoadConnector, PollConnector):
if self.graph_client is None:
raise ConnectorMissingCredentialError("Sharepoint")
site_object_list: list[Site] = []
sites_object = self.graph_client.sites.get().execute_query()
if len(self.requested_site_list) > 0:
for requested_site in self.requested_site_list:
adjusted_string = "/" + requested_site.replace(" ", "")
for site_object in sites_object:
if site_object.web_url.endswith(adjusted_string):
site_object_list.append(site_object)
if self.site_data:
for element in self.site_data:
element.siteobjects = [self.graph_client.sites.get_by_url(element.url).get().execute_query()]
else:
site_object_list.extend(sites_object)
return site_object_list
site_objects = self.graph_client.sites.get().execute_query()
self.site_data = [SiteData(url=None, folder=None, siteobjects=site_objects, driveitems=[])]
def _fetch_from_sharepoint(
self, start: datetime | None = None, end: datetime | None = None
@@ -144,28 +164,24 @@ class SharepointConnector(LoadConnector, PollConnector):
if self.graph_client is None:
raise ConnectorMissingCredentialError("Sharepoint")
site_object_list = self.get_all_site_objects()
driveitem_list = self.get_all_driveitem_objects(
site_object_list=site_object_list,
start=start,
end=end,
)
self.get_all_site_objects()
self.get_all_driveitem_objects(start=start, end=end)
# goes over all urls, converts them into Document objects and then yields them in batches
doc_batch: list[Document] = []
batch_count = 0
for driveitem_object in driveitem_list:
logger.debug(f"Processing: {driveitem_object.web_url}")
doc_batch.append(
self.convert_driveitem_object_to_document(driveitem_object)
)
for element in self.site_data:
for driveitem_object in element.driveitems:
logger.debug(f"Processing: {driveitem_object.web_url}")
doc_batch.append(
self.convert_driveitem_object_to_document(driveitem_object)
)
batch_count += 1
if batch_count >= self.batch_size:
yield doc_batch
batch_count = 0
doc_batch = []
batch_count += 1
if batch_count >= self.batch_size:
yield doc_batch
batch_count = 0
doc_batch = []
yield doc_batch
def convert_driveitem_object_to_document(
@@ -230,4 +246,4 @@ if __name__ == "__main__":
}
)
document_batches = connector.load_from_state()
print(next(document_batches))
print(next(document_batches))

View File

@@ -233,11 +233,16 @@ const MainSection = () => {
formBodyBuilder={TextArrayFieldBuilder({
name: "sites",
label: "Sites:",
subtext:
"Specify 0 or more sites to index. For example, specifying the site " +
"'support' for the 'danswerai' sharepoint will cause us to only index documents " +
"within the 'https://danswerai.sharepoint.com/sites/support' site. " +
"If no sites are specified, all sites in your organization will be indexed.",
subtext: (
<>
<br />
<ul>
<li> If no sites are specified, all sites in your organization will be indexed (Sites.Read.All permission required).</li>
<li> Specifying &apos;https://danswerai.sharepoint.com/sites/support&apos; for example will only index documents within this site.</li>
<li> Specifying &apos;https://danswerai.sharepoint.com/sites/support/subfolder&apos; for example will only index documents within this folder.</li>
</ul>
</>
),
})}
validationSchema={Yup.object().shape({
sites: Yup.array()

View File

@@ -214,7 +214,7 @@ export function TextArrayField<T extends Yup.AnyObject>({
interface TextArrayFieldBuilderProps<T extends Yup.AnyObject> {
name: string;
label: string;
subtext?: string;
subtext?: string | JSX.Element;
type?: string;
}