mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-10-09 20:55:06 +02:00
Added support for Sites.Selected permissions to the SharePoint Connector and enabled the selection of individual subfolders (#1583)
This commit is contained in:
@@ -2,7 +2,8 @@ import io
|
||||
import os
|
||||
from datetime import datetime
|
||||
from datetime import timezone
|
||||
from typing import Any
|
||||
from typing import Any, Optional
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
import msal # type: ignore
|
||||
from office365.graph_client import GraphClient # type: ignore
|
||||
@@ -58,6 +59,13 @@ def get_text_from_pptx_driveitem(driveitem_object: DriveItem) -> str:
|
||||
file_content = driveitem_object.get_content().execute_query().value
|
||||
return pptx_to_text(file=io.BytesIO(file_content))
|
||||
|
||||
@dataclass
|
||||
class SiteData:
|
||||
url: str
|
||||
folder: Optional[str]
|
||||
siteobjects: list = field(default_factory=list)
|
||||
driveitems: list = field(default_factory=list)
|
||||
|
||||
|
||||
class SharepointConnector(LoadConnector, PollConnector):
|
||||
def __init__(
|
||||
@@ -67,7 +75,19 @@ class SharepointConnector(LoadConnector, PollConnector):
|
||||
) -> None:
|
||||
self.batch_size = batch_size
|
||||
self.graph_client: GraphClient | None = None
|
||||
self.requested_site_list: list[str] = sites
|
||||
self.site_data = self.extract_site_and_folder(sites)
|
||||
|
||||
@staticmethod
|
||||
def extract_site_and_folder(site_urls: list[str]) -> list[SiteData]:
|
||||
site_data_list = []
|
||||
for url in site_urls:
|
||||
parts = url.strip().split("/")
|
||||
if "sites" in parts:
|
||||
sites_index = parts.index("sites")
|
||||
site_url = "/".join(parts[:sites_index + 2])
|
||||
folder = parts[sites_index + 2] if len(parts) > sites_index + 2 else None
|
||||
site_data_list.append(SiteData(url=site_url, folder=folder, siteobjects=[], driveitems=[]))
|
||||
return site_data_list
|
||||
|
||||
def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
|
||||
aad_client_id = credentials["aad_client_id"]
|
||||
@@ -94,7 +114,6 @@ class SharepointConnector(LoadConnector, PollConnector):
|
||||
|
||||
def get_all_driveitem_objects(
|
||||
self,
|
||||
site_object_list: list[Site],
|
||||
start: datetime | None = None,
|
||||
end: datetime | None = None,
|
||||
) -> list[DriveItem]:
|
||||
@@ -103,15 +122,24 @@ class SharepointConnector(LoadConnector, PollConnector):
|
||||
filter_str = f"last_modified_datetime ge {start.isoformat()} and last_modified_datetime le {end.isoformat()}"
|
||||
|
||||
driveitem_list = []
|
||||
for site_object in site_object_list:
|
||||
site_list_objects = site_object.lists.get().execute_query()
|
||||
for site_list_object in site_list_objects:
|
||||
for element in self.site_data:
|
||||
site_objects_list = []
|
||||
for site_object in element.siteobjects:
|
||||
site_objects_sublist = site_object.lists.get().execute_query()
|
||||
site_objects_list.extend(site_objects_sublist)
|
||||
|
||||
for site_object in site_objects_list:
|
||||
try:
|
||||
query = site_list_object.drive.root.get_files(True, 1000)
|
||||
query = site_object.drive.root.get_files(True, 1000)
|
||||
if filter_str:
|
||||
query = query.filter(filter_str)
|
||||
driveitems = query.execute_query()
|
||||
driveitem_list.extend(driveitems)
|
||||
if element.folder:
|
||||
filtered_driveitems = [item for item in driveitems if element.folder in item.parent_reference.path]
|
||||
element.driveitems.extend(filtered_driveitems)
|
||||
else:
|
||||
element.driveitems.extend(driveitems)
|
||||
|
||||
except Exception:
|
||||
# Sites include things that do not contain .drive.root so this fails
|
||||
# but this is fine, as there are no actually documents in those
|
||||
@@ -123,20 +151,12 @@ class SharepointConnector(LoadConnector, PollConnector):
|
||||
if self.graph_client is None:
|
||||
raise ConnectorMissingCredentialError("Sharepoint")
|
||||
|
||||
site_object_list: list[Site] = []
|
||||
|
||||
sites_object = self.graph_client.sites.get().execute_query()
|
||||
|
||||
if len(self.requested_site_list) > 0:
|
||||
for requested_site in self.requested_site_list:
|
||||
adjusted_string = "/" + requested_site.replace(" ", "")
|
||||
for site_object in sites_object:
|
||||
if site_object.web_url.endswith(adjusted_string):
|
||||
site_object_list.append(site_object)
|
||||
if self.site_data:
|
||||
for element in self.site_data:
|
||||
element.siteobjects = [self.graph_client.sites.get_by_url(element.url).get().execute_query()]
|
||||
else:
|
||||
site_object_list.extend(sites_object)
|
||||
|
||||
return site_object_list
|
||||
site_objects = self.graph_client.sites.get().execute_query()
|
||||
self.site_data = [SiteData(url=None, folder=None, siteobjects=site_objects, driveitems=[])]
|
||||
|
||||
def _fetch_from_sharepoint(
|
||||
self, start: datetime | None = None, end: datetime | None = None
|
||||
@@ -144,28 +164,24 @@ class SharepointConnector(LoadConnector, PollConnector):
|
||||
if self.graph_client is None:
|
||||
raise ConnectorMissingCredentialError("Sharepoint")
|
||||
|
||||
site_object_list = self.get_all_site_objects()
|
||||
|
||||
driveitem_list = self.get_all_driveitem_objects(
|
||||
site_object_list=site_object_list,
|
||||
start=start,
|
||||
end=end,
|
||||
)
|
||||
self.get_all_site_objects()
|
||||
self.get_all_driveitem_objects(start=start, end=end)
|
||||
|
||||
# goes over all urls, converts them into Document objects and then yields them in batches
|
||||
doc_batch: list[Document] = []
|
||||
batch_count = 0
|
||||
for driveitem_object in driveitem_list:
|
||||
logger.debug(f"Processing: {driveitem_object.web_url}")
|
||||
doc_batch.append(
|
||||
self.convert_driveitem_object_to_document(driveitem_object)
|
||||
)
|
||||
for element in self.site_data:
|
||||
for driveitem_object in element.driveitems:
|
||||
logger.debug(f"Processing: {driveitem_object.web_url}")
|
||||
doc_batch.append(
|
||||
self.convert_driveitem_object_to_document(driveitem_object)
|
||||
)
|
||||
|
||||
batch_count += 1
|
||||
if batch_count >= self.batch_size:
|
||||
yield doc_batch
|
||||
batch_count = 0
|
||||
doc_batch = []
|
||||
batch_count += 1
|
||||
if batch_count >= self.batch_size:
|
||||
yield doc_batch
|
||||
batch_count = 0
|
||||
doc_batch = []
|
||||
yield doc_batch
|
||||
|
||||
def convert_driveitem_object_to_document(
|
||||
@@ -230,4 +246,4 @@ if __name__ == "__main__":
|
||||
}
|
||||
)
|
||||
document_batches = connector.load_from_state()
|
||||
print(next(document_batches))
|
||||
print(next(document_batches))
|
@@ -233,11 +233,16 @@ const MainSection = () => {
|
||||
formBodyBuilder={TextArrayFieldBuilder({
|
||||
name: "sites",
|
||||
label: "Sites:",
|
||||
subtext:
|
||||
"Specify 0 or more sites to index. For example, specifying the site " +
|
||||
"'support' for the 'danswerai' sharepoint will cause us to only index documents " +
|
||||
"within the 'https://danswerai.sharepoint.com/sites/support' site. " +
|
||||
"If no sites are specified, all sites in your organization will be indexed.",
|
||||
subtext: (
|
||||
<>
|
||||
<br />
|
||||
<ul>
|
||||
<li>• If no sites are specified, all sites in your organization will be indexed (Sites.Read.All permission required).</li>
|
||||
<li>• Specifying 'https://danswerai.sharepoint.com/sites/support' for example will only index documents within this site.</li>
|
||||
<li>• Specifying 'https://danswerai.sharepoint.com/sites/support/subfolder' for example will only index documents within this folder.</li>
|
||||
</ul>
|
||||
</>
|
||||
),
|
||||
})}
|
||||
validationSchema={Yup.object().shape({
|
||||
sites: Yup.array()
|
||||
|
@@ -214,7 +214,7 @@ export function TextArrayField<T extends Yup.AnyObject>({
|
||||
interface TextArrayFieldBuilderProps<T extends Yup.AnyObject> {
|
||||
name: string;
|
||||
label: string;
|
||||
subtext?: string;
|
||||
subtext?: string | JSX.Element;
|
||||
type?: string;
|
||||
}
|
||||
|
||||
|
Reference in New Issue
Block a user