mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-10-11 05:36:03 +02:00
Added support for Sites.Selected permissions to the SharePoint Connector and enabled the selection of individual subfolders (#1583)
This commit is contained in:
@@ -2,7 +2,8 @@ import io
|
|||||||
import os
|
import os
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from datetime import timezone
|
from datetime import timezone
|
||||||
from typing import Any
|
from typing import Any, Optional
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
|
||||||
import msal # type: ignore
|
import msal # type: ignore
|
||||||
from office365.graph_client import GraphClient # type: ignore
|
from office365.graph_client import GraphClient # type: ignore
|
||||||
@@ -58,6 +59,13 @@ def get_text_from_pptx_driveitem(driveitem_object: DriveItem) -> str:
|
|||||||
file_content = driveitem_object.get_content().execute_query().value
|
file_content = driveitem_object.get_content().execute_query().value
|
||||||
return pptx_to_text(file=io.BytesIO(file_content))
|
return pptx_to_text(file=io.BytesIO(file_content))
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class SiteData:
|
||||||
|
url: str
|
||||||
|
folder: Optional[str]
|
||||||
|
siteobjects: list = field(default_factory=list)
|
||||||
|
driveitems: list = field(default_factory=list)
|
||||||
|
|
||||||
|
|
||||||
class SharepointConnector(LoadConnector, PollConnector):
|
class SharepointConnector(LoadConnector, PollConnector):
|
||||||
def __init__(
|
def __init__(
|
||||||
@@ -67,7 +75,19 @@ class SharepointConnector(LoadConnector, PollConnector):
|
|||||||
) -> None:
|
) -> None:
|
||||||
self.batch_size = batch_size
|
self.batch_size = batch_size
|
||||||
self.graph_client: GraphClient | None = None
|
self.graph_client: GraphClient | None = None
|
||||||
self.requested_site_list: list[str] = sites
|
self.site_data = self.extract_site_and_folder(sites)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def extract_site_and_folder(site_urls: list[str]) -> list[SiteData]:
|
||||||
|
site_data_list = []
|
||||||
|
for url in site_urls:
|
||||||
|
parts = url.strip().split("/")
|
||||||
|
if "sites" in parts:
|
||||||
|
sites_index = parts.index("sites")
|
||||||
|
site_url = "/".join(parts[:sites_index + 2])
|
||||||
|
folder = parts[sites_index + 2] if len(parts) > sites_index + 2 else None
|
||||||
|
site_data_list.append(SiteData(url=site_url, folder=folder, siteobjects=[], driveitems=[]))
|
||||||
|
return site_data_list
|
||||||
|
|
||||||
def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
|
def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
|
||||||
aad_client_id = credentials["aad_client_id"]
|
aad_client_id = credentials["aad_client_id"]
|
||||||
@@ -94,7 +114,6 @@ class SharepointConnector(LoadConnector, PollConnector):
|
|||||||
|
|
||||||
def get_all_driveitem_objects(
|
def get_all_driveitem_objects(
|
||||||
self,
|
self,
|
||||||
site_object_list: list[Site],
|
|
||||||
start: datetime | None = None,
|
start: datetime | None = None,
|
||||||
end: datetime | None = None,
|
end: datetime | None = None,
|
||||||
) -> list[DriveItem]:
|
) -> list[DriveItem]:
|
||||||
@@ -103,15 +122,24 @@ class SharepointConnector(LoadConnector, PollConnector):
|
|||||||
filter_str = f"last_modified_datetime ge {start.isoformat()} and last_modified_datetime le {end.isoformat()}"
|
filter_str = f"last_modified_datetime ge {start.isoformat()} and last_modified_datetime le {end.isoformat()}"
|
||||||
|
|
||||||
driveitem_list = []
|
driveitem_list = []
|
||||||
for site_object in site_object_list:
|
for element in self.site_data:
|
||||||
site_list_objects = site_object.lists.get().execute_query()
|
site_objects_list = []
|
||||||
for site_list_object in site_list_objects:
|
for site_object in element.siteobjects:
|
||||||
|
site_objects_sublist = site_object.lists.get().execute_query()
|
||||||
|
site_objects_list.extend(site_objects_sublist)
|
||||||
|
|
||||||
|
for site_object in site_objects_list:
|
||||||
try:
|
try:
|
||||||
query = site_list_object.drive.root.get_files(True, 1000)
|
query = site_object.drive.root.get_files(True, 1000)
|
||||||
if filter_str:
|
if filter_str:
|
||||||
query = query.filter(filter_str)
|
query = query.filter(filter_str)
|
||||||
driveitems = query.execute_query()
|
driveitems = query.execute_query()
|
||||||
driveitem_list.extend(driveitems)
|
if element.folder:
|
||||||
|
filtered_driveitems = [item for item in driveitems if element.folder in item.parent_reference.path]
|
||||||
|
element.driveitems.extend(filtered_driveitems)
|
||||||
|
else:
|
||||||
|
element.driveitems.extend(driveitems)
|
||||||
|
|
||||||
except Exception:
|
except Exception:
|
||||||
# Sites include things that do not contain .drive.root so this fails
|
# Sites include things that do not contain .drive.root so this fails
|
||||||
# but this is fine, as there are no actually documents in those
|
# but this is fine, as there are no actually documents in those
|
||||||
@@ -123,20 +151,12 @@ class SharepointConnector(LoadConnector, PollConnector):
|
|||||||
if self.graph_client is None:
|
if self.graph_client is None:
|
||||||
raise ConnectorMissingCredentialError("Sharepoint")
|
raise ConnectorMissingCredentialError("Sharepoint")
|
||||||
|
|
||||||
site_object_list: list[Site] = []
|
if self.site_data:
|
||||||
|
for element in self.site_data:
|
||||||
sites_object = self.graph_client.sites.get().execute_query()
|
element.siteobjects = [self.graph_client.sites.get_by_url(element.url).get().execute_query()]
|
||||||
|
|
||||||
if len(self.requested_site_list) > 0:
|
|
||||||
for requested_site in self.requested_site_list:
|
|
||||||
adjusted_string = "/" + requested_site.replace(" ", "")
|
|
||||||
for site_object in sites_object:
|
|
||||||
if site_object.web_url.endswith(adjusted_string):
|
|
||||||
site_object_list.append(site_object)
|
|
||||||
else:
|
else:
|
||||||
site_object_list.extend(sites_object)
|
site_objects = self.graph_client.sites.get().execute_query()
|
||||||
|
self.site_data = [SiteData(url=None, folder=None, siteobjects=site_objects, driveitems=[])]
|
||||||
return site_object_list
|
|
||||||
|
|
||||||
def _fetch_from_sharepoint(
|
def _fetch_from_sharepoint(
|
||||||
self, start: datetime | None = None, end: datetime | None = None
|
self, start: datetime | None = None, end: datetime | None = None
|
||||||
@@ -144,18 +164,14 @@ class SharepointConnector(LoadConnector, PollConnector):
|
|||||||
if self.graph_client is None:
|
if self.graph_client is None:
|
||||||
raise ConnectorMissingCredentialError("Sharepoint")
|
raise ConnectorMissingCredentialError("Sharepoint")
|
||||||
|
|
||||||
site_object_list = self.get_all_site_objects()
|
self.get_all_site_objects()
|
||||||
|
self.get_all_driveitem_objects(start=start, end=end)
|
||||||
driveitem_list = self.get_all_driveitem_objects(
|
|
||||||
site_object_list=site_object_list,
|
|
||||||
start=start,
|
|
||||||
end=end,
|
|
||||||
)
|
|
||||||
|
|
||||||
# goes over all urls, converts them into Document objects and then yields them in batches
|
# goes over all urls, converts them into Document objects and then yields them in batches
|
||||||
doc_batch: list[Document] = []
|
doc_batch: list[Document] = []
|
||||||
batch_count = 0
|
batch_count = 0
|
||||||
for driveitem_object in driveitem_list:
|
for element in self.site_data:
|
||||||
|
for driveitem_object in element.driveitems:
|
||||||
logger.debug(f"Processing: {driveitem_object.web_url}")
|
logger.debug(f"Processing: {driveitem_object.web_url}")
|
||||||
doc_batch.append(
|
doc_batch.append(
|
||||||
self.convert_driveitem_object_to_document(driveitem_object)
|
self.convert_driveitem_object_to_document(driveitem_object)
|
||||||
|
@@ -233,11 +233,16 @@ const MainSection = () => {
|
|||||||
formBodyBuilder={TextArrayFieldBuilder({
|
formBodyBuilder={TextArrayFieldBuilder({
|
||||||
name: "sites",
|
name: "sites",
|
||||||
label: "Sites:",
|
label: "Sites:",
|
||||||
subtext:
|
subtext: (
|
||||||
"Specify 0 or more sites to index. For example, specifying the site " +
|
<>
|
||||||
"'support' for the 'danswerai' sharepoint will cause us to only index documents " +
|
<br />
|
||||||
"within the 'https://danswerai.sharepoint.com/sites/support' site. " +
|
<ul>
|
||||||
"If no sites are specified, all sites in your organization will be indexed.",
|
<li>• If no sites are specified, all sites in your organization will be indexed (Sites.Read.All permission required).</li>
|
||||||
|
<li>• Specifying 'https://danswerai.sharepoint.com/sites/support' for example will only index documents within this site.</li>
|
||||||
|
<li>• Specifying 'https://danswerai.sharepoint.com/sites/support/subfolder' for example will only index documents within this folder.</li>
|
||||||
|
</ul>
|
||||||
|
</>
|
||||||
|
),
|
||||||
})}
|
})}
|
||||||
validationSchema={Yup.object().shape({
|
validationSchema={Yup.object().shape({
|
||||||
sites: Yup.array()
|
sites: Yup.array()
|
||||||
|
@@ -214,7 +214,7 @@ export function TextArrayField<T extends Yup.AnyObject>({
|
|||||||
interface TextArrayFieldBuilderProps<T extends Yup.AnyObject> {
|
interface TextArrayFieldBuilderProps<T extends Yup.AnyObject> {
|
||||||
name: string;
|
name: string;
|
||||||
label: string;
|
label: string;
|
||||||
subtext?: string;
|
subtext?: string | JSX.Element;
|
||||||
type?: string;
|
type?: string;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user