mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-07-08 21:50:12 +02:00
Fixed SharePoint connector polling (#3834)
* Fixed SharePoint connector polling * finish * fix sharepoint connector
This commit is contained in:
@ -127,13 +127,6 @@ class SharepointConnector(LoadConnector, PollConnector):
|
|||||||
start: datetime | None = None,
|
start: datetime | None = None,
|
||||||
end: datetime | None = None,
|
end: datetime | None = None,
|
||||||
) -> list[tuple[DriveItem, str]]:
|
) -> list[tuple[DriveItem, str]]:
|
||||||
filter_str = ""
|
|
||||||
if start is not None and end is not None:
|
|
||||||
filter_str = (
|
|
||||||
f"last_modified_datetime ge {start.isoformat()} and "
|
|
||||||
f"last_modified_datetime le {end.isoformat()}"
|
|
||||||
)
|
|
||||||
|
|
||||||
final_driveitems: list[tuple[DriveItem, str]] = []
|
final_driveitems: list[tuple[DriveItem, str]] = []
|
||||||
try:
|
try:
|
||||||
site = self.graph_client.sites.get_by_url(site_descriptor.url)
|
site = self.graph_client.sites.get_by_url(site_descriptor.url)
|
||||||
@ -167,9 +160,10 @@ class SharepointConnector(LoadConnector, PollConnector):
|
|||||||
root_folder = root_folder.get_by_path(folder_part)
|
root_folder = root_folder.get_by_path(folder_part)
|
||||||
|
|
||||||
# Get all items recursively
|
# Get all items recursively
|
||||||
query = root_folder.get_files(True, 1000)
|
query = root_folder.get_files(
|
||||||
if filter_str:
|
recursive=True,
|
||||||
query = query.filter(filter_str)
|
page_size=1000,
|
||||||
|
)
|
||||||
driveitems = query.execute_query()
|
driveitems = query.execute_query()
|
||||||
logger.debug(
|
logger.debug(
|
||||||
f"Found {len(driveitems)} items in drive '{drive.name}'"
|
f"Found {len(driveitems)} items in drive '{drive.name}'"
|
||||||
@ -180,11 +174,12 @@ class SharepointConnector(LoadConnector, PollConnector):
|
|||||||
"Shared Documents" if drive.name == "Documents" else drive.name
|
"Shared Documents" if drive.name == "Documents" else drive.name
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Filter items based on folder path if specified
|
||||||
if site_descriptor.folder_path:
|
if site_descriptor.folder_path:
|
||||||
# Filter items to ensure they're in the specified folder or its subfolders
|
# Filter items to ensure they're in the specified folder or its subfolders
|
||||||
# The path will be in format: /drives/{drive_id}/root:/folder/path
|
# The path will be in format: /drives/{drive_id}/root:/folder/path
|
||||||
filtered_driveitems = [
|
driveitems = [
|
||||||
(item, drive_name)
|
item
|
||||||
for item in driveitems
|
for item in driveitems
|
||||||
if any(
|
if any(
|
||||||
path_part == site_descriptor.folder_path
|
path_part == site_descriptor.folder_path
|
||||||
@ -196,7 +191,7 @@ class SharepointConnector(LoadConnector, PollConnector):
|
|||||||
)[1].split("/")
|
)[1].split("/")
|
||||||
)
|
)
|
||||||
]
|
]
|
||||||
if len(filtered_driveitems) == 0:
|
if len(driveitems) == 0:
|
||||||
all_paths = [
|
all_paths = [
|
||||||
item.parent_reference.path for item in driveitems
|
item.parent_reference.path for item in driveitems
|
||||||
]
|
]
|
||||||
@ -204,11 +199,23 @@ class SharepointConnector(LoadConnector, PollConnector):
|
|||||||
f"Nothing found for folder '{site_descriptor.folder_path}' "
|
f"Nothing found for folder '{site_descriptor.folder_path}' "
|
||||||
f"in; any of valid paths: {all_paths}"
|
f"in; any of valid paths: {all_paths}"
|
||||||
)
|
)
|
||||||
final_driveitems.extend(filtered_driveitems)
|
|
||||||
else:
|
# Filter items based on time window if specified
|
||||||
final_driveitems.extend(
|
if start is not None and end is not None:
|
||||||
[(item, drive_name) for item in driveitems]
|
driveitems = [
|
||||||
|
item
|
||||||
|
for item in driveitems
|
||||||
|
if start
|
||||||
|
<= item.last_modified_datetime.replace(tzinfo=timezone.utc)
|
||||||
|
<= end
|
||||||
|
]
|
||||||
|
logger.debug(
|
||||||
|
f"Found {len(driveitems)} items within time window in drive '{drive.name}'"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
for item in driveitems:
|
||||||
|
final_driveitems.append((item, drive_name))
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
# Some drives might not be accessible
|
# Some drives might not be accessible
|
||||||
logger.warning(f"Failed to process drive: {str(e)}")
|
logger.warning(f"Failed to process drive: {str(e)}")
|
||||||
|
@ -176,3 +176,35 @@ def test_sharepoint_connector_other_library(
|
|||||||
for expected in expected_documents:
|
for expected in expected_documents:
|
||||||
doc = find_document(found_documents, expected.semantic_identifier)
|
doc = find_document(found_documents, expected.semantic_identifier)
|
||||||
verify_document_content(doc, expected)
|
verify_document_content(doc, expected)
|
||||||
|
|
||||||
|
|
||||||
|
def test_sharepoint_connector_poll(
    mock_get_unstructured_api_key: MagicMock,
    sharepoint_credentials: dict[str, str],
) -> None:
    """Check that a time-bounded fetch returns only the document modified in the window.

    The connector is pointed at the shared test site and queried with a
    20-second window around the modification time of ``test1.docx``
    (2025-01-28 20:51:42+00:00). Exactly one document is expected back, and
    its metadata and content must match the corresponding expected entry.

    Args:
        mock_get_unstructured_api_key: Not referenced in the body; presumably
            requested so the API-key lookup stays patched for the duration of
            the test (confirm against the fixture/patch definition).
        sharepoint_credentials: Credential mapping handed to
            ``connector.load_credentials``.
    """
    # Build the connector against the base site URL, then authenticate it
    connector = SharepointConnector(
        sites=["https://danswerai.sharepoint.com/sites/sharepoint-tests"]
    )
    connector.load_credentials(sharepoint_credentials)

    # Window bracketing test1.docx's modification time (20:51:42 UTC):
    # opens 12 seconds before it and closes 8 seconds after it
    window_start = datetime(2025, 1, 28, 20, 51, 30, tzinfo=timezone.utc)
    window_end = datetime(2025, 1, 28, 20, 51, 50, tzinfo=timezone.utc)

    # Drain every batch produced for the window into one flat list
    found_documents: list[Document] = []
    for batch in connector._fetch_from_sharepoint(start=window_start, end=window_end):
        found_documents.extend(batch)

    # Only test1.docx falls inside the window
    assert len(found_documents) == 1, "Should only find one document in the time window"
    polled_doc = found_documents[0]
    assert polled_doc.semantic_identifier == "test1.docx"
    verify_document_metadata(polled_doc)

    # Compare against the expected entry carrying the same identifier
    matching_expected = [
        candidate
        for candidate in EXPECTED_DOCUMENTS
        if candidate.semantic_identifier == "test1.docx"
    ]
    verify_document_content(polled_doc, matching_expected[0])
|
||||||
|
Reference in New Issue
Block a user