Improve Egnyte connector (#3626)

This commit is contained in:
Chris Weaver 2025-01-07 19:09:46 -08:00 committed by GitHub
parent 717560872f
commit eac73a1bf1
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -224,7 +224,7 @@ class EgnyteConnector(LoadConnector, PollConnector, OAuthConnector):
def _get_files_list( def _get_files_list(
self, self,
path: str, path: str,
) -> list[dict[str, Any]]: ) -> Generator[dict[str, Any], None, None]:
if not self.access_token or not self.domain: if not self.access_token or not self.domain:
raise ConnectorMissingCredentialError("Egnyte") raise ConnectorMissingCredentialError("Egnyte")
@ -245,48 +245,46 @@ class EgnyteConnector(LoadConnector, PollConnector, OAuthConnector):
raise RuntimeError(f"Failed to fetch files from Egnyte: {response.text}") raise RuntimeError(f"Failed to fetch files from Egnyte: {response.text}")
data = response.json() data = response.json()
all_files: list[dict[str, Any]] = []
# Add files from current directory # Yield files from current directory
all_files.extend(data.get("files", [])) for file in data.get("files", []):
yield file
# Recursively traverse folders # Recursively traverse folders
for item in data.get("folders", []): for folder in data.get("folders", []):
all_files.extend(self._get_files_list(item["path"])) yield from self._get_files_list(folder["path"])
return all_files def _should_index_file(
def _filter_files(
self, self,
files: list[dict[str, Any]], file: dict[str, Any],
start_time: datetime | None = None, start_time: datetime | None = None,
end_time: datetime | None = None, end_time: datetime | None = None,
) -> list[dict[str, Any]]: ) -> bool:
filtered_files = [] """Return True if file should be included based on filters."""
for file in files: if file["is_folder"]:
if file["is_folder"]: return False
continue
file_modified = _parse_last_modified(file["last_modified"]) file_modified = _parse_last_modified(file["last_modified"])
if start_time and file_modified < start_time: if start_time and file_modified < start_time:
continue return False
if end_time and file_modified > end_time: if end_time and file_modified > end_time:
continue return False
filtered_files.append(file) return True
return filtered_files
def _process_files( def _process_files(
self, self,
start_time: datetime | None = None, start_time: datetime | None = None,
end_time: datetime | None = None, end_time: datetime | None = None,
) -> Generator[list[Document], None, None]: ) -> Generator[list[Document], None, None]:
files = self._get_files_list(self.folder_path)
files = self._filter_files(files, start_time, end_time)
current_batch: list[Document] = [] current_batch: list[Document] = []
for file in files:
# Iterate through yielded files and filter them
for file in self._get_files_list(self.folder_path):
if not self._should_index_file(file, start_time, end_time):
logger.debug(f"Skipping file '{file['path']}'.")
continue
try: try:
# Set up request with streaming enabled # Set up request with streaming enabled
headers = { headers = {