Mirror of https://github.com/danswer-ai/danswer.git
Improve egnyte connector (#3626)
commit eac73a1bf1
parent 717560872f
@@ -224,7 +224,7 @@ class EgnyteConnector(LoadConnector, PollConnector, OAuthConnector):
     def _get_files_list(
         self,
         path: str,
-    ) -> list[dict[str, Any]]:
+    ) -> Generator[dict[str, Any], None, None]:
         if not self.access_token or not self.domain:
             raise ConnectorMissingCredentialError("Egnyte")

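The hunk above only changes the return annotation: `_get_files_list` now declares `Generator[dict[str, Any], None, None]` instead of `list[dict[str, Any]]`, so callers receive file entries lazily as the tree is walked. Below is a minimal standalone sketch of that pattern; the in-memory `_TREE` data and the `iter_files` name are hypothetical stand-ins for the connector's Egnyte API calls, not its real code.

from collections.abc import Generator
from typing import Any

# Tiny in-memory stand-in for an Egnyte-style folder listing (hypothetical data).
_TREE: dict[str, dict[str, Any]] = {
    "/Shared": {
        "files": [{"path": "/Shared/a.txt"}],
        "folders": [{"path": "/Shared/docs"}],
    },
    "/Shared/docs": {
        "files": [{"path": "/Shared/docs/b.txt"}],
        "folders": [],
    },
}


def iter_files(path: str) -> Generator[dict[str, Any], None, None]:
    """Lazily walk a folder tree, yielding file entries one at a time."""
    listing = _TREE.get(path, {})

    # Yield files from the current directory as soon as they are listed.
    for file in listing.get("files", []):
        yield file

    # Recurse into subfolders and re-yield their files.
    for folder in listing.get("folders", []):
        yield from iter_files(folder["path"])


print([f["path"] for f in iter_files("/Shared")])
# ['/Shared/a.txt', '/Shared/docs/b.txt']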
@@ -245,48 +245,46 @@ class EgnyteConnector(LoadConnector, PollConnector, OAuthConnector):
             raise RuntimeError(f"Failed to fetch files from Egnyte: {response.text}")

         data = response.json()
-        all_files: list[dict[str, Any]] = []

-        # Add files from current directory
-        all_files.extend(data.get("files", []))
+        # Yield files from current directory
+        for file in data.get("files", []):
+            yield file

         # Recursively traverse folders
-        for item in data.get("folders", []):
-            all_files.extend(self._get_files_list(item["path"]))
+        for folder in data.get("folders", []):
+            yield from self._get_files_list(folder["path"])

-        return all_files
-
-    def _filter_files(
+    def _should_index_file(
         self,
-        files: list[dict[str, Any]],
+        file: dict[str, Any],
         start_time: datetime | None = None,
         end_time: datetime | None = None,
-    ) -> list[dict[str, Any]]:
-        filtered_files = []
-        for file in files:
-            if file["is_folder"]:
-                continue
+    ) -> bool:
+        """Return True if file should be included based on filters."""
+        if file["is_folder"]:
+            return False

-            file_modified = _parse_last_modified(file["last_modified"])
-            if start_time and file_modified < start_time:
-                continue
-            if end_time and file_modified > end_time:
-                continue
+        file_modified = _parse_last_modified(file["last_modified"])
+        if start_time and file_modified < start_time:
+            return False
+        if end_time and file_modified > end_time:
+            return False

-            filtered_files.append(file)
-
-        return filtered_files
+        return True

     def _process_files(
         self,
         start_time: datetime | None = None,
         end_time: datetime | None = None,
     ) -> Generator[list[Document], None, None]:
-        files = self._get_files_list(self.folder_path)
-        files = self._filter_files(files, start_time, end_time)

         current_batch: list[Document] = []
-        for file in files:
+        # Iterate through yielded files and filter them
+        for file in self._get_files_list(self.folder_path):
+            if not self._should_index_file(file, start_time, end_time):
+                logger.debug(f"Skipping file '{file['path']}'.")
+                continue
+
             try:
                 # Set up request with streaming enabled
                 headers = {
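Taken together, this hunk replaces the list-building `_filter_files` with a per-file `_should_index_file` predicate and has `_process_files` pull directly from the generator, so entries are filtered as they stream in instead of after the whole tree has been collected. The sketch below mirrors that consumption pattern under simplified assumptions: `should_index`, `process_files`, and the use of `datetime.fromisoformat` are hypothetical stand-ins for the connector's real helpers (which parse timestamps via `_parse_last_modified` and build `Document` batches).

from collections.abc import Generator
from datetime import datetime
from typing import Any


def should_index(
    file: dict[str, Any],
    start_time: datetime | None = None,
    end_time: datetime | None = None,
) -> bool:
    """Return True if the entry is a file inside the requested time window."""
    if file.get("is_folder", False):
        return False

    modified = datetime.fromisoformat(file["last_modified"])
    if start_time and modified < start_time:
        return False
    if end_time and modified > end_time:
        return False
    return True


def process_files(
    files: Generator[dict[str, Any], None, None],
    batch_size: int = 2,
    start_time: datetime | None = None,
    end_time: datetime | None = None,
) -> Generator[list[dict[str, Any]], None, None]:
    """Filter streamed file entries and emit them in fixed-size batches."""
    batch: list[dict[str, Any]] = []
    for file in files:
        if not should_index(file, start_time, end_time):
            continue  # skipped without ever being held in an intermediate list
        batch.append(file)
        if len(batch) >= batch_size:
            yield batch
            batch = []
    if batch:
        yield batch

# Pairs with a generator like iter_files() above, assuming its entries carry
# "is_folder" and ISO-format "last_modified" fields:
# for batch in process_files(iter_files("/Shared")): ...

The practical effect visible in the diff is that nothing about the folder tree is materialized up front: memory stays proportional to one batch rather than to the full file listing.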