From 5abf67fbf050d8922683c7da643289ccd22e500e Mon Sep 17 00:00:00 2001 From: pablodanswer Date: Fri, 6 Sep 2024 14:21:24 -0700 Subject: [PATCH] PDF metadata + list defaults (#2341) * validate web list * update pdf extraction of metadat * remove pdf + log * stricter type enforcing * fix up indexing widths * minor formatting * add list case * check for empty metadata --- .../file_processing/extract_file_text.py | 18 ++- .../[connector]/AddConnectorPage.tsx | 2 +- .../admin/connectors/[connector]/Sidebar.tsx | 5 +- .../status/CCPairIndexingStatusTable.tsx | 115 ++++++------------ web/src/app/layout.tsx | 2 +- web/src/lib/connectors/connectors.ts | 2 - 6 files changed, 54 insertions(+), 90 deletions(-) diff --git a/backend/danswer/file_processing/extract_file_text.py b/backend/danswer/file_processing/extract_file_text.py index 3fcb2a99e..61abddf4d 100644 --- a/backend/danswer/file_processing/extract_file_text.py +++ b/backend/danswer/file_processing/extract_file_text.py @@ -201,22 +201,28 @@ def read_pdf_file( decrypt_success = pdf_reader.decrypt(pdf_pass) != 0 except Exception: logger.error("Unable to decrypt pdf") - else: - logger.warning("No Password available to to decrypt pdf") if not decrypt_success: # By user request, keep files that are unreadable just so they # can be discoverable by title. return "", metadata + else: + logger.warning("No Password available to to decrypt pdf") # Extract metadata from the PDF, removing leading '/' from keys if present # This standardizes the metadata keys for consistency metadata = {} if pdf_reader.metadata is not None: - metadata = { - k[1:] if k.startswith("/") else k: v - for k, v in pdf_reader.metadata.items() - } + for key, value in pdf_reader.metadata.items(): + clean_key = key.lstrip("/") + if isinstance(value, str) and value.strip(): + metadata[clean_key] = value + + elif isinstance(value, list) and all( + isinstance(item, str) for item in value + ): + metadata[clean_key] = ", ".join(value) + return ( TEXT_SECTION_SEPARATOR.join( page.extract_text() for page in pdf_reader.pages diff --git a/web/src/app/admin/connectors/[connector]/AddConnectorPage.tsx b/web/src/app/admin/connectors/[connector]/AddConnectorPage.tsx index f05c874d0..ebceacb08 100644 --- a/web/src/app/admin/connectors/[connector]/AddConnectorPage.tsx +++ b/web/src/app/admin/connectors/[connector]/AddConnectorPage.tsx @@ -95,7 +95,7 @@ export default function AddConnector({ ...configuration.values.reduce( (acc, field) => { if (field.type === "select") { - acc[field.name] = field.options ? field.options[field.default!]! : ""; + acc[field.name] = null; } else if (field.type === "list") { acc[field.name] = field.default || []; } else if (field.type === "checkbox") { diff --git a/web/src/app/admin/connectors/[connector]/Sidebar.tsx b/web/src/app/admin/connectors/[connector]/Sidebar.tsx index 5d678938d..e0c85029f 100644 --- a/web/src/app/admin/connectors/[connector]/Sidebar.tsx +++ b/web/src/app/admin/connectors/[connector]/Sidebar.tsx @@ -25,9 +25,10 @@ export default function Sidebar() { ]; return ( -
+
- +
{isOpen ? ( @@ -77,12 +67,12 @@ function SummaryRow({
- +
Total Connectors
{summary.count}
- +
Active Connectors
{isPaidEnterpriseFeaturesEnabled && ( - +
Public Connectors

{summary.public}/{summary.count} @@ -110,14 +100,14 @@ function SummaryRow({ )} - +

Total Docs Indexed
{summary.totalDocsIndexed.toLocaleString()}
- +
Errors
@@ -126,7 +116,7 @@ function SummaryRow({
- + ); } @@ -231,19 +221,17 @@ function ConnectorRow({ router.push(`/admin/connector/${ccPairsIndexingStatus.cc_pair_id}`); }} > - -

+ +

{ccPairsIndexingStatus.name}

- + {timeAgo(ccPairsIndexingStatus?.last_success) || "-"} - - {getActivityBadge()} - + {getActivityBadge()} {isPaidEnterpriseFeaturesEnabled && ( - + {ccPairsIndexingStatus.public_doc ? ( )} - - {ccPairsIndexingStatus.docs_indexed} - - + {ccPairsIndexingStatus.docs_indexed} + - + {isEditable && ( -
+
+ setSearchTerm(e.target.value)} + className="ml-1 w-96 h-9 flex-none rounded-md border border-border bg-background-50 px-3 py-1 text-sm shadow-sm transition-colors placeholder:text-muted-foreground focus-visible:outline-none focus-visible:ring-1 focus-visible:ring-ring" + /> + + +
-
- setSearchTerm(e.target.value)} - className="ml-1 w-96 h-9 flex-none rounded-md border border-border bg-background-50 px-3 py-1 text-sm shadow-sm transition-colors placeholder:text-muted-foreground focus-visible:outline-none focus-visible:ring-1 focus-visible:ring-ring" - /> - - -
{sortedSources .filter( (source) => @@ -494,41 +479,15 @@ export function CCPairIndexingStatusTable({ {connectorsToggled[source] && ( <> - - Name - - - Last Indexed - - - Activity - + Name + Last Indexed + Activity {isPaidEnterpriseFeaturesEnabled && ( - - Permissions - + Permissions )} - - Total Docs - - - Last Status - - + Total Docs + Last Status + {(sourceMatches ? groupedStatuses[source] diff --git a/web/src/app/layout.tsx b/web/src/app/layout.tsx index 8b8496c9f..4bf5c7722 100644 --- a/web/src/app/layout.tsx +++ b/web/src/app/layout.tsx @@ -104,7 +104,7 @@ export default async function RootLayout({