Add support for looping through all nested subcategories (#1382)

* Fix for parser failing if doc is blank

* Add support for looping through all nested child categories
This commit is contained in:
Vikas Neha Ojha
2024-04-26 05:57:30 +05:30
committed by GitHub
parent ead7a80297
commit fe03747a1a
2 changed files with 25 additions and 8 deletions

View File

@@ -13,6 +13,7 @@ from danswer.connectors.cross_connector_utils.rate_limit_wrapper import (
rate_limit_builder,
)
from danswer.connectors.cross_connector_utils.retry_wrapper import retry_builder
from danswer.connectors.document360.utils import flatten_child_categories
from danswer.connectors.interfaces import GenerateDocumentsOutput
from danswer.connectors.interfaces import LoadConnector
from danswer.connectors.interfaces import PollConnector
@@ -97,13 +98,16 @@ class Document360Connector(LoadConnector, PollConnector):
{"id": article["id"], "category_name": category["name"]}
)
for child_category in category["child_categories"]:
for article in child_category["articles"]:
articles_with_category.append(
{
"id": article["id"],
"category_name": child_category["name"],
}
)
all_nested_categories = flatten_child_categories(child_category)
for nested_category in all_nested_categories:
for article in nested_category["articles"]:
articles_with_category.append(
{
"id": article["id"],
"category_name": nested_category["name"],
}
)
return articles_with_category
def _process_articles(
@@ -141,7 +145,9 @@ class Document360Connector(LoadConnector, PollConnector):
doc_link = f"{DOCUMENT360_BASE_URL}/{self.portal_id}/document/v1/view/{article['id']}"
html_content = article_details["html_content"]
article_content = parse_html_page_basic(html_content)
article_content = (
parse_html_page_basic(html_content) if html_content is not None else ""
)
doc_text = (
f"{article_details.get('description', '')}\n{article_content}".strip()
)

View File

@@ -0,0 +1,11 @@
from typing import List
def flatten_child_categories(category: dict) -> List[dict]:
    """Return *category* followed by all of its nested child categories.

    The result is in depth-first, pre-order: the category itself comes first,
    then each child (in its listed order) immediately followed by that
    child's own descendants.

    Args:
        category: A category mapping. Its ``"child_categories"`` entry, when
            present and non-empty, is an iterable of category mappings of the
            same form.

    Returns:
        A flat list containing ``category`` and every descendant category.
        The same dict objects are returned; nothing is copied.
    """
    flattened_categories = [category]
    # A leaf may carry an empty list, ``None``, or no "child_categories" key
    # at all; treat all three as "no children" instead of raising KeyError.
    for child_category in category.get("child_categories") or []:
        flattened_categories.extend(flatten_child_categories(child_category))
    return flattened_categories