From 4036e7c6c6f8c2a87dfd1f79c70f78ce96f9d44f Mon Sep 17 00:00:00 2001
From: Yuhong Sun
Date: Thu, 14 Mar 2024 18:19:40 -0700
Subject: [PATCH] Remove DocumentSource Enum from postgres (#1217)

---
 .../versions/e50154680a5c_no_source_enum.py | 38 +++++++++++++++++++
 backend/danswer/connectors/web/connector.py | 10 ++++-
 2 files changed, 46 insertions(+), 2 deletions(-)
 create mode 100644 backend/alembic/versions/e50154680a5c_no_source_enum.py

diff --git a/backend/alembic/versions/e50154680a5c_no_source_enum.py b/backend/alembic/versions/e50154680a5c_no_source_enum.py
new file mode 100644
index 000000000..83ee7186c
--- /dev/null
+++ b/backend/alembic/versions/e50154680a5c_no_source_enum.py
@@ -0,0 +1,38 @@
+"""No Source Enum
+
+Revision ID: e50154680a5c
+Revises: fcd135795f21
+Create Date: 2024-03-14 18:06:08.523106
+
+"""
+from alembic import op
+import sqlalchemy as sa
+
+from danswer.configs.constants import DocumentSource
+
+# revision identifiers, used by Alembic.
+revision = "e50154680a5c"
+down_revision = "fcd135795f21"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    op.alter_column(
+        "search_doc",
+        "source_type",
+        type_=sa.String(length=50),
+        existing_type=sa.Enum(DocumentSource, native_enum=False),
+        existing_nullable=False,
+    )
+    op.execute("DROP TYPE IF EXISTS documentsource")
+
+
+def downgrade() -> None:
+    op.alter_column(
+        "search_doc",
+        "source_type",
+        type_=sa.Enum(DocumentSource, native_enum=False),
+        existing_type=sa.String(length=50),
+        existing_nullable=False,
+    )
diff --git a/backend/danswer/connectors/web/connector.py b/backend/danswer/connectors/web/connector.py
index da98df7ab..8acfaca42 100644
--- a/backend/danswer/connectors/web/connector.py
+++ b/backend/danswer/connectors/web/connector.py
@@ -27,6 +27,7 @@ from danswer.connectors.models import Document
 from danswer.connectors.models import Section
 from danswer.utils.logger import setup_logger
 
+
 logger = setup_logger()
 
 
@@ -100,13 +101,18 @@ def extract_urls_from_sitemap(sitemap_url: str) -> list[str]:
     response.raise_for_status()
 
     soup = BeautifulSoup(response.content, "html.parser")
-    return [_ensure_absolute_url(sitemap_url, loc_tag.text) for loc_tag in soup.find_all("loc")]
+    return [
+        _ensure_absolute_url(sitemap_url, loc_tag.text)
+        for loc_tag in soup.find_all("loc")
+    ]
 
-def _ensure_absolute_url(source_url:str, maybe_relative_url: str) -> str:
+
+def _ensure_absolute_url(source_url: str, maybe_relative_url: str) -> str:
     if not urlparse(maybe_relative_url).netloc:
         return urljoin(source_url, maybe_relative_url)
     return maybe_relative_url
 
+
 def _ensure_valid_url(url: str) -> str:
     if "://" not in url:
         return "https://" + url