Remove DocumentSource Enum from postgres (#1217)

This commit is contained in:
Yuhong Sun 2024-03-14 18:19:40 -07:00 committed by GitHub
parent 2a8e53c94f
commit 4036e7c6c6
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 46 additions and 2 deletions

View File

@ -0,0 +1,38 @@
"""No Source Enum
Revision ID: e50154680a5c
Revises: fcd135795f21
Create Date: 2024-03-14 18:06:08.523106
"""
from alembic import op
import sqlalchemy as sa
from danswer.configs.constants import DocumentSource
# Revision identifiers, used by Alembic.
# `revision` is this migration's own ID and `down_revision` is the ID it
# revises; both mirror the "Revision ID" / "Revises" lines in the module
# docstring above.
revision = "e50154680a5c"
down_revision = "fcd135795f21"
branch_labels = None
depends_on = None
def upgrade() -> None:
    """Turn ``search_doc.source_type`` into a plain 50-character string column.

    The column was previously declared as a non-native ``DocumentSource``
    enum. Once the column has been altered, the ``documentsource`` database
    type is dropped if it is present.
    """
    plain_string_type = sa.String(length=50)
    enum_backed_type = sa.Enum(DocumentSource, native_enum=False)

    op.alter_column(
        "search_doc",
        "source_type",
        existing_type=enum_backed_type,
        existing_nullable=False,
        type_=plain_string_type,
    )

    # IF EXISTS keeps this statement from failing when no such type is defined.
    op.execute("DROP TYPE IF EXISTS documentsource")
def downgrade() -> None:
    """Declare ``search_doc.source_type`` as a non-native ``DocumentSource`` enum again.

    This reverses the column change made by ``upgrade``. It does not issue
    any statement to recreate the ``documentsource`` database type.
    """
    enum_backed_type = sa.Enum(DocumentSource, native_enum=False)
    plain_string_type = sa.String(length=50)

    op.alter_column(
        "search_doc",
        "source_type",
        existing_type=plain_string_type,
        existing_nullable=False,
        type_=enum_backed_type,
    )

View File

@ -27,6 +27,7 @@ from danswer.connectors.models import Document
from danswer.connectors.models import Section
from danswer.utils.logger import setup_logger
logger = setup_logger()
@ -100,13 +101,18 @@ def extract_urls_from_sitemap(sitemap_url: str) -> list[str]:
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")
return [_ensure_absolute_url(sitemap_url, loc_tag.text) for loc_tag in soup.find_all("loc")]
return [
_ensure_absolute_url(sitemap_url, loc_tag.text)
for loc_tag in soup.find_all("loc")
]
def _ensure_absolute_url(source_url:str, maybe_relative_url: str) -> str:
def _ensure_absolute_url(source_url: str, maybe_relative_url: str) -> str:
if not urlparse(maybe_relative_url).netloc:
return urljoin(source_url, maybe_relative_url)
return maybe_relative_url
def _ensure_valid_url(url: str) -> str:
if "://" not in url:
return "https://" + url