remove rt + home-grown sitemap parsing (#2933)

* remove rt

* nit

* add minor alembic revision

* functional migration

* replace usp

* k

* typing
pablodanswer 2024-10-26 14:58:42 -07:00 committed by GitHub
parent aa0f307cc7
commit 088551a4ef
11 changed files with 261 additions and 221 deletions

View File

@@ -0,0 +1,74 @@
"""remove rt

Revision ID: 949b4a92a401
Revises: 1b10e1fda030
Create Date: 2024-10-26 13:06:06.937969

"""
from alembic import op
from sqlalchemy.orm import Session

# Import your models and constants
from danswer.db.models import (
    Connector,
    ConnectorCredentialPair,
    Credential,
    IndexAttempt,
)
from danswer.configs.constants import DocumentSource

# revision identifiers, used by Alembic.
revision = "949b4a92a401"
down_revision = "1b10e1fda030"
branch_labels = None
depends_on = None


def upgrade() -> None:
    # Deletes all RequestTracker connectors and associated data
    bind = op.get_bind()
    session = Session(bind=bind)

    connectors_to_delete = (
        session.query(Connector)
        .filter(Connector.source == DocumentSource.REQUESTTRACKER)
        .all()
    )
    connector_ids = [connector.id for connector in connectors_to_delete]

    if connector_ids:
        cc_pairs_to_delete = (
            session.query(ConnectorCredentialPair)
            .filter(ConnectorCredentialPair.connector_id.in_(connector_ids))
            .all()
        )
        cc_pair_ids = [cc_pair.id for cc_pair in cc_pairs_to_delete]

        if cc_pair_ids:
            session.query(IndexAttempt).filter(
                IndexAttempt.connector_credential_pair_id.in_(cc_pair_ids)
            ).delete(synchronize_session=False)
            session.query(ConnectorCredentialPair).filter(
                ConnectorCredentialPair.id.in_(cc_pair_ids)
            ).delete(synchronize_session=False)

        credential_ids = [cc_pair.credential_id for cc_pair in cc_pairs_to_delete]
        if credential_ids:
            session.query(Credential).filter(Credential.id.in_(credential_ids)).delete(
                synchronize_session=False
            )

        session.query(Connector).filter(Connector.id.in_(connector_ids)).delete(
            synchronize_session=False
        )

    session.commit()


def downgrade() -> None:
    # No-op downgrade as we cannot restore deleted data
    pass
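
Because the downgrade is a no-op, the deletion is one-way; a quick manual check after running the upgrade could look like the sketch below. It is only a sketch, not part of the commit: the connection URL is a placeholder, and it reuses just the model and constant the migration already imports.

# Hand-run sanity check (sketch): confirm no Request Tracker connectors remain.
from sqlalchemy import create_engine
from sqlalchemy.orm import Session

from danswer.configs.constants import DocumentSource
from danswer.db.models import Connector

engine = create_engine("postgresql://user:password@localhost:5432/danswer")  # placeholder URL
with Session(engine) as session:
    leftover = (
        session.query(Connector)
        .filter(Connector.source == DocumentSource.REQUESTTRACKER)
        .count()
    )
    print(f"Request Tracker connectors remaining: {leftover}")  # expected: 0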

View File

@@ -34,7 +34,6 @@ from danswer.connectors.mediawiki.wiki import MediaWikiConnector
from danswer.connectors.models import InputType
from danswer.connectors.notion.connector import NotionConnector
from danswer.connectors.productboard.connector import ProductboardConnector
from danswer.connectors.requesttracker.connector import RequestTrackerConnector
from danswer.connectors.salesforce.connector import SalesforceConnector
from danswer.connectors.sharepoint.connector import SharepointConnector
from danswer.connectors.slab.connector import SlabConnector
@@ -77,7 +76,6 @@ def identify_connector_class(
DocumentSource.SLAB: SlabConnector,
DocumentSource.NOTION: NotionConnector,
DocumentSource.ZULIP: ZulipConnector,
DocumentSource.REQUESTTRACKER: RequestTrackerConnector,
DocumentSource.GURU: GuruConnector,
DocumentSource.LINEAR: LinearConnector,
DocumentSource.HUBSPOT: HubSpotConnector,

View File

@@ -1,153 +1,124 @@
from datetime import datetime
from datetime import timezone
from logging import DEBUG as LOG_LVL_DEBUG
from typing import Any
from typing import List
from typing import Optional
from rt.rest1 import ALL_QUEUES
from rt.rest1 import Rt
from danswer.configs.app_configs import INDEX_BATCH_SIZE
from danswer.configs.constants import DocumentSource
from danswer.connectors.interfaces import GenerateDocumentsOutput
from danswer.connectors.interfaces import PollConnector
from danswer.connectors.interfaces import SecondsSinceUnixEpoch
from danswer.connectors.models import ConnectorMissingCredentialError
from danswer.connectors.models import Document
from danswer.connectors.models import Section
from danswer.utils.logger import setup_logger
logger = setup_logger()
class RequestTrackerError(Exception):
pass
class RequestTrackerConnector(PollConnector):
def __init__(
self,
batch_size: int = INDEX_BATCH_SIZE,
) -> None:
self.batch_size = batch_size
def txn_link(self, tid: int, txn: int) -> str:
return f"{self.rt_base_url}/Ticket/Display.html?id={tid}&txn={txn}"
def build_doc_sections_from_txn(
self, connection: Rt, ticket_id: int
) -> List[Section]:
Sections: List[Section] = []
get_history_resp = connection.get_history(ticket_id)
if get_history_resp is None:
raise RequestTrackerError(f"Ticket {ticket_id} cannot be found")
for tx in get_history_resp:
Sections.append(
Section(
link=self.txn_link(ticket_id, int(tx["id"])),
text="\n".join(
[
f"{k}:\n{v}\n" if k != "Attachments" else ""
for (k, v) in tx.items()
]
),
)
)
return Sections
def load_credentials(self, credentials: dict[str, Any]) -> Optional[dict[str, Any]]:
self.rt_username = credentials.get("requesttracker_username")
self.rt_password = credentials.get("requesttracker_password")
self.rt_base_url = credentials.get("requesttracker_base_url")
return None
# This does not include RT file attachments yet.
def _process_tickets(
self, start: datetime, end: datetime
) -> GenerateDocumentsOutput:
if any([self.rt_username, self.rt_password, self.rt_base_url]) is None:
raise ConnectorMissingCredentialError("requesttracker")
Rt0 = Rt(
f"{self.rt_base_url}/REST/1.0/",
self.rt_username,
self.rt_password,
)
Rt0.login()
d0 = start.strftime("%Y-%m-%d %H:%M:%S")
d1 = end.strftime("%Y-%m-%d %H:%M:%S")
tickets = Rt0.search(
Queue=ALL_QUEUES,
raw_query=f"Updated > '{d0}' AND Updated < '{d1}'",
)
doc_batch: List[Document] = []
for ticket in tickets:
ticket_keys_to_omit = ["id", "Subject"]
tid: int = int(ticket["numerical_id"])
ticketLink: str = f"{self.rt_base_url}/Ticket/Display.html?id={tid}"
logger.info(f"Processing ticket {tid}")
doc = Document(
id=ticket["id"],
# Will add title to the first section later in processing
sections=[Section(link=ticketLink, text="")]
+ self.build_doc_sections_from_txn(Rt0, tid),
source=DocumentSource.REQUESTTRACKER,
semantic_identifier=ticket["Subject"],
metadata={
key: value
for key, value in ticket.items()
if key not in ticket_keys_to_omit
},
)
doc_batch.append(doc)
if len(doc_batch) >= self.batch_size:
yield doc_batch
doc_batch = []
if doc_batch:
yield doc_batch
def poll_source(
self, start: SecondsSinceUnixEpoch, end: SecondsSinceUnixEpoch
) -> GenerateDocumentsOutput:
# Keep query short, only look behind 1 day at maximum
one_day_ago: float = end - (24 * 60 * 60)
_start: float = start if start > one_day_ago else one_day_ago
start_datetime = datetime.fromtimestamp(_start, tz=timezone.utc)
end_datetime = datetime.fromtimestamp(end, tz=timezone.utc)
yield from self._process_tickets(start_datetime, end_datetime)
if __name__ == "__main__":
import time
import os
from dotenv import load_dotenv
load_dotenv()
logger.setLevel(LOG_LVL_DEBUG)
rt_connector = RequestTrackerConnector()
rt_connector.load_credentials(
{
"requesttracker_username": os.getenv("RT_USERNAME"),
"requesttracker_password": os.getenv("RT_PASSWORD"),
"requesttracker_base_url": os.getenv("RT_BASE_URL"),
}
)
current = time.time()
one_day_ago = current - (24 * 60 * 60) # 1 days
latest_docs = rt_connector.poll_source(one_day_ago, current)
for doc in latest_docs:
print(doc)
# from datetime import datetime
# from datetime import timezone
# from logging import DEBUG as LOG_LVL_DEBUG
# from typing import Any
# from typing import List
# from typing import Optional
# from rt.rest1 import ALL_QUEUES
# from rt.rest1 import Rt
# from danswer.configs.app_configs import INDEX_BATCH_SIZE
# from danswer.configs.constants import DocumentSource
# from danswer.connectors.interfaces import GenerateDocumentsOutput
# from danswer.connectors.interfaces import PollConnector
# from danswer.connectors.interfaces import SecondsSinceUnixEpoch
# from danswer.connectors.models import ConnectorMissingCredentialError
# from danswer.connectors.models import Document
# from danswer.connectors.models import Section
# from danswer.utils.logger import setup_logger
# logger = setup_logger()
# class RequestTrackerError(Exception):
# pass
# class RequestTrackerConnector(PollConnector):
# def __init__(
# self,
# batch_size: int = INDEX_BATCH_SIZE,
# ) -> None:
# self.batch_size = batch_size
# def txn_link(self, tid: int, txn: int) -> str:
# return f"{self.rt_base_url}/Ticket/Display.html?id={tid}&txn={txn}"
# def build_doc_sections_from_txn(
# self, connection: Rt, ticket_id: int
# ) -> List[Section]:
# Sections: List[Section] = []
# get_history_resp = connection.get_history(ticket_id)
# if get_history_resp is None:
# raise RequestTrackerError(f"Ticket {ticket_id} cannot be found")
# for tx in get_history_resp:
# Sections.append(
# Section(
# link=self.txn_link(ticket_id, int(tx["id"])),
# text="\n".join(
# [
# f"{k}:\n{v}\n" if k != "Attachments" else ""
# for (k, v) in tx.items()
# ]
# ),
# )
# )
# return Sections
# def load_credentials(self, credentials: dict[str, Any]) -> Optional[dict[str, Any]]:
# self.rt_username = credentials.get("requesttracker_username")
# self.rt_password = credentials.get("requesttracker_password")
# self.rt_base_url = credentials.get("requesttracker_base_url")
# return None
# # This does not include RT file attachments yet.
# def _process_tickets(
# self, start: datetime, end: datetime
# ) -> GenerateDocumentsOutput:
# if any([self.rt_username, self.rt_password, self.rt_base_url]) is None:
# raise ConnectorMissingCredentialError("requesttracker")
# Rt0 = Rt(
# f"{self.rt_base_url}/REST/1.0/",
# self.rt_username,
# self.rt_password,
# )
# Rt0.login()
# d0 = start.strftime("%Y-%m-%d %H:%M:%S")
# d1 = end.strftime("%Y-%m-%d %H:%M:%S")
# tickets = Rt0.search(
# Queue=ALL_QUEUES,
# raw_query=f"Updated > '{d0}' AND Updated < '{d1}'",
# )
# doc_batch: List[Document] = []
# for ticket in tickets:
# ticket_keys_to_omit = ["id", "Subject"]
# tid: int = int(ticket["numerical_id"])
# ticketLink: str = f"{self.rt_base_url}/Ticket/Display.html?id={tid}"
# logger.info(f"Processing ticket {tid}")
# doc = Document(
# id=ticket["id"],
# # Will add title to the first section later in processing
# sections=[Section(link=ticketLink, text="")]
# + self.build_doc_sections_from_txn(Rt0, tid),
# source=DocumentSource.REQUESTTRACKER,
# semantic_identifier=ticket["Subject"],
# metadata={
# key: value
# for key, value in ticket.items()
# if key not in ticket_keys_to_omit
# },
# )
# doc_batch.append(doc)
# if len(doc_batch) >= self.batch_size:
# yield doc_batch
# doc_batch = []
# if doc_batch:
# yield doc_batch
# def poll_source(
# self, start: SecondsSinceUnixEpoch, end: SecondsSinceUnixEpoch
# ) -> GenerateDocumentsOutput:
# # Keep query short, only look behind 1 day at maximum
# one_day_ago: float = end - (24 * 60 * 60)
# _start: float = start if start > one_day_ago else one_day_ago
# start_datetime = datetime.fromtimestamp(_start, tz=timezone.utc)
# end_datetime = datetime.fromtimestamp(end, tz=timezone.utc)
# yield from self._process_tickets(start_datetime, end_datetime)
# if __name__ == "__main__":
# import time
# import os
# from dotenv import load_dotenv
# load_dotenv()
# logger.setLevel(LOG_LVL_DEBUG)
# rt_connector = RequestTrackerConnector()
# rt_connector.load_credentials(
# {
# "requesttracker_username": os.getenv("RT_USERNAME"),
# "requesttracker_password": os.getenv("RT_PASSWORD"),
# "requesttracker_base_url": os.getenv("RT_BASE_URL"),
# }
# )
# current = time.time()
# one_day_ago = current - (24 * 60 * 60) # 1 days
# latest_docs = rt_connector.poll_source(one_day_ago, current)
# for doc in latest_docs:
# print(doc)

View File

@@ -1,39 +1,78 @@
-from datetime import datetime
-from urllib import robotparser
+import re
+import xml.etree.ElementTree as ET
+from typing import Set
+from urllib.parse import urljoin

-from usp.tree import sitemap_tree_for_homepage  # type: ignore
+import requests

 from danswer.utils.logger import setup_logger

 logger = setup_logger()


-def test_url(rp: robotparser.RobotFileParser | None, url: str) -> bool:
-    if not rp:
-        return True
-    else:
-        return rp.can_fetch("*", url)
+def _get_sitemap_locations_from_robots(base_url: str) -> Set[str]:
+    """Extract sitemap URLs from robots.txt"""
+    sitemap_urls: set = set()
+    try:
+        robots_url = urljoin(base_url, "/robots.txt")
+        resp = requests.get(robots_url, timeout=10)
+        if resp.status_code == 200:
+            for line in resp.text.splitlines():
+                if line.lower().startswith("sitemap:"):
+                    sitemap_url = line.split(":", 1)[1].strip()
+                    sitemap_urls.add(sitemap_url)
+    except Exception as e:
+        logger.warning(f"Error fetching robots.txt: {e}")
+    return sitemap_urls


-def init_robots_txt(site: str) -> robotparser.RobotFileParser:
-    ts = datetime.now().timestamp()
-    robots_url = f"{site}/robots.txt?ts={ts}"
-    rp = robotparser.RobotFileParser()
-    rp.set_url(robots_url)
-    rp.read()
-    return rp
+def _extract_urls_from_sitemap(sitemap_url: str) -> Set[str]:
+    """Extract URLs from a sitemap XML file"""
+    urls: set[str] = set()
+    try:
+        resp = requests.get(sitemap_url, timeout=10)
+        if resp.status_code != 200:
+            return urls
+
+        root = ET.fromstring(resp.content)
+
+        # Handle both regular sitemaps and sitemap indexes
+        # Remove namespace for easier parsing
+        namespace = re.match(r"\{.*\}", root.tag)
+        ns = namespace.group(0) if namespace else ""
+
+        if root.tag == f"{ns}sitemapindex":
+            # This is a sitemap index
+            for sitemap in root.findall(f".//{ns}loc"):
+                if sitemap.text:
+                    sub_urls = _extract_urls_from_sitemap(sitemap.text)
+                    urls.update(sub_urls)
+        else:
+            # This is a regular sitemap
+            for url in root.findall(f".//{ns}loc"):
+                if url.text:
+                    urls.add(url.text)
+    except Exception as e:
+        logger.warning(f"Error processing sitemap {sitemap_url}: {e}")
+    return urls


 def list_pages_for_site(site: str) -> list[str]:
-    rp: robotparser.RobotFileParser | None = None
-    try:
-        rp = init_robots_txt(site)
-    except Exception:
-        logger.warning("Failed to load robots.txt")
+    """Get list of pages from a site's sitemaps"""
+    site = site.rstrip("/")
+    all_urls = set()

-    tree = sitemap_tree_for_homepage(site)
+    # Try both common sitemap locations
+    sitemap_paths = ["/sitemap.xml", "/sitemap_index.xml"]
+    for path in sitemap_paths:
+        sitemap_url = urljoin(site, path)
+        all_urls.update(_extract_urls_from_sitemap(sitemap_url))

-    pages = [page.url for page in tree.all_pages() if test_url(rp, page.url)]
-    pages = list(dict.fromkeys(pages))
+    # Check robots.txt for additional sitemaps
+    sitemap_locations = _get_sitemap_locations_from_robots(site)
+    for sitemap_url in sitemap_locations:
+        all_urls.update(_extract_urls_from_sitemap(sitemap_url))

-    return pages
+    return list(all_urls)
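
For reference, a self-contained sketch of the namespace handling that _extract_urls_from_sitemap relies on: ElementTree reports tags as "{namespace}tag", so the lookup for <loc> elements has to carry the same "{...}" prefix that the regex above captures. The sitemap XML below is made-up sample data and only the standard library is used.

# Standalone sketch of the sitemap namespace handling; the XML is invented sample data.
import re
import xml.etree.ElementTree as ET

sample_sitemap = b"""<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url><loc>https://example.com/docs/page-1</loc></url>
  <url><loc>https://example.com/docs/page-2</loc></url>
</urlset>"""

root = ET.fromstring(sample_sitemap)

# Tags come back as "{namespace}urlset"; capture the "{...}" prefix.
namespace = re.match(r"\{.*\}", root.tag)
ns = namespace.group(0) if namespace else ""

if root.tag == f"{ns}sitemapindex":
    print("sitemap index: recurse into each <loc> entry")
else:
    urls = {loc.text for loc in root.findall(f".//{ns}loc") if loc.text}
    print(sorted(urls))  # ['https://example.com/docs/page-1', 'https://example.com/docs/page-2']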

View File

@@ -61,7 +61,6 @@ requests==2.32.2
requests-oauthlib==1.3.1
retry==0.9.2 # This pulls in py which is in CVE-2022-42969, must remove py from image
rfc3986==1.5.0
rt==3.1.2
simple-salesforce==1.12.6
slack-sdk==3.20.2
SQLAlchemy[mypy]==2.0.15
@@ -79,7 +78,6 @@ asana==5.0.8
zenpy==2.0.41
dropbox==11.36.2
boto3-stubs[s3]==1.34.133
ultimate_sitemap_parser==0.5
stripe==10.12.0
urllib3==2.2.3
mistune==0.8.4

Binary file not shown (image removed; 17 KiB before deletion).

View File

@@ -62,7 +62,6 @@ import OCIStorageSVG from "../../../public/OCI.svg";
import googleCloudStorageIcon from "../../../public/GoogleCloudStorage.png";
import guruIcon from "../../../public/Guru.svg";
import gongIcon from "../../../public/Gong.png";
import requestTrackerIcon from "../../../public/RequestTracker.png";
import zulipIcon from "../../../public/Zulip.png";
import linearIcon from "../../../public/Linear.png";
import hubSpotIcon from "../../../public/HubSpot.png";
@@ -1178,13 +1177,6 @@ export const GuruIcon = ({
className = defaultTailwindCSS,
}: IconProps) => <LogoIcon size={size} className={className} src={guruIcon} />;
export const RequestTrackerIcon = ({
size = 16,
className = defaultTailwindCSS,
}: IconProps) => (
<LogoIcon size={size} className={className} src={requestTrackerIcon} />
);
export const SalesforceIcon = ({
size = 16,
className = defaultTailwindCSS,

View File

@@ -552,11 +552,6 @@ For example, specifying .*-support.* as a "channel" will cause the connector to
],
advanced_values: [],
},
requesttracker: {
description: "Configure HubSpot connector",
values: [],
advanced_values: [],
},
hubspot: {
description: "Configure HubSpot connector",
values: [],
@@ -1116,8 +1111,6 @@ export interface NotionConfig {
export interface HubSpotConfig {}
export interface RequestTrackerConfig {}
export interface Document360Config {
workspace: string;
categories?: string[];

View File

@@ -106,12 +106,6 @@ export interface HubSpotCredentialJson {
hubspot_access_token: string;
}
export interface RequestTrackerCredentialJson {
requesttracker_username: string;
requesttracker_password: string;
requesttracker_base_url: string;
}
export interface Document360CredentialJson {
portal_id: string;
document360_api_token: string;
@@ -224,11 +218,6 @@ export const credentialTemplates: Record<ValidSources, any> = {
portal_id: "",
document360_api_token: "",
} as Document360CredentialJson,
requesttracker: {
requesttracker_username: "",
requesttracker_password: "",
requesttracker_base_url: "",
} as RequestTrackerCredentialJson,
loopio: {
loopio_subdomain: "",
loopio_client_id: "",
@@ -371,12 +360,6 @@ export const credentialDisplayNames: Record<string, string> = {
// HubSpot
hubspot_access_token: "HubSpot Access Token",
// Request Tracker
requesttracker_username: "Request Tracker Username",
requesttracker_password: "Request Tracker Password",
requesttracker_base_url: "Request Tracker Base URL",
// Document360
portal_id: "Document360 Portal ID",
document360_api_token: "Document360 API Token",

View File

@@ -21,7 +21,6 @@ import {
LoopioIcon,
NotionIcon,
ProductboardIcon,
RequestTrackerIcon,
R2Icon,
SalesforceIcon,
SharepointIcon,
@@ -243,12 +242,6 @@ const SOURCE_METADATA_MAP: SourceMap = {
category: SourceCategory.Wiki,
docs: "https://docs.danswer.dev/connectors/mediawiki",
},
requesttracker: {
icon: RequestTrackerIcon,
displayName: "Request Tracker",
category: SourceCategory.CustomerSupport,
docs: "https://docs.danswer.dev/connectors/requesttracker",
},
clickup: {
icon: ClickupIcon,
displayName: "Clickup",

View File

@@ -241,7 +241,6 @@ const validSources = [
"linear",
"hubspot",
"document360",
"requesttracker",
"file",
"google_sites",
"loopio",