remove rt + home-grown sitemap parsing (#2933)

* remove rt

* nit

* add minor alembic revision

* functional migration

* replace usp

* k

* typing
Author: pablodanswer
Date: 2024-10-26 14:58:42 -07:00
Committed by: GitHub
Parent: aa0f307cc7
Commit: 088551a4ef

11 changed files with 261 additions and 221 deletions


@@ -0,0 +1,74 @@
"""remove rt

Revision ID: 949b4a92a401
Revises: 1b10e1fda030
Create Date: 2024-10-26 13:06:06.937969

"""
from alembic import op
from sqlalchemy.orm import Session

# Import your models and constants
from danswer.db.models import (
    Connector,
    ConnectorCredentialPair,
    Credential,
    IndexAttempt,
)
from danswer.configs.constants import DocumentSource

# revision identifiers, used by Alembic.
revision = "949b4a92a401"
down_revision = "1b10e1fda030"
branch_labels = None
depends_on = None


def upgrade() -> None:
    # Deletes all RequestTracker connectors and associated data
    bind = op.get_bind()
    session = Session(bind=bind)

    connectors_to_delete = (
        session.query(Connector)
        .filter(Connector.source == DocumentSource.REQUESTTRACKER)
        .all()
    )
    connector_ids = [connector.id for connector in connectors_to_delete]

    if connector_ids:
        cc_pairs_to_delete = (
            session.query(ConnectorCredentialPair)
            .filter(ConnectorCredentialPair.connector_id.in_(connector_ids))
            .all()
        )
        cc_pair_ids = [cc_pair.id for cc_pair in cc_pairs_to_delete]

        if cc_pair_ids:
            session.query(IndexAttempt).filter(
                IndexAttempt.connector_credential_pair_id.in_(cc_pair_ids)
            ).delete(synchronize_session=False)
            session.query(ConnectorCredentialPair).filter(
                ConnectorCredentialPair.id.in_(cc_pair_ids)
            ).delete(synchronize_session=False)

        credential_ids = [cc_pair.credential_id for cc_pair in cc_pairs_to_delete]
        if credential_ids:
            session.query(Credential).filter(Credential.id.in_(credential_ids)).delete(
                synchronize_session=False
            )

    session.query(Connector).filter(Connector.id.in_(connector_ids)).delete(
        synchronize_session=False
    )
    session.commit()


def downgrade() -> None:
    # No-op downgrade as we cannot restore deleted data
    pass
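
Because the downgrade is a no-op, it is worth confirming the cleanup actually ran after `alembic upgrade head`. A minimal sanity-check sketch, assuming the same models are importable and a reachable database (the connection URL below is a placeholder, not from this commit):

from sqlalchemy import create_engine
from sqlalchemy.orm import Session

from danswer.configs.constants import DocumentSource
from danswer.db.models import Connector

# Placeholder URL -- point at the same database Alembic targets
engine = create_engine("postgresql://user:pass@localhost:5432/danswer")

with Session(engine) as session:
    remaining = (
        session.query(Connector)
        .filter(Connector.source == DocumentSource.REQUESTTRACKER)
        .count()
    )
    # Should print 0 once this revision has been applied
    print(f"RequestTracker connectors remaining: {remaining}")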


@@ -34,7 +34,6 @@ from danswer.connectors.mediawiki.wiki import MediaWikiConnector
 from danswer.connectors.models import InputType
 from danswer.connectors.notion.connector import NotionConnector
 from danswer.connectors.productboard.connector import ProductboardConnector
-from danswer.connectors.requesttracker.connector import RequestTrackerConnector
 from danswer.connectors.salesforce.connector import SalesforceConnector
 from danswer.connectors.sharepoint.connector import SharepointConnector
 from danswer.connectors.slab.connector import SlabConnector
@@ -77,7 +76,6 @@ def identify_connector_class(
         DocumentSource.SLAB: SlabConnector,
         DocumentSource.NOTION: NotionConnector,
         DocumentSource.ZULIP: ZulipConnector,
-        DocumentSource.REQUESTTRACKER: RequestTrackerConnector,
         DocumentSource.GURU: GuruConnector,
         DocumentSource.LINEAR: LinearConnector,
         DocumentSource.HUBSPOT: HubSpotConnector,
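
For context, `identify_connector_class` dispatches on a `DocumentSource`-to-class mapping, which is why dropping a connector is a one-line deletion here. A simplified sketch of the pattern (names and signature are illustrative, not Danswer's actual code):

from enum import Enum


class DocumentSource(str, Enum):
    NOTION = "notion"
    ZULIP = "zulip"


class NotionConnector:
    pass


class ZulipConnector:
    pass


# Single registry: removing a source removes its dispatch entry
CONNECTOR_BY_SOURCE: dict[DocumentSource, type] = {
    DocumentSource.NOTION: NotionConnector,
    DocumentSource.ZULIP: ZulipConnector,
}


def identify_connector_class(source: DocumentSource) -> type:
    try:
        return CONNECTOR_BY_SOURCE[source]
    except KeyError:
        raise ValueError(f"No connector registered for source {source}")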


@@ -1,153 +1,124 @@
The connector was disabled in place: the new revision of this file is the previous implementation below with every line commented out (prefixed with "#"). Previous implementation:

from datetime import datetime
from datetime import timezone
from logging import DEBUG as LOG_LVL_DEBUG
from typing import Any
from typing import List
from typing import Optional

from rt.rest1 import ALL_QUEUES
from rt.rest1 import Rt

from danswer.configs.app_configs import INDEX_BATCH_SIZE
from danswer.configs.constants import DocumentSource
from danswer.connectors.interfaces import GenerateDocumentsOutput
from danswer.connectors.interfaces import PollConnector
from danswer.connectors.interfaces import SecondsSinceUnixEpoch
from danswer.connectors.models import ConnectorMissingCredentialError
from danswer.connectors.models import Document
from danswer.connectors.models import Section
from danswer.utils.logger import setup_logger

logger = setup_logger()


class RequestTrackerError(Exception):
    pass


class RequestTrackerConnector(PollConnector):
    def __init__(
        self,
        batch_size: int = INDEX_BATCH_SIZE,
    ) -> None:
        self.batch_size = batch_size

    def txn_link(self, tid: int, txn: int) -> str:
        return f"{self.rt_base_url}/Ticket/Display.html?id={tid}&txn={txn}"

    def build_doc_sections_from_txn(
        self, connection: Rt, ticket_id: int
    ) -> List[Section]:
        Sections: List[Section] = []

        get_history_resp = connection.get_history(ticket_id)

        if get_history_resp is None:
            raise RequestTrackerError(f"Ticket {ticket_id} cannot be found")

        for tx in get_history_resp:
            Sections.append(
                Section(
                    link=self.txn_link(ticket_id, int(tx["id"])),
                    text="\n".join(
                        [
                            f"{k}:\n{v}\n" if k != "Attachments" else ""
                            for (k, v) in tx.items()
                        ]
                    ),
                )
            )
        return Sections

    def load_credentials(self, credentials: dict[str, Any]) -> Optional[dict[str, Any]]:
        self.rt_username = credentials.get("requesttracker_username")
        self.rt_password = credentials.get("requesttracker_password")
        self.rt_base_url = credentials.get("requesttracker_base_url")
        return None

    # This does not include RT file attachments yet.
    def _process_tickets(
        self, start: datetime, end: datetime
    ) -> GenerateDocumentsOutput:
        if any([self.rt_username, self.rt_password, self.rt_base_url]) is None:
            raise ConnectorMissingCredentialError("requesttracker")

        Rt0 = Rt(
            f"{self.rt_base_url}/REST/1.0/",
            self.rt_username,
            self.rt_password,
        )

        Rt0.login()

        d0 = start.strftime("%Y-%m-%d %H:%M:%S")
        d1 = end.strftime("%Y-%m-%d %H:%M:%S")

        tickets = Rt0.search(
            Queue=ALL_QUEUES,
            raw_query=f"Updated > '{d0}' AND Updated < '{d1}'",
        )

        doc_batch: List[Document] = []

        for ticket in tickets:
            ticket_keys_to_omit = ["id", "Subject"]
            tid: int = int(ticket["numerical_id"])
            ticketLink: str = f"{self.rt_base_url}/Ticket/Display.html?id={tid}"
            logger.info(f"Processing ticket {tid}")
            doc = Document(
                id=ticket["id"],
                # Will add title to the first section later in processing
                sections=[Section(link=ticketLink, text="")]
                + self.build_doc_sections_from_txn(Rt0, tid),
                source=DocumentSource.REQUESTTRACKER,
                semantic_identifier=ticket["Subject"],
                metadata={
                    key: value
                    for key, value in ticket.items()
                    if key not in ticket_keys_to_omit
                },
            )

            doc_batch.append(doc)

            if len(doc_batch) >= self.batch_size:
                yield doc_batch
                doc_batch = []

        if doc_batch:
            yield doc_batch

    def poll_source(
        self, start: SecondsSinceUnixEpoch, end: SecondsSinceUnixEpoch
    ) -> GenerateDocumentsOutput:
        # Keep query short, only look behind 1 day at maximum
        one_day_ago: float = end - (24 * 60 * 60)
        _start: float = start if start > one_day_ago else one_day_ago
        start_datetime = datetime.fromtimestamp(_start, tz=timezone.utc)
        end_datetime = datetime.fromtimestamp(end, tz=timezone.utc)
        yield from self._process_tickets(start_datetime, end_datetime)


if __name__ == "__main__":
    import time
    import os
    from dotenv import load_dotenv

    load_dotenv()
    logger.setLevel(LOG_LVL_DEBUG)
    rt_connector = RequestTrackerConnector()
    rt_connector.load_credentials(
        {
            "requesttracker_username": os.getenv("RT_USERNAME"),
            "requesttracker_password": os.getenv("RT_PASSWORD"),
            "requesttracker_base_url": os.getenv("RT_BASE_URL"),
        }
    )
    current = time.time()
    one_day_ago = current - (24 * 60 * 60)  # 1 days
    latest_docs = rt_connector.poll_source(one_day_ago, current)
    for doc in latest_docs:
        print(doc)


@@ -1,39 +1,78 @@
The file was rewritten wholesale, replacing the ultimate_sitemap_parser (usp) and robotparser-based implementation with a home-grown parser built on requests and xml.etree.ElementTree.

Previous version:

from datetime import datetime
from urllib import robotparser

from usp.tree import sitemap_tree_for_homepage  # type: ignore

from danswer.utils.logger import setup_logger

logger = setup_logger()


def test_url(rp: robotparser.RobotFileParser | None, url: str) -> bool:
    if not rp:
        return True
    else:
        return rp.can_fetch("*", url)


def init_robots_txt(site: str) -> robotparser.RobotFileParser:
    ts = datetime.now().timestamp()
    robots_url = f"{site}/robots.txt?ts={ts}"
    rp = robotparser.RobotFileParser()
    rp.set_url(robots_url)
    rp.read()
    return rp


def list_pages_for_site(site: str) -> list[str]:
    rp: robotparser.RobotFileParser | None = None
    try:
        rp = init_robots_txt(site)
    except Exception:
        logger.warning("Failed to load robots.txt")

    tree = sitemap_tree_for_homepage(site)

    pages = [page.url for page in tree.all_pages() if test_url(rp, page.url)]
    pages = list(dict.fromkeys(pages))

    return pages

New version:

import re
import xml.etree.ElementTree as ET
from typing import Set
from urllib.parse import urljoin

import requests

from danswer.utils.logger import setup_logger

logger = setup_logger()


def _get_sitemap_locations_from_robots(base_url: str) -> Set[str]:
    """Extract sitemap URLs from robots.txt"""
    sitemap_urls: set = set()
    try:
        robots_url = urljoin(base_url, "/robots.txt")
        resp = requests.get(robots_url, timeout=10)
        if resp.status_code == 200:
            for line in resp.text.splitlines():
                if line.lower().startswith("sitemap:"):
                    sitemap_url = line.split(":", 1)[1].strip()
                    sitemap_urls.add(sitemap_url)
    except Exception as e:
        logger.warning(f"Error fetching robots.txt: {e}")
    return sitemap_urls


def _extract_urls_from_sitemap(sitemap_url: str) -> Set[str]:
    """Extract URLs from a sitemap XML file"""
    urls: set[str] = set()
    try:
        resp = requests.get(sitemap_url, timeout=10)
        if resp.status_code != 200:
            return urls

        root = ET.fromstring(resp.content)

        # Handle both regular sitemaps and sitemap indexes
        # Remove namespace for easier parsing
        namespace = re.match(r"\{.*\}", root.tag)
        ns = namespace.group(0) if namespace else ""

        if root.tag == f"{ns}sitemapindex":
            # This is a sitemap index
            for sitemap in root.findall(f".//{ns}loc"):
                if sitemap.text:
                    sub_urls = _extract_urls_from_sitemap(sitemap.text)
                    urls.update(sub_urls)
        else:
            # This is a regular sitemap
            for url in root.findall(f".//{ns}loc"):
                if url.text:
                    urls.add(url.text)
    except Exception as e:
        logger.warning(f"Error processing sitemap {sitemap_url}: {e}")
    return urls


def list_pages_for_site(site: str) -> list[str]:
    """Get list of pages from a site's sitemaps"""
    site = site.rstrip("/")
    all_urls = set()

    # Try both common sitemap locations
    sitemap_paths = ["/sitemap.xml", "/sitemap_index.xml"]
    for path in sitemap_paths:
        sitemap_url = urljoin(site, path)
        all_urls.update(_extract_urls_from_sitemap(sitemap_url))

    # Check robots.txt for additional sitemaps
    sitemap_locations = _get_sitemap_locations_from_robots(site)
    for sitemap_url in sitemap_locations:
        all_urls.update(_extract_urls_from_sitemap(sitemap_url))

    return list(all_urls)
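
The `re.match(r"\{.*\}", root.tag)` line exists because ElementTree folds the XML namespace into every tag name (e.g. `{http://www.sitemaps.org/schemas/sitemap/0.9}urlset`), so lookups must carry the same prefix. A self-contained sketch of that behavior (the sitemap document below is made up for illustration):

import re
import xml.etree.ElementTree as ET

# Minimal, made-up sitemap document for illustration
SITEMAP_XML = b"""<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url><loc>https://example.com/</loc></url>
  <url><loc>https://example.com/docs</loc></url>
</urlset>"""

root = ET.fromstring(SITEMAP_XML)
print(root.tag)  # {http://www.sitemaps.org/schemas/sitemap/0.9}urlset

# Same namespace-stripping trick used by _extract_urls_from_sitemap above
namespace = re.match(r"\{.*\}", root.tag)
ns = namespace.group(0) if namespace else ""

urls = {loc.text for loc in root.findall(f".//{ns}loc") if loc.text}
print(urls)  # {'https://example.com/', 'https://example.com/docs'}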


@@ -61,7 +61,6 @@ requests==2.32.2
 requests-oauthlib==1.3.1
 retry==0.9.2  # This pulls in py which is in CVE-2022-42969, must remove py from image
 rfc3986==1.5.0
-rt==3.1.2
 simple-salesforce==1.12.6
 slack-sdk==3.20.2
 SQLAlchemy[mypy]==2.0.15
@@ -79,7 +78,6 @@ asana==5.0.8
 zenpy==2.0.41
 dropbox==11.36.2
 boto3-stubs[s3]==1.34.133
-ultimate_sitemap_parser==0.5
 stripe==10.12.0
 urllib3==2.2.3
 mistune==0.8.4

(Binary file not shown: a 17 KiB image was removed in this commit.)


@@ -62,7 +62,6 @@ import OCIStorageSVG from "../../../public/OCI.svg";
 import googleCloudStorageIcon from "../../../public/GoogleCloudStorage.png";
 import guruIcon from "../../../public/Guru.svg";
 import gongIcon from "../../../public/Gong.png";
-import requestTrackerIcon from "../../../public/RequestTracker.png";
 import zulipIcon from "../../../public/Zulip.png";
 import linearIcon from "../../../public/Linear.png";
 import hubSpotIcon from "../../../public/HubSpot.png";
@@ -1178,13 +1177,6 @@ export const GuruIcon = ({
   className = defaultTailwindCSS,
 }: IconProps) => <LogoIcon size={size} className={className} src={guruIcon} />;

-export const RequestTrackerIcon = ({
-  size = 16,
-  className = defaultTailwindCSS,
-}: IconProps) => (
-  <LogoIcon size={size} className={className} src={requestTrackerIcon} />
-);
-
 export const SalesforceIcon = ({
   size = 16,
   className = defaultTailwindCSS,


@@ -552,11 +552,6 @@ For example, specifying .*-support.* as a "channel" will cause the connector to
     ],
     advanced_values: [],
   },
-  requesttracker: {
-    description: "Configure HubSpot connector",
-    values: [],
-    advanced_values: [],
-  },
   hubspot: {
     description: "Configure HubSpot connector",
     values: [],
@@ -1116,8 +1111,6 @@ export interface NotionConfig {
 export interface HubSpotConfig {}

-export interface RequestTrackerConfig {}
-
 export interface Document360Config {
   workspace: string;
   categories?: string[];


@@ -106,12 +106,6 @@ export interface HubSpotCredentialJson {
   hubspot_access_token: string;
 }

-export interface RequestTrackerCredentialJson {
-  requesttracker_username: string;
-  requesttracker_password: string;
-  requesttracker_base_url: string;
-}
-
 export interface Document360CredentialJson {
   portal_id: string;
   document360_api_token: string;
@@ -224,11 +218,6 @@ export const credentialTemplates: Record<ValidSources, any> = {
     portal_id: "",
     document360_api_token: "",
   } as Document360CredentialJson,
-  requesttracker: {
-    requesttracker_username: "",
-    requesttracker_password: "",
-    requesttracker_base_url: "",
-  } as RequestTrackerCredentialJson,
   loopio: {
     loopio_subdomain: "",
     loopio_client_id: "",
@@ -371,12 +360,6 @@ export const credentialDisplayNames: Record<string, string> = {
   // HubSpot
   hubspot_access_token: "HubSpot Access Token",

-  // Request Tracker
-  requesttracker_username: "Request Tracker Username",
-  requesttracker_password: "Request Tracker Password",
-  requesttracker_base_url: "Request Tracker Base URL",
-
   // Document360
   portal_id: "Document360 Portal ID",
   document360_api_token: "Document360 API Token",


@@ -21,7 +21,6 @@ import {
   LoopioIcon,
   NotionIcon,
   ProductboardIcon,
-  RequestTrackerIcon,
   R2Icon,
   SalesforceIcon,
   SharepointIcon,
@@ -243,12 +242,6 @@ const SOURCE_METADATA_MAP: SourceMap = {
     category: SourceCategory.Wiki,
     docs: "https://docs.danswer.dev/connectors/mediawiki",
   },
-  requesttracker: {
-    icon: RequestTrackerIcon,
-    displayName: "Request Tracker",
-    category: SourceCategory.CustomerSupport,
-    docs: "https://docs.danswer.dev/connectors/requesttracker",
-  },
   clickup: {
     icon: ClickupIcon,
     displayName: "Clickup",


@@ -241,7 +241,6 @@ const validSources = [
   "linear",
   "hubspot",
   "document360",
-  "requesttracker",
   "file",
   "google_sites",
   "loopio",