Create connector

This commit is contained in:
Skylar Kesselring 2024-11-05 19:28:57 -05:00
parent 5f5cc9a724
commit 7ff18e0a93
12 changed files with 223 additions and 3 deletions

View File

@ -126,6 +126,7 @@ class DocumentSource(str, Enum):
XENFORO = "xenforo"
NOT_APPLICABLE = "not_applicable"
FRESHDESK = "freshdesk"
FIREFLIES = "fireflies"
DocumentSourceRequiringTenantContext: list[DocumentSource] = [DocumentSource.FILE]

View File

@ -16,6 +16,7 @@ from danswer.connectors.discourse.connector import DiscourseConnector
from danswer.connectors.document360.connector import Document360Connector
from danswer.connectors.dropbox.connector import DropboxConnector
from danswer.connectors.file.connector import LocalFileConnector
from danswer.connectors.fireflies.connector import FirefliesConnector
from danswer.connectors.freshdesk.connector import FreshdeskConnector
from danswer.connectors.github.connector import GithubConnector
from danswer.connectors.gitlab.connector import GitlabConnector
@ -101,6 +102,7 @@ def identify_connector_class(
DocumentSource.OCI_STORAGE: BlobStorageConnector,
DocumentSource.XENFORO: XenforoConnector,
DocumentSource.FRESHDESK: FreshdeskConnector,
DocumentSource.FIREFLIES: FirefliesConnector,
}
connector_by_source = connector_map.get(source, {})

View File

@ -0,0 +1,179 @@
# TODO: Fix the transcript text parsing for the document
# TODO: Remove the host email from the secondary owners
# TODO: Decide whether to use the semantic identifier or the title
# TODO: Fix date parsing in graphql query
# TODO: Fix credentials loading
from collections.abc import Iterator
from datetime import datetime
from datetime import timezone
from typing import List
import requests
from danswer.configs.app_configs import INDEX_BATCH_SIZE
from danswer.configs.constants import DocumentSource
from danswer.connectors.interfaces import GenerateDocumentsOutput
from danswer.connectors.interfaces import LoadConnector
from danswer.connectors.interfaces import PollConnector
from danswer.connectors.interfaces import SecondsSinceUnixEpoch
from danswer.connectors.models import BasicExpertInfo
from danswer.connectors.models import ConnectorMissingCredentialError
from danswer.connectors.models import Document
from danswer.connectors.models import Section
from danswer.utils.logger import setup_logger
logger = setup_logger()
_FIREFLIES_ID_PREFIX = "FIREFLIES_"
_FIREFLIES_API_URL = "https://api.fireflies.ai/graphql"
_FIREFLIES_API_HEADERS = {"Content-Type": "application/json", "Authorization": ""}
def _create_doc_from_transcript(transcript: dict) -> Document:
    """Convert one Fireflies transcript payload into a ``Document``.

    Args:
        transcript: A single entry of the ``transcripts`` list returned by the
            Fireflies GraphQL API. The keys read here are ``id``, ``title``,
            ``date`` (treated as epoch milliseconds), ``host_email``,
            ``participants``, ``transcript_url`` and ``sentences`` (a list of
            dicts with ``speaker_name`` and ``text``).

    Returns:
        A ``Document`` with a single section holding the whole meeting as
        ``"<speaker>: <text>"`` paragraphs. The host is the primary owner and
        every other participant is a secondary owner.
    """
    # Fields may be absent or explicitly null, so ``or`` is used rather than
    # ``.get`` defaults (a default is not applied to a present-but-null key).
    sentences = transcript.get("sentences") or []
    meeting_text = "".join(
        f"{sentence.get('speaker_name') or 'Unknown Speaker'}: "
        f"{sentence.get('text') or ''}\n\n"
        for sentence in sentences
    )

    link = transcript.get("transcript_url") or ""
    doc_id = _FIREFLIES_ID_PREFIX + str(transcript.get("id") or "")
    title = transcript.get("title") or ""

    # ``date`` is divided by 1000 to turn milliseconds into seconds. A missing
    # or non-numeric value leaves the document without an update time instead
    # of raising a TypeError.
    meeting_date_ms = transcript.get("date")
    meeting_date = (
        datetime.fromtimestamp(meeting_date_ms / 1000, tz=timezone.utc)
        if isinstance(meeting_date_ms, (int, float))
        else None
    )

    host_email = transcript.get("host_email") or ""
    primary_owners = [BasicExpertInfo(email=host_email)] if host_email else []

    # The host is already the primary owner, so it is left out of the
    # secondary owners; blank participant entries are dropped as well.
    secondary_owners = [
        BasicExpertInfo(email=participant)
        for participant in transcript.get("participants") or []
        if participant and participant.lower() != host_email.lower()
    ]

    return Document(
        id=doc_id,
        sections=[
            Section(
                link=link,
                text=meeting_text,
            )
        ],
        source=DocumentSource.FIREFLIES,
        semantic_identifier=title,
        metadata={},
        doc_updated_at=meeting_date,
        primary_owners=primary_owners,
        secondary_owners=secondary_owners,
    )
class FirefliesConnector(PollConnector, LoadConnector):
    """Connector that indexes Fireflies meeting transcripts.

    Transcripts are fetched page by page from the Fireflies GraphQL API and
    converted into documents with ``_create_doc_from_transcript``.
    ``load_credentials`` must be called before any fetch.
    """

    # Page size requested per API call.
    # NOTE(review): 50 is the maximum the Fireflies docs list for ``limit``;
    # confirm against the current API reference.
    _TRANSCRIPT_PAGE_SIZE = 50

    # Upper bound on a single HTTP call so a stalled connection cannot hang
    # the indexing run forever.
    _REQUEST_TIMEOUT_SECONDS = 60

    # Paging and date bounds are passed as GraphQL variables, so the query
    # text is constant and nothing is interpolated into it.
    _TRANSCRIPTS_QUERY = """
        query Transcripts(
            $fromDate: DateTime
            $toDate: DateTime
            $limit: Int
            $skip: Int
        ) {
            transcripts(
                fromDate: $fromDate
                toDate: $toDate
                limit: $limit
                skip: $skip
            ) {
                title
                id
                date
                host_email
                participants
                transcript_url
                sentences {
                    text
                    speaker_name
                }
            }
        }
    """

    def __init__(self, batch_size: int = INDEX_BATCH_SIZE) -> None:
        """Store the batch size; the API key is set by ``load_credentials``."""
        self.batch_size = batch_size
        # Initialised here so the "missing credentials" check in
        # ``_fetch_transcripts`` works instead of raising AttributeError.
        self.api_key: str | None = None

    def load_credentials(self, credentials: dict[str, str | int]) -> None:
        """Read the Fireflies API key from ``credentials``.

        Raises:
            ConnectorMissingCredentialError: If ``fireflies_api_key`` is
                missing or is not a string.
        """
        api_key = credentials.get("fireflies_api_key")
        if not isinstance(api_key, str):
            raise ConnectorMissingCredentialError(
                "The Fireflies API key must be a string"
            )
        self.api_key = api_key

    @staticmethod
    def _format_api_datetime(value: datetime) -> str:
        """Render ``value`` as a UTC ISO-8601 string with a ``Z`` suffix."""
        return value.astimezone(timezone.utc).strftime("%Y-%m-%dT%H:%M:%S.000Z")

    def _fetch_transcripts(
        self, start: datetime | None = None, end: datetime | None = None
    ) -> Iterator[List[dict]]:
        """Yield pages of raw transcript dicts, optionally bounded by date.

        Args:
            start: Only transcripts from this moment on are requested.
            end: Only transcripts up to this moment are requested.

        Raises:
            ConnectorMissingCredentialError: If no API key has been loaded.
            requests.HTTPError: If the API answers with an error status.
            RuntimeError: If the API reports GraphQL errors and no data.
        """
        if self.api_key is None:
            raise ConnectorMissingCredentialError("Missing API key")

        headers = _FIREFLIES_API_HEADERS.copy()
        headers["Authorization"] = f"Bearer {self.api_key}"

        limit = self._TRANSCRIPT_PAGE_SIZE
        variables: dict[str, str | int] = {"limit": limit, "skip": 0}
        if start is not None:
            variables["fromDate"] = self._format_api_datetime(start)
        if end is not None:
            variables["toDate"] = self._format_api_datetime(end)

        skip = 0
        while True:
            # The offset is refreshed on every iteration so each request
            # really asks for the next page.
            variables["skip"] = skip
            response = requests.post(
                _FIREFLIES_API_URL,
                headers=headers,
                json={"query": self._TRANSCRIPTS_QUERY, "variables": variables},
                timeout=self._REQUEST_TIMEOUT_SECONDS,
            )
            response.raise_for_status()
            # A 204 response has no body to parse.
            if response.status_code == 204:
                break

            payload = response.json()
            data = payload.get("data") or {}
            errors = payload.get("errors")
            if errors and not data:
                # A GraphQL failure can arrive with HTTP 200; surface it
                # rather than treating it as "no transcripts".
                raise RuntimeError(f"Fireflies API returned errors: {errors}")

            transcripts = data.get("transcripts") or []
            if not transcripts:
                break

            yield transcripts

            # A short page means there is nothing left to fetch.
            if len(transcripts) < limit:
                break
            skip += limit

    def _process_transcripts(
        self, start: datetime | None = None, end: datetime | None = None
    ) -> GenerateDocumentsOutput:
        """Yield documents in batches of at most ``self.batch_size``."""
        doc_batch: List[Document] = []

        for transcript_batch in self._fetch_transcripts(start, end):
            for transcript in transcript_batch:
                doc_batch.append(_create_doc_from_transcript(transcript))

                if len(doc_batch) >= self.batch_size:
                    yield doc_batch
                    doc_batch = []

        # Flush the final, partially filled batch.
        if doc_batch:
            yield doc_batch

    def load_from_state(self) -> GenerateDocumentsOutput:
        """Index every transcript visible to the API key."""
        return self._process_transcripts()

    def poll_source(
        self, start: SecondsSinceUnixEpoch, end: SecondsSinceUnixEpoch
    ) -> GenerateDocumentsOutput:
        """Index transcripts dated between ``start`` and ``end`` (epoch seconds)."""
        start_datetime = datetime.fromtimestamp(start, tz=timezone.utc)
        end_datetime = datetime.fromtimestamp(end, tz=timezone.utc)

        yield from self._process_transcripts(start_datetime, end_datetime)

View File

@ -28,7 +28,8 @@ os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["HF_HUB_DISABLE_TELEMETRY"] = "1"
HF_CACHE_PATH = Path("/root/.cache/huggingface/")
TEMP_HF_CACHE_PATH = Path("/root/.cache/temp_huggingface/")
# Changed this to be compatible with Linux
TEMP_HF_CACHE_PATH = Path.home() / ".cache" / "temp_huggingface"
transformer_logging.set_verbosity_error()

BIN
web/public/Fireflies.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 10 KiB

View File

@ -75,9 +75,13 @@ async function handleRequest(request: NextRequest, path: string[]) {
backendUrl.searchParams.append(key, value);
});
// Added this to allow it to run properly on Linux
const headers = new Headers(request.headers);
headers.delete("connection");
const response = await fetch(backendUrl, {
method: request.method,
headers: request.headers,
headers: headers,
body: request.body,
signal: request.signal,
// @ts-ignore

View File

@ -75,6 +75,7 @@ import s3Icon from "../../../public/S3.png";
import r2Icon from "../../../public/r2.png";
import salesforceIcon from "../../../public/Salesforce.png";
import freshdeskIcon from "../../../public/Freshdesk.png";
import firefliesIcon from "../../../public/Fireflies.png";
import sharepointIcon from "../../../public/Sharepoint.png";
import teamsIcon from "../../../public/Teams.png";
@ -88,6 +89,7 @@ import voyageIcon from "../../../public/Voyage.png";
import googleIcon from "../../../public/Google.webp";
import xenforoIcon from "../../../public/Xenforo.svg";
import { FaRobot } from "react-icons/fa";
import { size } from "lodash";
export interface IconProps {
size?: number;
@ -1301,6 +1303,13 @@ export const FreshdeskIcon = ({
<LogoIcon size={size} className={className} src={freshdeskIcon} />
);
// Fireflies logo rendered through LogoIcon; size defaults to 16 and
// className defaults to defaultTailwindCSS.
export const FirefliesIcon = ({
size = 16,
className = defaultTailwindCSS,
}: IconProps) => (
<LogoIcon size={size} className={className} src={firefliesIcon} />
);
/*
EE Icons
*/

View File

@ -950,7 +950,11 @@ For example, specifying .*-support.* as a "channel" will cause the connector to
values: [],
advanced_values: [],
},
fireflies: {
description: "Configure Fireflies connector",
values: [],
advanced_values: [],
},
};
export function createConnectorInitialValues(
connector: ConfigurableSources
@ -1210,6 +1214,7 @@ export interface AsanaConfig {
export interface FreshdeskConfig {}
export interface FirefliesConfig {}
export interface MediaWikiConfig extends MediaWikiBaseConfig {
hostname: string;

View File

@ -187,6 +187,10 @@ export interface FreshdeskCredentialJson {
freshdesk_api_key: string;
}
export interface FirefliesCredentialJson {
fireflies_api_key: string;
}
export interface MediaWikiCredentialJson {}
export interface WikipediaCredentialJson extends MediaWikiCredentialJson {}
@ -290,6 +294,9 @@ export const credentialTemplates: Record<ValidSources, any> = {
freshdesk_password: "",
freshdesk_api_key: "",
} as FreshdeskCredentialJson,
fireflies: {
fireflies_api_key: "",
} as FirefliesCredentialJson,
xenforo: null,
google_sites: null,
file: null,
@ -435,7 +442,11 @@ export const credentialDisplayNames: Record<string, string> = {
freshdesk_domain: "Freshdesk Domain",
freshdesk_password: "Freshdesk Password",
freshdesk_api_key: "Freshdesk API Key",
// Fireflies
fireflies_api_key: "Fireflies API Key",
};
// Returns the label registered for a credential key in credentialDisplayNames,
// or the key itself when no non-empty label is registered.
export function getDisplayNameForCredentialKey(key: string): string {
  const displayName = credentialDisplayNames[key];
  return displayName ? displayName : key;
}

View File

@ -37,6 +37,7 @@ import {
ColorSlackIcon,
XenforoIcon,
FreshdeskIcon,
FirefliesIcon,
} from "@/components/icons/icons";
import { ValidSources } from "./types";
import {
@ -289,6 +290,12 @@ const SOURCE_METADATA_MAP: SourceMap = {
category: SourceCategory.CustomerSupport,
docs: "https://docs.danswer.dev/connectors/freshdesk",
},
fireflies: {
icon: FirefliesIcon,
displayName: "Fireflies",
category: SourceCategory.CustomerSupport,
docs: "https://docs.danswer.dev/connectors/fireflies",
},
// currently used for the Internet Search tool docs, which is why
// a globe is used
not_applicable: {

View File

@ -265,6 +265,7 @@ const validSources = [
"not_applicable",
"ingestion_api",
"freshdesk",
"fireflies",
] as const;
export type ValidSources = (typeof validSources)[number];