mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-07-29 22:37:43 +02:00
* refactored db.connector_credential_pair * Rerfactored the db.credentials user filtering * the restr
178 lines
7.1 KiB
Python
178 lines
7.1 KiB
Python
from simple_salesforce import Salesforce
|
|
from sqlalchemy.orm import Session
|
|
|
|
from onyx.connectors.salesforce.sqlite_functions import get_user_id_by_email
|
|
from onyx.connectors.salesforce.sqlite_functions import init_db
|
|
from onyx.connectors.salesforce.sqlite_functions import NULL_ID_STRING
|
|
from onyx.connectors.salesforce.sqlite_functions import update_email_to_id_table
|
|
from onyx.db.connector_credential_pair import get_connector_credential_pair_from_id
|
|
from onyx.db.document import get_cc_pairs_for_document
|
|
from onyx.utils.logger import setup_logger
|
|
|
|
logger = setup_logger()
|
|
|
|
_ANY_SALESFORCE_CLIENT: Salesforce | None = None
|
|
|
|
|
|
def get_any_salesforce_client_for_doc_id(
|
|
db_session: Session, doc_id: str
|
|
) -> Salesforce:
|
|
"""
|
|
We create a salesforce client for the first cc_pair for the first doc_id where
|
|
salesforce censoring is enabled. After that we just cache and reuse the same
|
|
client for all queries.
|
|
|
|
We do this to reduce the number of postgres queries we make at query time.
|
|
|
|
This may be problematic if they are using multiple cc_pairs for salesforce.
|
|
E.g. there are 2 different credential sets for 2 different salesforce cc_pairs
|
|
but only one has the permissions to access the permissions needed for the query.
|
|
"""
|
|
global _ANY_SALESFORCE_CLIENT
|
|
if _ANY_SALESFORCE_CLIENT is None:
|
|
cc_pairs = get_cc_pairs_for_document(db_session, doc_id)
|
|
first_cc_pair = cc_pairs[0]
|
|
credential_json = first_cc_pair.credential.credential_json
|
|
_ANY_SALESFORCE_CLIENT = Salesforce(
|
|
username=credential_json["sf_username"],
|
|
password=credential_json["sf_password"],
|
|
security_token=credential_json["sf_security_token"],
|
|
)
|
|
return _ANY_SALESFORCE_CLIENT
|
|
|
|
|
|
def _query_salesforce_user_id(sf_client: Salesforce, user_email: str) -> str | None:
|
|
query = f"SELECT Id FROM User WHERE Email = '{user_email}'"
|
|
result = sf_client.query(query)
|
|
if len(result["records"]) == 0:
|
|
return None
|
|
return result["records"][0]["Id"]
|
|
|
|
|
|
# This contains only the user_ids that we have found in Salesforce.
|
|
# If we don't know their user_id, we don't store anything in the cache.
|
|
_CACHED_SF_EMAIL_TO_ID_MAP: dict[str, str] = {}
|
|
|
|
|
|
def get_salesforce_user_id_from_email(
|
|
sf_client: Salesforce,
|
|
user_email: str,
|
|
) -> str | None:
|
|
"""
|
|
We cache this so we don't have to query Salesforce for every query and salesforce
|
|
user IDs never change.
|
|
Memory usage is fine because we just store 2 small strings per user.
|
|
|
|
If the email is not in the cache, we check the local salesforce database for the info.
|
|
If the user is not found in the local salesforce database, we query Salesforce.
|
|
Whatever we get back from Salesforce is added to the database.
|
|
If no user_id is found, we add a NULL_ID_STRING to the database for that email so
|
|
we don't query Salesforce again (which is slow) but we still check the local salesforce
|
|
database every query until a user id is found. This is acceptable because the query time
|
|
is quite fast.
|
|
If a user_id is created in Salesforce, it will be added to the local salesforce database
|
|
next time the connector is run. Then that value will be found in this function and cached.
|
|
|
|
NOTE: First time this runs, it may be slow if it hasn't already been updated in the local
|
|
salesforce database. (Around 0.1-0.3 seconds)
|
|
If it's cached or stored in the local salesforce database, it's fast (<0.001 seconds).
|
|
"""
|
|
global _CACHED_SF_EMAIL_TO_ID_MAP
|
|
if user_email in _CACHED_SF_EMAIL_TO_ID_MAP:
|
|
if _CACHED_SF_EMAIL_TO_ID_MAP[user_email] is not None:
|
|
return _CACHED_SF_EMAIL_TO_ID_MAP[user_email]
|
|
|
|
db_exists = True
|
|
try:
|
|
# Check if the user is already in the database
|
|
user_id = get_user_id_by_email(user_email)
|
|
except Exception:
|
|
init_db()
|
|
try:
|
|
user_id = get_user_id_by_email(user_email)
|
|
except Exception as e:
|
|
logger.error(f"Error checking if user is in database: {e}")
|
|
user_id = None
|
|
db_exists = False
|
|
|
|
# If no entry is found in the database (indicated by user_id being None)...
|
|
if user_id is None:
|
|
# ...query Salesforce and store the result in the database
|
|
user_id = _query_salesforce_user_id(sf_client, user_email)
|
|
if db_exists:
|
|
update_email_to_id_table(user_email, user_id)
|
|
return user_id
|
|
elif user_id is None:
|
|
return None
|
|
elif user_id == NULL_ID_STRING:
|
|
return None
|
|
# If the found user_id is real, cache it
|
|
_CACHED_SF_EMAIL_TO_ID_MAP[user_email] = user_id
|
|
return user_id
|
|
|
|
|
|
_MAX_RECORD_IDS_PER_QUERY = 200
|
|
|
|
|
|
def get_objects_access_for_user_id(
|
|
salesforce_client: Salesforce,
|
|
user_id: str,
|
|
record_ids: list[str],
|
|
) -> dict[str, bool]:
|
|
"""
|
|
Salesforce has a limit of 200 record ids per query. So we just truncate
|
|
the list of record ids to 200. We only ever retrieve 50 chunks at a time
|
|
so this should be fine (unlikely that we retrieve all 50 chunks contain
|
|
4 unique objects).
|
|
If we decide this isn't acceptable we can use multiple queries but they
|
|
should be in parallel so query time doesn't get too long.
|
|
"""
|
|
truncated_record_ids = record_ids[:_MAX_RECORD_IDS_PER_QUERY]
|
|
record_ids_str = "'" + "','".join(truncated_record_ids) + "'"
|
|
access_query = f"""
|
|
SELECT RecordId, HasReadAccess
|
|
FROM UserRecordAccess
|
|
WHERE RecordId IN ({record_ids_str})
|
|
AND UserId = '{user_id}'
|
|
"""
|
|
result = salesforce_client.query_all(access_query)
|
|
return {record["RecordId"]: record["HasReadAccess"] for record in result["records"]}
|
|
|
|
|
|
_CC_PAIR_ID_SALESFORCE_CLIENT_MAP: dict[int, Salesforce] = {}
|
|
_DOC_ID_TO_CC_PAIR_ID_MAP: dict[str, int] = {}
|
|
|
|
|
|
# NOTE: This is not used anywhere.
|
|
def _get_salesforce_client_for_doc_id(db_session: Session, doc_id: str) -> Salesforce:
|
|
"""
|
|
Uses a document id to get the cc_pair that indexed that document and uses the credentials
|
|
for that cc_pair to create a Salesforce client.
|
|
Problems:
|
|
- There may be multiple cc_pairs for a document, and we don't know which one to use.
|
|
- right now we just use the first one
|
|
- Building a new Salesforce client for each document is slow.
|
|
- Memory usage could be an issue as we build these dictionaries.
|
|
"""
|
|
if doc_id not in _DOC_ID_TO_CC_PAIR_ID_MAP:
|
|
cc_pairs = get_cc_pairs_for_document(db_session, doc_id)
|
|
first_cc_pair = cc_pairs[0]
|
|
_DOC_ID_TO_CC_PAIR_ID_MAP[doc_id] = first_cc_pair.id
|
|
|
|
cc_pair_id = _DOC_ID_TO_CC_PAIR_ID_MAP[doc_id]
|
|
if cc_pair_id not in _CC_PAIR_ID_SALESFORCE_CLIENT_MAP:
|
|
cc_pair = get_connector_credential_pair_from_id(
|
|
db_session=db_session,
|
|
cc_pair_id=cc_pair_id,
|
|
)
|
|
if cc_pair is None:
|
|
raise ValueError(f"CC pair {cc_pair_id} not found")
|
|
credential_json = cc_pair.credential.credential_json
|
|
_CC_PAIR_ID_SALESFORCE_CLIENT_MAP[cc_pair_id] = Salesforce(
|
|
username=credential_json["sf_username"],
|
|
password=credential_json["sf_password"],
|
|
security_token=credential_json["sf_security_token"],
|
|
)
|
|
|
|
return _CC_PAIR_ID_SALESFORCE_CLIENT_MAP[cc_pair_id]
|