mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-03-26 17:51:54 +01:00
DAN-136 Fix incorrect num of github docs (#112)
This commit is contained in:
parent
0cd18947ec
commit
4bddafe297
@ -87,18 +87,12 @@ def get_typesense_document_whitelists(
|
||||
def delete_typesense_doc_chunks(
    document_id: str, collection_name: str, ts_client: typesense.Client
) -> bool:
    """Delete every chunk belonging to *document_id* from a Typesense collection.

    Uses a single server-side delete-by-filter rather than searching for hits
    and deleting them one by one; the per-hit approach left the two index
    stores (Typesense vs Qdrant) with mismatched document counts.

    Args:
        document_id: The document whose chunks should be removed.
        collection_name: Name of the Typesense collection to delete from.
        ts_client: Connected Typesense client.

    Returns:
        True if at least one chunk was deleted, False otherwise.
    """
    doc_id_filter = {"filter_by": f"{DOCUMENT_ID}:'{document_id}'"}

    # Typesense doesn't seem to prioritize individual deletions; the count
    # mismatch was not seen with this bulk delete-by-filter approach.
    # Point to consider if we see instances of the number of Typesense and
    # Qdrant docs not matching.
    del_result = ts_client.collections[collection_name].documents.delete(doc_id_filter)
    return del_result["num_deleted"] != 0
|
||||
|
||||
|
||||
def index_typesense_chunks(
|
||||
|
@ -1,11 +1,9 @@
|
||||
from collections import defaultdict
|
||||
from typing import cast
|
||||
|
||||
from danswer.auth.schemas import UserRole
|
||||
from danswer.auth.users import current_admin_user
|
||||
from danswer.auth.users import current_user
|
||||
from danswer.configs.app_configs import MASK_CREDENTIAL_PREFIX
|
||||
from danswer.configs.constants import DocumentSource
|
||||
from danswer.configs.constants import OPENAI_API_KEY_STORAGE_KEY
|
||||
from danswer.connectors.file.utils import write_temp_files
|
||||
from danswer.connectors.google_drive.connector_auth import DB_CREDENTIALS_DICT_KEY
|
||||
@ -21,8 +19,6 @@ from danswer.db.connector import create_connector
|
||||
from danswer.db.connector import delete_connector
|
||||
from danswer.db.connector import fetch_connector_by_id
|
||||
from danswer.db.connector import fetch_connectors
|
||||
from danswer.db.connector import fetch_latest_index_attempt_by_connector
|
||||
from danswer.db.connector import fetch_latest_index_attempts_by_status
|
||||
from danswer.db.connector import get_connector_credential_ids
|
||||
from danswer.db.connector import update_connector
|
||||
from danswer.db.connector_credential_pair import add_credential_to_connector
|
||||
@ -37,9 +33,6 @@ from danswer.db.credentials import update_credential
|
||||
from danswer.db.engine import get_session
|
||||
from danswer.db.engine import get_sqlalchemy_async_engine
|
||||
from danswer.db.index_attempt import create_index_attempt
|
||||
from danswer.db.models import Connector
|
||||
from danswer.db.models import IndexAttempt
|
||||
from danswer.db.models import IndexingStatus
|
||||
from danswer.db.models import User
|
||||
from danswer.direct_qa.key_validation import check_openai_api_key_is_valid
|
||||
from danswer.direct_qa.question_answer import get_openai_api_key
|
||||
@ -56,7 +49,6 @@ from danswer.server.models import CredentialSnapshot
|
||||
from danswer.server.models import FileUploadResponse
|
||||
from danswer.server.models import GDriveCallback
|
||||
from danswer.server.models import GoogleAppCredentials
|
||||
from danswer.server.models import IndexAttemptSnapshot
|
||||
from danswer.server.models import ObjectCreationIdResponse
|
||||
from danswer.server.models import RunConnectorRequest
|
||||
from danswer.server.models import StatusResponse
|
||||
|
@ -177,15 +177,5 @@ class CredentialSnapshot(CredentialBase):
|
||||
time_updated: datetime
|
||||
|
||||
|
||||
class IndexAttemptSnapshot(BaseModel):
    """API-facing snapshot of a single indexing attempt.

    Mirrors the persisted attempt state (source, input type, status, the
    connector's own config blob, how many docs were indexed, and the
    created/updated timestamps) for serialization to clients.
    """

    source: DocumentSource
    input_type: InputType
    status: IndexingStatus
    connector_specific_config: dict[str, Any]
    docs_indexed: int
    time_created: datetime
    time_updated: datetime
|
||||
|
||||
|
||||
class ApiKey(BaseModel):
    """Payload wrapping a single API key string."""

    # NOTE(review): surrounding imports suggest this carries an OpenAI API
    # key, but that is not shown by this model itself — confirm at call sites.
    api_key: str
|
||||
|
@ -38,11 +38,11 @@ def _indexing_pipeline(
|
||||
net_doc_count_keyword = keyword_index.index(chunks, user_id)
|
||||
chunks_with_embeddings = embedder.embed(chunks)
|
||||
net_doc_count_vector = vector_index.index(chunks_with_embeddings, user_id)
|
||||
if net_doc_count_vector != net_doc_count_vector:
|
||||
logger.exception(
|
||||
"Document count change from keyword/vector indices don't align"
|
||||
)
|
||||
return max(net_doc_count_keyword, net_doc_count_vector)
|
||||
if net_doc_count_vector != net_doc_count_keyword:
|
||||
logger.warning("Document count change from keyword/vector indices don't align")
|
||||
net_new_docs = max(net_doc_count_keyword, net_doc_count_vector)
|
||||
logger.info(f"Indexed {net_new_docs} new documents")
|
||||
return net_new_docs
|
||||
|
||||
|
||||
def build_indexing_pipeline(
|
||||
|
Loading…
x
Reference in New Issue
Block a user