DAN-136 Fix incorrect num of github docs (#112)

Yuhong Sun 2023-06-22 01:19:55 -07:00 committed by GitHub
parent 0cd18947ec
commit 4bddafe297
4 changed files with 10 additions and 34 deletions


@@ -87,18 +87,12 @@ def get_typesense_document_whitelists(
def delete_typesense_doc_chunks(
    document_id: str, collection_name: str, ts_client: typesense.Client
) -> bool:
-    search_parameters = {
-        "q": document_id,
-        "query_by": DOCUMENT_ID,
-    }
+    doc_id_filter = {"filter_by": f"{DOCUMENT_ID}:'{document_id}'"}

-    # TODO consider race conditions if running multiple processes/threads
-    hits = ts_client.collections[collection_name].documents.search(search_parameters)
-    [
-        ts_client.collections[collection_name].documents[hit["document"]["id"]].delete()
-        for hit in hits["hits"]
-    ]
-    return True if hits else False
+    # Typesense doesn't seem to prioritize individual deletions, problem not seen with this approach
+    # Point to consider if we see instances of number of Typesense and Qdrant docs not matching
+    del_result = ts_client.collections[collection_name].documents.delete(doc_id_filter)
+    return del_result["num_deleted"] != 0

def index_typesense_chunks(
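For context on the change above: a plain Typesense search is typo-tolerant and paginated (10 hits per page by default), so the old search-then-delete loop could miss chunks beyond the first results page or match unrelated documents, which is a plausible source of the Typesense/Qdrant document count mismatch this commit targets. The replacement pushes the match to the server via filter_by and trusts the returned num_deleted. A minimal standalone sketch of that pattern with the typesense-python client follows; the client config, the delete_doc_chunks helper name, and the document_id field are illustrative assumptions, not code from this repo.

import typesense

# Placeholder local client, purely for illustration
ts_client = typesense.Client(
    {
        "nodes": [{"host": "localhost", "port": "8108", "protocol": "http"}],
        "api_key": "local-dev-key",
        "connection_timeout_seconds": 2,
    }
)

def delete_doc_chunks(document_id: str, collection_name: str) -> bool:
    # One server-side call deletes every record whose document_id matches the
    # filter, with no pagination or typo-tolerant matching involved
    del_result = ts_client.collections[collection_name].documents.delete(
        {"filter_by": f"document_id:'{document_id}'"}
    )
    # num_deleted counts the records actually removed, so this is True only if
    # the document really had chunks in the collection
    return del_result["num_deleted"] != 0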


@@ -1,11 +1,9 @@
from collections import defaultdict
from typing import cast
from danswer.auth.schemas import UserRole
from danswer.auth.users import current_admin_user
from danswer.auth.users import current_user
from danswer.configs.app_configs import MASK_CREDENTIAL_PREFIX
from danswer.configs.constants import DocumentSource
from danswer.configs.constants import OPENAI_API_KEY_STORAGE_KEY
from danswer.connectors.file.utils import write_temp_files
from danswer.connectors.google_drive.connector_auth import DB_CREDENTIALS_DICT_KEY
@@ -21,8 +19,6 @@ from danswer.db.connector import create_connector
from danswer.db.connector import delete_connector
from danswer.db.connector import fetch_connector_by_id
from danswer.db.connector import fetch_connectors
from danswer.db.connector import fetch_latest_index_attempt_by_connector
from danswer.db.connector import fetch_latest_index_attempts_by_status
from danswer.db.connector import get_connector_credential_ids
from danswer.db.connector import update_connector
from danswer.db.connector_credential_pair import add_credential_to_connector
@@ -37,9 +33,6 @@ from danswer.db.credentials import update_credential
from danswer.db.engine import get_session
from danswer.db.engine import get_sqlalchemy_async_engine
from danswer.db.index_attempt import create_index_attempt
from danswer.db.models import Connector
from danswer.db.models import IndexAttempt
from danswer.db.models import IndexingStatus
from danswer.db.models import User
from danswer.direct_qa.key_validation import check_openai_api_key_is_valid
from danswer.direct_qa.question_answer import get_openai_api_key
@@ -56,7 +49,6 @@ from danswer.server.models import CredentialSnapshot
from danswer.server.models import FileUploadResponse
from danswer.server.models import GDriveCallback
from danswer.server.models import GoogleAppCredentials
from danswer.server.models import IndexAttemptSnapshot
from danswer.server.models import ObjectCreationIdResponse
from danswer.server.models import RunConnectorRequest
from danswer.server.models import StatusResponse


@@ -177,15 +177,5 @@ class CredentialSnapshot(CredentialBase):
    time_updated: datetime

-class IndexAttemptSnapshot(BaseModel):
-    source: DocumentSource
-    input_type: InputType
-    status: IndexingStatus
-    connector_specific_config: dict[str, Any]
-    docs_indexed: int
-    time_created: datetime
-    time_updated: datetime

class ApiKey(BaseModel):
    api_key: str


@@ -38,11 +38,11 @@ def _indexing_pipeline(
    net_doc_count_keyword = keyword_index.index(chunks, user_id)
    chunks_with_embeddings = embedder.embed(chunks)
    net_doc_count_vector = vector_index.index(chunks_with_embeddings, user_id)
-    if net_doc_count_vector != net_doc_count_vector:
-        logger.exception(
-            "Document count change from keyword/vector indices don't align"
-        )
-    return max(net_doc_count_keyword, net_doc_count_vector)
+    if net_doc_count_vector != net_doc_count_keyword:
+        logger.warning("Document count change from keyword/vector indices don't align")
+    net_new_docs = max(net_doc_count_keyword, net_doc_count_vector)
+    logger.info(f"Indexed {net_new_docs} new documents")
+    return net_new_docs

def build_indexing_pipeline(
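On the hunk above: the old guard compared net_doc_count_vector to itself, so it could never fire, and logger.exception outside an except block would not have carried a useful traceback anyway. The fix compares the keyword and vector counts, downgrades the message to a warning, and reports the larger value as the number of net-new documents. A small illustrative sketch of that corrected reconciliation as a standalone helper (the reconcile_net_doc_counts name and bare integer inputs are assumptions for the example, not code from this repo):

import logging

logger = logging.getLogger(__name__)

def reconcile_net_doc_counts(net_doc_count_keyword: int, net_doc_count_vector: int) -> int:
    # Compare the two independent counts; the previous self-comparison made
    # this branch unreachable, so mismatches were silently ignored
    if net_doc_count_vector != net_doc_count_keyword:
        logger.warning("Document count change from keyword/vector indices don't align")
    # Treat the larger count as the number of net-new documents indexed
    net_new_docs = max(net_doc_count_keyword, net_doc_count_vector)
    logger.info(f"Indexed {net_new_docs} new documents")
    return net_new_docs

# Example: matching counts return quietly, a mismatch also logs the warning
reconcile_net_doc_counts(10, 10)  # -> 10
reconcile_net_doc_counts(10, 9)   # -> 10, plus a warning in the log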