DAN-136 Fix incorrect number of GitHub docs (#112)

This commit is contained in:
Yuhong Sun
2023-06-22 01:19:55 -07:00
committed by GitHub
parent 0cd18947ec
commit 4bddafe297
4 changed files with 10 additions and 34 deletions

View File

@@ -87,18 +87,12 @@ def get_typesense_document_whitelists(
def delete_typesense_doc_chunks( def delete_typesense_doc_chunks(
document_id: str, collection_name: str, ts_client: typesense.Client document_id: str, collection_name: str, ts_client: typesense.Client
) -> bool: ) -> bool:
search_parameters = { doc_id_filter = {"filter_by": f"{DOCUMENT_ID}:'{document_id}'"}
"q": document_id,
"query_by": DOCUMENT_ID,
}
# TODO consider race conditions if running multiple processes/threads # Typesense doesn't seem to prioritize individual deletions, problem not seen with this approach
hits = ts_client.collections[collection_name].documents.search(search_parameters) # Point to consider if we see instances of number of Typesense and Qdrant docs not matching
[ del_result = ts_client.collections[collection_name].documents.delete(doc_id_filter)
ts_client.collections[collection_name].documents[hit["document"]["id"]].delete() return del_result["num_deleted"] != 0
for hit in hits["hits"]
]
return True if hits else False
def index_typesense_chunks( def index_typesense_chunks(

View File

@@ -1,11 +1,9 @@
from collections import defaultdict
from typing import cast from typing import cast
from danswer.auth.schemas import UserRole from danswer.auth.schemas import UserRole
from danswer.auth.users import current_admin_user from danswer.auth.users import current_admin_user
from danswer.auth.users import current_user from danswer.auth.users import current_user
from danswer.configs.app_configs import MASK_CREDENTIAL_PREFIX from danswer.configs.app_configs import MASK_CREDENTIAL_PREFIX
from danswer.configs.constants import DocumentSource
from danswer.configs.constants import OPENAI_API_KEY_STORAGE_KEY from danswer.configs.constants import OPENAI_API_KEY_STORAGE_KEY
from danswer.connectors.file.utils import write_temp_files from danswer.connectors.file.utils import write_temp_files
from danswer.connectors.google_drive.connector_auth import DB_CREDENTIALS_DICT_KEY from danswer.connectors.google_drive.connector_auth import DB_CREDENTIALS_DICT_KEY
@@ -21,8 +19,6 @@ from danswer.db.connector import create_connector
from danswer.db.connector import delete_connector from danswer.db.connector import delete_connector
from danswer.db.connector import fetch_connector_by_id from danswer.db.connector import fetch_connector_by_id
from danswer.db.connector import fetch_connectors from danswer.db.connector import fetch_connectors
from danswer.db.connector import fetch_latest_index_attempt_by_connector
from danswer.db.connector import fetch_latest_index_attempts_by_status
from danswer.db.connector import get_connector_credential_ids from danswer.db.connector import get_connector_credential_ids
from danswer.db.connector import update_connector from danswer.db.connector import update_connector
from danswer.db.connector_credential_pair import add_credential_to_connector from danswer.db.connector_credential_pair import add_credential_to_connector
@@ -37,9 +33,6 @@ from danswer.db.credentials import update_credential
from danswer.db.engine import get_session from danswer.db.engine import get_session
from danswer.db.engine import get_sqlalchemy_async_engine from danswer.db.engine import get_sqlalchemy_async_engine
from danswer.db.index_attempt import create_index_attempt from danswer.db.index_attempt import create_index_attempt
from danswer.db.models import Connector
from danswer.db.models import IndexAttempt
from danswer.db.models import IndexingStatus
from danswer.db.models import User from danswer.db.models import User
from danswer.direct_qa.key_validation import check_openai_api_key_is_valid from danswer.direct_qa.key_validation import check_openai_api_key_is_valid
from danswer.direct_qa.question_answer import get_openai_api_key from danswer.direct_qa.question_answer import get_openai_api_key
@@ -56,7 +49,6 @@ from danswer.server.models import CredentialSnapshot
from danswer.server.models import FileUploadResponse from danswer.server.models import FileUploadResponse
from danswer.server.models import GDriveCallback from danswer.server.models import GDriveCallback
from danswer.server.models import GoogleAppCredentials from danswer.server.models import GoogleAppCredentials
from danswer.server.models import IndexAttemptSnapshot
from danswer.server.models import ObjectCreationIdResponse from danswer.server.models import ObjectCreationIdResponse
from danswer.server.models import RunConnectorRequest from danswer.server.models import RunConnectorRequest
from danswer.server.models import StatusResponse from danswer.server.models import StatusResponse

View File

@@ -177,15 +177,5 @@ class CredentialSnapshot(CredentialBase):
time_updated: datetime time_updated: datetime
class IndexAttemptSnapshot(BaseModel):
source: DocumentSource
input_type: InputType
status: IndexingStatus
connector_specific_config: dict[str, Any]
docs_indexed: int
time_created: datetime
time_updated: datetime
class ApiKey(BaseModel): class ApiKey(BaseModel):
api_key: str api_key: str

View File

@@ -38,11 +38,11 @@ def _indexing_pipeline(
net_doc_count_keyword = keyword_index.index(chunks, user_id) net_doc_count_keyword = keyword_index.index(chunks, user_id)
chunks_with_embeddings = embedder.embed(chunks) chunks_with_embeddings = embedder.embed(chunks)
net_doc_count_vector = vector_index.index(chunks_with_embeddings, user_id) net_doc_count_vector = vector_index.index(chunks_with_embeddings, user_id)
if net_doc_count_vector != net_doc_count_vector: if net_doc_count_vector != net_doc_count_keyword:
logger.exception( logger.warning("Document count change from keyword/vector indices don't align")
"Document count change from keyword/vector indices don't align" net_new_docs = max(net_doc_count_keyword, net_doc_count_vector)
) logger.info(f"Indexed {net_new_docs} new documents")
return max(net_doc_count_keyword, net_doc_count_vector) return net_new_docs
def build_indexing_pipeline( def build_indexing_pipeline(