diff --git a/backend/alembic/versions/d61e513bef0a_add_total_docs_for_index_attempt.py b/backend/alembic/versions/d61e513bef0a_add_total_docs_for_index_attempt.py new file mode 100644 index 000000000..7cc4bb639 --- /dev/null +++ b/backend/alembic/versions/d61e513bef0a_add_total_docs_for_index_attempt.py @@ -0,0 +1,32 @@ +"""Add Total Docs for Index Attempt + +Revision ID: d61e513bef0a +Revises: 46625e4745d4 +Create Date: 2023-10-27 23:02:43.369964 + +""" +from alembic import op +import sqlalchemy as sa + +# revision identifiers, used by Alembic. +revision = "d61e513bef0a" +down_revision = "46625e4745d4" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + op.add_column( + "index_attempt", + sa.Column("new_docs_indexed", sa.Integer(), nullable=True), + ) + op.alter_column( + "index_attempt", "num_docs_indexed", new_column_name="total_docs_indexed" + ) + + +def downgrade() -> None: + op.alter_column( + "index_attempt", "total_docs_indexed", new_column_name="num_docs_indexed" + ) + op.drop_column("index_attempt", "new_docs_indexed") diff --git a/backend/danswer/background/update.py b/backend/danswer/background/update.py index 145e024d5..d680ec1bb 100755 --- a/backend/danswer/background/update.py +++ b/backend/danswer/background/update.py @@ -300,15 +300,18 @@ def _run_indexing( document_count += len(doc_batch) # commit transaction so that the `update` below begins - # with a brand new tracsaction. Postgres uses the start + # with a brand new transaction. Postgres uses the start # of the transactions when computing `NOW()`, so if we have # a long running transaction, the `time_updated` field will # be inaccurate db_session.commit() + + # This new value is updated every batch, so UI can refresh per batch update update_docs_indexed( db_session=db_session, index_attempt=attempt, - num_docs_indexed=document_count, + total_docs_indexed=document_count, + new_docs_indexed=net_doc_change, ) # check if connector is disabled mid run and stop if so diff --git a/backend/danswer/db/index_attempt.py b/backend/danswer/db/index_attempt.py index 4f21a25cb..ce67f84d4 100644 --- a/backend/danswer/db/index_attempt.py +++ b/backend/danswer/db/index_attempt.py @@ -90,9 +90,14 @@ def mark_attempt_failed( def update_docs_indexed( - db_session: Session, index_attempt: IndexAttempt, num_docs_indexed: int + db_session: Session, + index_attempt: IndexAttempt, + total_docs_indexed: int, + new_docs_indexed: int, ) -> None: - index_attempt.num_docs_indexed = num_docs_indexed + index_attempt.total_docs_indexed = total_docs_indexed + index_attempt.new_docs_indexed = new_docs_indexed + db_session.add(index_attempt) db_session.commit() diff --git a/backend/danswer/db/models.py b/backend/danswer/db/models.py index 7b9643189..e32829bfd 100644 --- a/backend/danswer/db/models.py +++ b/backend/danswer/db/models.py @@ -267,7 +267,8 @@ class IndexAttempt(Base): nullable=True, ) status: Mapped[IndexingStatus] = mapped_column(Enum(IndexingStatus)) - num_docs_indexed: Mapped[int | None] = mapped_column(Integer, default=0) + new_docs_indexed: Mapped[int | None] = mapped_column(Integer, default=0) + total_docs_indexed: Mapped[int | None] = mapped_column(Integer, default=0) error_msg: Mapped[str | None] = mapped_column( Text, default=None ) # only filled if status = "failed" diff --git a/backend/danswer/direct_qa/qa_utils.py b/backend/danswer/direct_qa/qa_utils.py index df4bdd6d5..51577c634 100644 --- a/backend/danswer/direct_qa/qa_utils.py +++ b/backend/danswer/direct_qa/qa_utils.py @@ -3,6 +3,7 @@ import math import re from collections.abc import Generator from collections.abc import Iterator +from json.decoder import JSONDecodeError from typing import cast from typing import Optional from typing import Tuple @@ -92,11 +93,18 @@ def separate_answer_quotes( try: model_raw_json = json.loads(answer_raw, strict=False) return extract_answer_quotes_json(model_raw_json) - except ValueError: - if is_json_prompt: - logger.error("Model did not output in json format as expected.") - raise - return extract_answer_quotes_freeform(answer_raw) + except JSONDecodeError: + # LLMs get confused when handling the list in the json. Sometimes it doesn't attend + # enough to the previous { token so it just ends the list of quotes and stops there + # here, we add logic to try to fix this LLM error. + try: + model_raw_json = json.loads(answer_raw + "}", strict=False) + return extract_answer_quotes_json(model_raw_json) + except JSONDecodeError: + if is_json_prompt: + logger.error("Model did not output in json format as expected.") + raise + return extract_answer_quotes_freeform(answer_raw) def match_quotes_to_docs( diff --git a/backend/danswer/document_index/vespa/index.py b/backend/danswer/document_index/vespa/index.py index 5e808d9fa..6f89d86fc 100644 --- a/backend/danswer/document_index/vespa/index.py +++ b/backend/danswer/document_index/vespa/index.py @@ -277,6 +277,13 @@ def _index_vespa_chunks( if chunk_already_existed: already_existing_documents.add(chunk.source_document.id) + # In the logic below, we check if the chunk comes from a doc that has already been + # added to already_existing_document. This works because the chunks are ordered + # and because the Document chunks are not separated into different batches. + # The first chunk is processed first and if it exists, then its entire document + # is marked as already existing, so if the document length increases and new chunks + # are added, they must come last in processing and the doc would already be in + # already existing documents. insertion_records.add( DocumentInsertionRecord( document_id=chunk.source_document.id, diff --git a/backend/danswer/server/cc_pair/__init__.py b/backend/danswer/server/cc_pair/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/backend/danswer/server/cc_pair/models.py b/backend/danswer/server/cc_pair/models.py index 4fd125684..cae06a39c 100644 --- a/backend/danswer/server/cc_pair/models.py +++ b/backend/danswer/server/cc_pair/models.py @@ -23,7 +23,7 @@ class CCPairFullInfo(BaseModel): cc_pair_model: ConnectorCredentialPair, index_attempt_models: list[IndexAttempt], latest_deletion_attempt: DeletionAttemptSnapshot | None, - num_docs_indexed: int, # not ideal, but this must be computed seperately + num_docs_indexed: int, # not ideal, but this must be computed separately ) -> "CCPairFullInfo": return cls( id=cc_pair_model.id, diff --git a/backend/danswer/server/models.py b/backend/danswer/server/models.py index f22ef3232..009e45205 100644 --- a/backend/danswer/server/models.py +++ b/backend/danswer/server/models.py @@ -311,7 +311,7 @@ class IndexAttemptRequest(BaseModel): class IndexAttemptSnapshot(BaseModel): id: int status: IndexingStatus | None - num_docs_indexed: int + new_docs_indexed: int error_msg: str | None time_started: str | None time_updated: str @@ -323,7 +323,7 @@ class IndexAttemptSnapshot(BaseModel): return IndexAttemptSnapshot( id=index_attempt.id, status=index_attempt.status, - num_docs_indexed=index_attempt.num_docs_indexed or 0, + new_docs_indexed=index_attempt.new_docs_indexed or 0, error_msg=index_attempt.error_msg, time_started=index_attempt.time_started.isoformat() if index_attempt.time_started diff --git a/web/src/app/admin/connector/[ccPairId]/IndexingAttemptsTable.tsx b/web/src/app/admin/connector/[ccPairId]/IndexingAttemptsTable.tsx index 9109f0669..b101dc4be 100644 --- a/web/src/app/admin/connector/[ccPairId]/IndexingAttemptsTable.tsx +++ b/web/src/app/admin/connector/[ccPairId]/IndexingAttemptsTable.tsx @@ -46,7 +46,7 @@ export function IndexingAttemptsTable({ ccPair }: { ccPair: CCPairFullInfo }) { size="xs" /> - {indexAttempt.num_docs_indexed} + {indexAttempt.new_docs_indexed} {indexAttempt.error_msg || "-"} diff --git a/web/src/app/admin/indexing/status/CCPairIndexingStatusTable.tsx b/web/src/app/admin/indexing/status/CCPairIndexingStatusTable.tsx index 451e4fd9d..39a30b14a 100644 --- a/web/src/app/admin/indexing/status/CCPairIndexingStatusTable.tsx +++ b/web/src/app/admin/indexing/status/CCPairIndexingStatusTable.tsx @@ -44,12 +44,12 @@ function CCPairIndexingStatusDisplay({ errorMsg={ccPairsIndexingStatus?.latest_index_attempt?.error_msg} size="xs" /> - {ccPairsIndexingStatus?.latest_index_attempt?.num_docs_indexed && + {ccPairsIndexingStatus?.latest_index_attempt?.new_docs_indexed && ccPairsIndexingStatus?.latest_index_attempt?.status === "in_progress" ? (
Current Run:{" "} - {ccPairsIndexingStatus.latest_index_attempt.num_docs_indexed} docs + {ccPairsIndexingStatus.latest_index_attempt.new_docs_indexed} docs indexed
diff --git a/web/src/lib/indexAttempt.ts b/web/src/lib/indexAttempt.ts index c5aa6a932..78fa4a379 100644 --- a/web/src/lib/indexAttempt.ts +++ b/web/src/lib/indexAttempt.ts @@ -7,7 +7,7 @@ export const getDocsProcessedPerMinute = ( !indexAttempt || !indexAttempt.time_started || !indexAttempt.time_updated || - indexAttempt.num_docs_indexed === 0 + indexAttempt.new_docs_indexed === 0 ) { return null; } @@ -22,5 +22,5 @@ export const getDocsProcessedPerMinute = ( if (seconds < 10) { return null; } - return (indexAttempt.num_docs_indexed / seconds) * 60; + return (indexAttempt.new_docs_indexed / seconds) * 60; }; diff --git a/web/src/lib/types.ts b/web/src/lib/types.ts index 2eaca8b6c..7e3afed7f 100644 --- a/web/src/lib/types.ts +++ b/web/src/lib/types.ts @@ -131,7 +131,7 @@ export interface GoogleSitesConfig { export interface IndexAttemptSnapshot { id: number; status: ValidStatuses | null; - num_docs_indexed: number; + new_docs_indexed: number; error_msg: string | null; time_started: string | null; time_updated: string;