From fe117513b06c7eebdea14da8e4bac44d8265d5e5 Mon Sep 17 00:00:00 2001
From: Yuhong Sun
Date: Sat, 28 Oct 2023 14:24:28 -0700
Subject: [PATCH] Reorganize and Cleanup for Hybrid Search (#643)

---
 backend/danswer/background/celery/celery.py   |   6 +-
 .../danswer/background/connector_deletion.py  |   4 +-
 backend/danswer/background/update.py          |   4 +-
 backend/danswer/chat/chat_llm.py              |   8 +-
 backend/danswer/chat/chat_prompts.py          |   2 +-
 backend/danswer/datastores/document_index.py  |  12 --
 backend/danswer/datastores/vespa/__init__.py  |   0
 backend/danswer/db/document.py                |   2 +-
 backend/danswer/db/feedback.py                |   4 +-
 backend/danswer/direct_qa/answer_question.py  |  10 +-
 backend/danswer/direct_qa/gpt_4_all.py        |   2 +-
 backend/danswer/direct_qa/huggingface.py      |   2 +-
 backend/danswer/direct_qa/interfaces.py       |   2 +-
 .../danswer/direct_qa/local_transformers.py   |   2 +-
 backend/danswer/direct_qa/open_ai.py          |   2 +-
 backend/danswer/direct_qa/qa_block.py         |   2 +-
 backend/danswer/direct_qa/qa_prompts.py       |   2 +-
 backend/danswer/direct_qa/qa_utils.py         |   2 +-
 backend/danswer/direct_qa/request_model.py    |   2 +-
 backend/danswer/document_index/__init__.py    |   7 +
 .../document_index_utils.py}                  |   4 +-
 .../interfaces.py                             |   4 +-
 .../vespa}/__init__.py                        |   0
 .../vespa/app_config/schemas/danswer_chunk.sd |   9 +-
 .../vespa/app_config/services.xml             |   0
 .../vespa/index.py}                           |  20 +--
 .../vespa/utils.py                            |   0
 .../{datastores => indexing}/__init__.py      |   0
 .../chunk.py => indexing/chunker.py}          |   4 +-
 backend/danswer/indexing/embedder.py          |  77 +++++++++
 .../indexing_pipeline.py                      |  16 +-
 backend/danswer/{chunking => indexing}/models.py | 0
 backend/danswer/main.py                       |   4 +-
 backend/danswer/search/danswer_helper.py      |   8 +-
 backend/danswer/search/keyword_search.py      |  79 ---------
 backend/danswer/search/models.py              |   9 +-
 .../{search_utils.py => search_nlp_models.py} |   0
 .../{semantic_search.py => search_runner.py}  | 150 +++++++++---------
 backend/danswer/server/models.py              |   3 +-
 backend/danswer/server/search_backend.py      |  87 ++++++++--
 backend/danswer/utils/acl.py                  |   6 +-
 backend/scripts/reset_indexes.py              |   2 +-
 backend/scripts/save_load_state.py            |   2 +-
 .../unit/danswer/direct_qa/test_qa_utils.py   |   2 +-
 44 files changed, 308 insertions(+), 255 deletions(-)
 delete mode 100644 backend/danswer/datastores/document_index.py
 delete mode 100644 backend/danswer/datastores/vespa/__init__.py
 create mode 100644 backend/danswer/document_index/__init__.py
 rename backend/danswer/{datastores/datastore_utils.py => document_index/document_index_utils.py} (92%)
 rename backend/danswer/{datastores => document_index}/interfaces.py (97%)
 rename backend/danswer/{chunking => document_index/vespa}/__init__.py (100%)
 rename backend/danswer/{datastores => document_index}/vespa/app_config/schemas/danswer_chunk.sd (95%)
 rename backend/danswer/{datastores => document_index}/vespa/app_config/services.xml (100%)
 rename backend/danswer/{datastores/vespa/store.py => document_index/vespa/index.py} (97%)
 rename backend/danswer/{datastores => document_index}/vespa/utils.py (100%)
 rename backend/danswer/{datastores => indexing}/__init__.py (100%)
 rename backend/danswer/{chunking/chunk.py => indexing/chunker.py} (97%)
 create mode 100644 backend/danswer/indexing/embedder.py
 rename backend/danswer/{datastores => indexing}/indexing_pipeline.py (91%)
 rename backend/danswer/{chunking => indexing}/models.py (100%)
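The headline functional change sits in danswer_chunk.sd: the previously unused hybrid_search rank profile is reworked so the first phase ranks by pure vector closeness, and a new global phase re-ranks the top 1000 hits by fusing normalized vector and BM25 scores, scaled by document boost and recency. Roughly, per hit, the fused score looks like the following Python sketch (illustrative only; normalize_linear is assumed here to behave like Vespa's documented min-max rescaling over the rerank window):

    # Sketch of the new global-phase expression, not code from this patch.
    def normalize_linear(scores: list[float]) -> list[float]:
        # Min-max rescale into [0, 1] across the candidate window
        lo, hi = min(scores), max(scores)
        if hi == lo:
            return [0.0 for _ in scores]
        return [(s - lo) / (hi - lo) for s in scores]

    def hybrid_scores(
        closeness: list[float],       # per-hit closeness(field, embeddings)
        bm25: list[float],            # per-hit bm25(content)
        document_boost: list[float],  # per-hit boost multiplier
        recency_bias: list[float],    # per-hit recency multiplier
    ) -> list[float]:
        sem = normalize_linear(closeness)
        kw = normalize_linear(bm25)
        return [
            (s + k) * b * r
            for s, k, b, r in zip(sem, kw, document_boost, recency_bias)
        ]

Normalizing both signals first matters because raw BM25 scores and cosine-style closeness live on very different scales; without the rescale one signal would dominate the sum.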
diff --git a/backend/danswer/background/celery/celery.py b/backend/danswer/background/celery/celery.py
index 4aa16a2f0d9..07d09afdc8d 100644
--- a/backend/danswer/background/celery/celery.py
+++ b/backend/danswer/background/celery/celery.py
@@ -13,9 +13,6 @@ from danswer.background.task_utils import name_document_set_sync_task
 from danswer.configs.app_configs import FILE_CONNECTOR_TMP_STORAGE_PATH
 from danswer.configs.app_configs import JOB_TIMEOUT
 from danswer.connectors.file.utils import file_age_in_hours
-from danswer.datastores.document_index import get_default_document_index
-from danswer.datastores.interfaces import DocumentIndex
-from danswer.datastores.interfaces import UpdateRequest
 from danswer.db.connector_credential_pair import get_connector_credential_pair
 from danswer.db.deletion_attempt import check_deletion_attempt_is_allowed
 from danswer.db.document import prepare_to_modify_documents
@@ -31,6 +28,9 @@ from danswer.db.engine import SYNC_DB_API
 from danswer.db.models import DocumentSet
 from danswer.db.tasks import check_live_task_not_timed_out
 from danswer.db.tasks import get_latest_task
+from danswer.document_index import get_default_document_index
+from danswer.document_index.interfaces import DocumentIndex
+from danswer.document_index.interfaces import UpdateRequest
 from danswer.utils.batching import batch_generator
 from danswer.utils.logger import setup_logger
diff --git a/backend/danswer/background/connector_deletion.py b/backend/danswer/background/connector_deletion.py
index ae2cfec1843..e442bedd87d 100644
--- a/backend/danswer/background/connector_deletion.py
+++ b/backend/danswer/background/connector_deletion.py
@@ -17,8 +17,6 @@ from typing import cast
 from sqlalchemy.orm import Session
 
 from danswer.access.access import get_access_for_documents
-from danswer.datastores.interfaces import DocumentIndex
-from danswer.datastores.interfaces import UpdateRequest
 from danswer.db.connector import fetch_connector_by_id
 from danswer.db.connector_credential_pair import (
     delete_connector_credential_pair__no_commit,
@@ -35,6 +33,8 @@ from danswer.db.document_set import (
 from danswer.db.engine import get_sqlalchemy_engine
 from danswer.db.index_attempt import delete_index_attempts
 from danswer.db.models import ConnectorCredentialPair
+from danswer.document_index.interfaces import DocumentIndex
+from danswer.document_index.interfaces import UpdateRequest
 from danswer.server.models import ConnectorCredentialPairIdentifier
 from danswer.utils.logger import setup_logger
 from danswer.utils.variable_functionality import fetch_versioned_implementation
diff --git a/backend/danswer/background/update.py b/backend/danswer/background/update.py
index ea610c00d36..ca4a4ecc2a1 100755
--- a/backend/danswer/background/update.py
+++ b/backend/danswer/background/update.py
@@ -17,7 +17,6 @@ from danswer.connectors.interfaces import LoadConnector
 from danswer.connectors.interfaces import PollConnector
 from danswer.connectors.models import IndexAttemptMetadata
 from danswer.connectors.models import InputType
-from danswer.datastores.indexing_pipeline import build_indexing_pipeline
 from danswer.db.connector import disable_connector
 from danswer.db.connector import fetch_connectors
 from danswer.db.connector_credential_pair import get_last_successful_attempt_time
@@ -38,7 +37,8 @@ from danswer.db.index_attempt import update_docs_indexed
 from danswer.db.models import Connector
 from danswer.db.models import IndexAttempt
 from danswer.db.models import IndexingStatus
-from danswer.search.search_utils import warm_up_models
+from danswer.indexing.indexing_pipeline import build_indexing_pipeline
+from danswer.search.search_nlp_models import warm_up_models
 from danswer.utils.logger import IndexAttemptSingleton
 from danswer.utils.logger import setup_logger
diff --git a/backend/danswer/chat/chat_llm.py b/backend/danswer/chat/chat_llm.py
index 3216210a85c..ab4cda9a03b 100644
--- a/backend/danswer/chat/chat_llm.py
+++ b/backend/danswer/chat/chat_llm.py
@@ -20,12 +20,10 @@ from danswer.chat.chat_prompts import REQUIRE_DANSWER_SYSTEM_MSG
 from danswer.chat.chat_prompts import YES_SEARCH
 from danswer.chat.personas import build_system_text_from_persona
 from danswer.chat.tools import call_tool
-from danswer.chunking.models import InferenceChunk
 from danswer.configs.app_configs import NUM_DOCUMENT_TOKENS_FED_TO_CHAT
 from danswer.configs.chat_configs import FORCE_TOOL_PROMPT
 from danswer.configs.constants import IGNORE_FOR_QA
 from danswer.configs.model_configs import GEN_AI_MAX_INPUT_TOKENS
-from danswer.datastores.document_index import get_default_document_index
 from danswer.db.models import ChatMessage
 from danswer.db.models import Persona
 from danswer.db.models import User
@@ -33,13 +31,15 @@ from danswer.direct_qa.interfaces import DanswerAnswerPiece
 from danswer.direct_qa.interfaces import DanswerChatModelOut
 from danswer.direct_qa.interfaces import StreamingError
 from danswer.direct_qa.qa_utils import get_usable_chunks
+from danswer.document_index import get_default_document_index
+from danswer.indexing.models import InferenceChunk
 from danswer.llm.build import get_default_llm
 from danswer.llm.llm import LLM
 from danswer.llm.utils import get_default_llm_tokenizer
 from danswer.llm.utils import translate_danswer_msg_to_langchain
 from danswer.search.access_filters import build_access_filters_for_user
-from danswer.search.semantic_search import chunks_to_search_docs
-from danswer.search.semantic_search import retrieve_ranked_documents
+from danswer.search.search_runner import chunks_to_search_docs
+from danswer.search.search_runner import retrieve_ranked_documents
 from danswer.server.models import IndexFilters
 from danswer.server.models import RetrievalDocs
 from danswer.utils.logger import setup_logger
diff --git a/backend/danswer/chat/chat_prompts.py b/backend/danswer/chat/chat_prompts.py
index f21725df3d1..2dfc18552f6 100644
--- a/backend/danswer/chat/chat_prompts.py
+++ b/backend/danswer/chat/chat_prompts.py
@@ -2,11 +2,11 @@ from langchain.schema.messages import BaseMessage
 from langchain.schema.messages import HumanMessage
 from langchain.schema.messages import SystemMessage
 
-from danswer.chunking.models import InferenceChunk
 from danswer.configs.constants import CODE_BLOCK_PAT
 from danswer.configs.constants import MessageType
 from danswer.db.models import ChatMessage
 from danswer.db.models import ToolInfo
+from danswer.indexing.models import InferenceChunk
 from danswer.llm.utils import translate_danswer_msg_to_langchain
 
 DANSWER_TOOL_NAME = "Current Search"
diff --git a/backend/danswer/datastores/document_index.py b/backend/danswer/datastores/document_index.py
deleted file mode 100644
index 480689e7c1a..00000000000
--- a/backend/danswer/datastores/document_index.py
+++ /dev/null
@@ -1,12 +0,0 @@
-from danswer.datastores.interfaces import DocumentIndex
-from danswer.datastores.vespa.store import VespaIndex
-from danswer.utils.logger import setup_logger
-
-logger = setup_logger()
-
-
-def get_default_document_index() -> DocumentIndex:
-    # Currently only supporting Vespa
types with multiple - # Search-Engines / VectorDBs was too much overhead - return VespaIndex() diff --git a/backend/danswer/datastores/vespa/__init__.py b/backend/danswer/datastores/vespa/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/backend/danswer/db/document.py b/backend/danswer/db/document.py index 5a3dbb98d39..745e8411230 100644 --- a/backend/danswer/db/document.py +++ b/backend/danswer/db/document.py @@ -11,13 +11,13 @@ from sqlalchemy.dialects.postgresql import insert from sqlalchemy.orm import Session from danswer.configs.constants import DEFAULT_BOOST -from danswer.datastores.interfaces import DocumentMetadata from danswer.db.feedback import delete_document_feedback_for_documents from danswer.db.models import ConnectorCredentialPair from danswer.db.models import Credential from danswer.db.models import Document as DbDocument from danswer.db.models import DocumentByConnectorCredentialPair from danswer.db.utils import model_to_dict +from danswer.document_index.interfaces import DocumentMetadata from danswer.server.models import ConnectorCredentialPairIdentifier from danswer.utils.logger import setup_logger diff --git a/backend/danswer/db/feedback.py b/backend/danswer/db/feedback.py index 58db9a71188..cd02885baae 100644 --- a/backend/danswer/db/feedback.py +++ b/backend/danswer/db/feedback.py @@ -10,13 +10,13 @@ from sqlalchemy.orm import Session from danswer.configs.constants import MessageType from danswer.configs.constants import QAFeedbackType from danswer.configs.constants import SearchFeedbackType -from danswer.datastores.document_index import get_default_document_index -from danswer.datastores.interfaces import UpdateRequest from danswer.db.models import ChatMessage as DbChatMessage from danswer.db.models import ChatMessageFeedback from danswer.db.models import Document as DbDocument from danswer.db.models import DocumentRetrievalFeedback from danswer.db.models import QueryEvent +from danswer.document_index import get_default_document_index +from danswer.document_index.interfaces import UpdateRequest from danswer.search.models import SearchType diff --git a/backend/danswer/direct_qa/answer_question.py b/backend/danswer/direct_qa/answer_question.py index d559dd65152..8a5d03f950a 100644 --- a/backend/danswer/direct_qa/answer_question.py +++ b/backend/danswer/direct_qa/answer_question.py @@ -2,12 +2,10 @@ from collections.abc import Callable from sqlalchemy.orm import Session -from danswer.chunking.models import InferenceChunk from danswer.configs.app_configs import DISABLE_GENERATIVE_AI from danswer.configs.app_configs import NUM_DOCUMENT_TOKENS_FED_TO_GENERATIVE_MODEL from danswer.configs.app_configs import QA_TIMEOUT from danswer.configs.constants import IGNORE_FOR_QA -from danswer.datastores.document_index import get_default_document_index from danswer.db.feedback import create_query_event from danswer.db.feedback import update_query_event_retrieved_documents from danswer.db.models import User @@ -16,15 +14,17 @@ from danswer.direct_qa.exceptions import UnknownModelError from danswer.direct_qa.llm_utils import get_default_qa_model from danswer.direct_qa.models import LLMMetricsContainer from danswer.direct_qa.qa_utils import get_usable_chunks +from danswer.document_index import get_default_document_index +from danswer.indexing.models import InferenceChunk from danswer.search.access_filters import build_access_filters_for_user from danswer.search.danswer_helper import query_intent -from danswer.search.keyword_search import 
-from danswer.search.keyword_search import retrieve_keyword_documents
 from danswer.search.models import QueryFlow
 from danswer.search.models import RerankMetricsContainer
 from danswer.search.models import RetrievalMetricsContainer
 from danswer.search.models import SearchType
-from danswer.search.semantic_search import chunks_to_search_docs
-from danswer.search.semantic_search import retrieve_ranked_documents
+from danswer.search.search_runner import chunks_to_search_docs
+from danswer.search.search_runner import retrieve_keyword_documents
+from danswer.search.search_runner import retrieve_ranked_documents
 from danswer.secondary_llm_flows.answer_validation import get_answer_validity
 from danswer.secondary_llm_flows.extract_filters import extract_question_time_filters
 from danswer.server.models import IndexFilters
diff --git a/backend/danswer/direct_qa/gpt_4_all.py b/backend/danswer/direct_qa/gpt_4_all.py
index 1840381d0cd..cb12883f035 100644
--- a/backend/danswer/direct_qa/gpt_4_all.py
+++ b/backend/danswer/direct_qa/gpt_4_all.py
@@ -1,7 +1,6 @@
 from collections.abc import Callable
 from typing import Any
 
-from danswer.chunking.models import InferenceChunk
 from danswer.configs.model_configs import GEN_AI_MAX_OUTPUT_TOKENS
 from danswer.configs.model_configs import GEN_AI_MODEL_VERSION
 from danswer.direct_qa.interfaces import AnswerQuestionReturn
@@ -14,6 +13,7 @@ from danswer.direct_qa.qa_prompts import WeakChatModelFreeformProcessor
 from danswer.direct_qa.qa_prompts import WeakModelFreeformProcessor
 from danswer.direct_qa.qa_utils import process_answer
 from danswer.direct_qa.qa_utils import process_model_tokens
+from danswer.indexing.models import InferenceChunk
 from danswer.utils.logger import setup_logger
 from danswer.utils.timing import log_function_time
diff --git a/backend/danswer/direct_qa/huggingface.py b/backend/danswer/direct_qa/huggingface.py
index bfb292cf6df..3f6168b0a3e 100644
--- a/backend/danswer/direct_qa/huggingface.py
+++ b/backend/danswer/direct_qa/huggingface.py
@@ -4,7 +4,6 @@ from typing import Any
 from huggingface_hub import InferenceClient  # type:ignore
 from huggingface_hub.utils import HfHubHTTPError  # type:ignore
 
-from danswer.chunking.models import InferenceChunk
 from danswer.configs.model_configs import GEN_AI_MAX_OUTPUT_TOKENS
 from danswer.configs.model_configs import GEN_AI_MODEL_VERSION
 from danswer.direct_qa.interfaces import AnswerQuestionReturn
@@ -18,6 +17,7 @@ from danswer.direct_qa.qa_prompts import NonChatPromptProcessor
 from danswer.direct_qa.qa_utils import process_answer
 from danswer.direct_qa.qa_utils import process_model_tokens
 from danswer.direct_qa.qa_utils import simulate_streaming_response
+from danswer.indexing.models import InferenceChunk
 from danswer.utils.logger import setup_logger
 from danswer.utils.timing import log_function_time
diff --git a/backend/danswer/direct_qa/interfaces.py b/backend/danswer/direct_qa/interfaces.py
index 1365e1de06c..688d0f002bd 100644
--- a/backend/danswer/direct_qa/interfaces.py
+++ b/backend/danswer/direct_qa/interfaces.py
@@ -4,8 +4,8 @@ from collections.abc import Iterator
 
 from pydantic import BaseModel
 
-from danswer.chunking.models import InferenceChunk
 from danswer.direct_qa.models import LLMMetricsContainer
+from danswer.indexing.models import InferenceChunk
 
 
 class StreamingError(BaseModel):
diff --git a/backend/danswer/direct_qa/local_transformers.py b/backend/danswer/direct_qa/local_transformers.py
index afee49985bf..f0102f863b6 100644
--- a/backend/danswer/direct_qa/local_transformers.py
+++ b/backend/danswer/direct_qa/local_transformers.py
@@ -4,7 +4,6 @@ from collections.abc import Callable
 from transformers import pipeline  # type:ignore
 from transformers import QuestionAnsweringPipeline  # type:ignore
 
-from danswer.chunking.models import InferenceChunk
 from danswer.configs.model_configs import GEN_AI_MODEL_VERSION
 from danswer.direct_qa.interfaces import AnswerQuestionReturn
 from danswer.direct_qa.interfaces import AnswerQuestionStreamReturn
@@ -14,6 +13,7 @@ from danswer.direct_qa.interfaces import DanswerQuote
 from danswer.direct_qa.interfaces import DanswerQuotes
 from danswer.direct_qa.interfaces import QAModel
 from danswer.direct_qa.models import LLMMetricsContainer
+from danswer.indexing.models import InferenceChunk
 from danswer.utils.logger import setup_logger
 from danswer.utils.timing import log_function_time
diff --git a/backend/danswer/direct_qa/open_ai.py b/backend/danswer/direct_qa/open_ai.py
index 2c2dc1526ec..f8e8bbca87f 100644
--- a/backend/danswer/direct_qa/open_ai.py
+++ b/backend/danswer/direct_qa/open_ai.py
@@ -12,7 +12,6 @@ import tiktoken
 from openai.error import AuthenticationError
 from openai.error import Timeout
 
-from danswer.chunking.models import InferenceChunk
 from danswer.configs.app_configs import INCLUDE_METADATA
 from danswer.configs.model_configs import API_BASE_OPENAI
 from danswer.configs.model_configs import API_TYPE_OPENAI
@@ -31,6 +30,7 @@ from danswer.direct_qa.qa_utils import get_gen_ai_api_key
 from danswer.direct_qa.qa_utils import process_answer
 from danswer.direct_qa.qa_utils import process_model_tokens
 from danswer.dynamic_configs.interface import ConfigNotFoundError
+from danswer.indexing.models import InferenceChunk
 from danswer.utils.logger import setup_logger
 from danswer.utils.timing import log_function_time
diff --git a/backend/danswer/direct_qa/qa_block.py b/backend/danswer/direct_qa/qa_block.py
index 931eac2aa7f..4d9f5b3b00f 100644
--- a/backend/danswer/direct_qa/qa_block.py
+++ b/backend/danswer/direct_qa/qa_block.py
@@ -11,7 +11,6 @@ from langchain.schema.messages import BaseMessage
 from langchain.schema.messages import HumanMessage
 from langchain.schema.messages import SystemMessage
 
-from danswer.chunking.models import InferenceChunk
 from danswer.configs.constants import CODE_BLOCK_PAT
 from danswer.configs.constants import GENERAL_SEP_PAT
 from danswer.configs.constants import QUESTION_PAT
@@ -28,6 +27,7 @@ from danswer.direct_qa.qa_prompts import JsonChatProcessor
 from danswer.direct_qa.qa_prompts import WeakModelFreeformProcessor
 from danswer.direct_qa.qa_utils import process_answer
 from danswer.direct_qa.qa_utils import process_model_tokens
+from danswer.indexing.models import InferenceChunk
 from danswer.llm.llm import LLM
 from danswer.llm.utils import check_number_of_tokens
 from danswer.llm.utils import dict_based_prompt_to_langchain_prompt
diff --git a/backend/danswer/direct_qa/qa_prompts.py b/backend/danswer/direct_qa/qa_prompts.py
index aa7d5d01d55..4439afea9b1 100644
--- a/backend/danswer/direct_qa/qa_prompts.py
+++ b/backend/danswer/direct_qa/qa_prompts.py
@@ -1,7 +1,6 @@
 import abc
 import json
 
-from danswer.chunking.models import InferenceChunk
 from danswer.configs.constants import ANSWER_PAT
 from danswer.configs.constants import DOC_CONTENT_START_PAT
 from danswer.configs.constants import DOC_SEP_PAT
@@ -11,6 +10,7 @@ from danswer.configs.constants import QUESTION_PAT
 from danswer.configs.constants import QUOTE_PAT
 from danswer.configs.constants import UNCERTAINTY_PAT
 from danswer.connectors.factory import identify_connector_class
+from danswer.indexing.models import InferenceChunk
 
 
 BASE_PROMPT = (
diff --git a/backend/danswer/direct_qa/qa_utils.py b/backend/danswer/direct_qa/qa_utils.py
index 4fd5f58419e..df4bdd6d56d 100644
--- a/backend/danswer/direct_qa/qa_utils.py
+++ b/backend/danswer/direct_qa/qa_utils.py
@@ -9,7 +9,6 @@ from typing import Tuple
 
 import regex
 
-from danswer.chunking.models import InferenceChunk
 from danswer.configs.app_configs import NUM_DOCUMENT_TOKENS_FED_TO_GENERATIVE_MODEL
 from danswer.configs.app_configs import QUOTE_ALLOWED_ERROR_PERCENT
 from danswer.configs.constants import GEN_AI_API_KEY_STORAGE_KEY
@@ -23,6 +22,7 @@ from danswer.direct_qa.qa_prompts import QUOTE_PAT
 from danswer.direct_qa.qa_prompts import UNCERTAINTY_PAT
 from danswer.dynamic_configs import get_dynamic_config_store
 from danswer.dynamic_configs.interface import ConfigNotFoundError
+from danswer.indexing.models import InferenceChunk
 from danswer.llm.utils import check_number_of_tokens
 from danswer.utils.logger import setup_logger
 from danswer.utils.text_processing import clean_model_quote
diff --git a/backend/danswer/direct_qa/request_model.py b/backend/danswer/direct_qa/request_model.py
index 51d29a9e389..32f98123492 100644
--- a/backend/danswer/direct_qa/request_model.py
+++ b/backend/danswer/direct_qa/request_model.py
@@ -7,7 +7,6 @@ import requests
 from requests.exceptions import Timeout
 from requests.models import Response
 
-from danswer.chunking.models import InferenceChunk
 from danswer.configs.constants import ModelHostType
 from danswer.configs.model_configs import GEN_AI_API_KEY
 from danswer.configs.model_configs import GEN_AI_ENDPOINT
@@ -22,6 +21,7 @@ from danswer.direct_qa.qa_prompts import NonChatPromptProcessor
 from danswer.direct_qa.qa_utils import process_answer
 from danswer.direct_qa.qa_utils import process_model_tokens
 from danswer.direct_qa.qa_utils import simulate_streaming_response
+from danswer.indexing.models import InferenceChunk
 from danswer.utils.logger import setup_logger
 from danswer.utils.timing import log_function_time
diff --git a/backend/danswer/document_index/__init__.py b/backend/danswer/document_index/__init__.py
new file mode 100644
index 00000000000..40a44cf6aa3
--- /dev/null
+++ b/backend/danswer/document_index/__init__.py
@@ -0,0 +1,7 @@
+from danswer.document_index.interfaces import DocumentIndex
+from danswer.document_index.vespa.index import VespaIndex
+
+
+def get_default_document_index() -> DocumentIndex:
+    # Currently only supporting Vespa
+    return VespaIndex()
diff --git a/backend/danswer/datastores/datastore_utils.py b/backend/danswer/document_index/document_index_utils.py
similarity index 92%
rename from backend/danswer/datastores/datastore_utils.py
rename to backend/danswer/document_index/document_index_utils.py
index c9565db5c7d..47986aa5bf5 100644
--- a/backend/danswer/datastores/datastore_utils.py
+++ b/backend/danswer/document_index/document_index_utils.py
@@ -1,8 +1,8 @@
 import math
 import uuid
 
-from danswer.chunking.models import IndexChunk
-from danswer.chunking.models import InferenceChunk
+from danswer.indexing.models import IndexChunk
+from danswer.indexing.models import InferenceChunk
 
 
 DEFAULT_BATCH_SIZE = 30
diff --git a/backend/danswer/datastores/interfaces.py b/backend/danswer/document_index/interfaces.py
similarity index 97%
rename from backend/danswer/datastores/interfaces.py
rename to backend/danswer/document_index/interfaces.py
index d3f77dea9e0..473e75788b8 100644
--- a/backend/danswer/datastores/interfaces.py
+++ b/backend/danswer/document_index/interfaces.py
@@ -4,9 +4,9 @@ from datetime import datetime
 from typing import Any
 
 from danswer.access.models import DocumentAccess
-from danswer.chunking.models import DocMetadataAwareIndexChunk
-from danswer.chunking.models import InferenceChunk
 from danswer.configs.model_configs import SEARCH_DISTANCE_CUTOFF
+from danswer.indexing.models import DocMetadataAwareIndexChunk
+from danswer.indexing.models import InferenceChunk
 from danswer.server.models import IndexFilters
 
diff --git a/backend/danswer/chunking/__init__.py b/backend/danswer/document_index/vespa/__init__.py
similarity index 100%
rename from backend/danswer/chunking/__init__.py
rename to backend/danswer/document_index/vespa/__init__.py
diff --git a/backend/danswer/datastores/vespa/app_config/schemas/danswer_chunk.sd b/backend/danswer/document_index/vespa/app_config/schemas/danswer_chunk.sd
similarity index 95%
rename from backend/danswer/datastores/vespa/app_config/schemas/danswer_chunk.sd
rename to backend/danswer/document_index/vespa/app_config/schemas/danswer_chunk.sd
index 5c4f49e2618..aed8184dc95 100644
--- a/backend/danswer/datastores/vespa/app_config/schemas/danswer_chunk.sd
+++ b/backend/danswer/document_index/vespa/app_config/schemas/danswer_chunk.sd
@@ -138,19 +138,18 @@ schema danswer_chunk {
         match-features: recency_bias closest(embeddings)
     }
 
-    # TODO this isn't used and needs to be reworked
     rank-profile hybrid_search inherits default, default_rank {
         inputs {
            query(query_embedding) tensor<float>(x[384])
        }

        first-phase {
-            expression: bm25(content) * document_boost * recency_bias
+            expression: closeness(field, embeddings)
        }

-        second-phase {
-            # Cannot do boost with the chosen embedding model because of high default similarity
-            expression: closeness(field, embeddings)
+        global-phase {
+            expression: (normalize_linear(closeness(field, embeddings)) + normalize_linear(bm25(content))) * document_boost * recency_bias
+            rerank-count: 1000
        }

        match-features: recency_bias closest(embeddings)
diff --git a/backend/danswer/datastores/vespa/app_config/services.xml b/backend/danswer/document_index/vespa/app_config/services.xml
similarity index 100%
rename from backend/danswer/datastores/vespa/app_config/services.xml
rename to backend/danswer/document_index/vespa/app_config/services.xml
diff --git a/backend/danswer/datastores/vespa/store.py b/backend/danswer/document_index/vespa/index.py
similarity index 97%
rename from backend/danswer/datastores/vespa/store.py
rename to backend/danswer/document_index/vespa/index.py
index 4bb14c6a09d..97268c9a894 100644
--- a/backend/danswer/datastores/vespa/store.py
+++ b/backend/danswer/document_index/vespa/index.py
@@ -15,8 +15,6 @@ from requests import HTTPError
 from requests import Response
 from retry import retry
 
-from danswer.chunking.models import DocMetadataAwareIndexChunk
-from danswer.chunking.models import InferenceChunk
 from danswer.configs.app_configs import DOC_TIME_DECAY
 from danswer.configs.app_configs import DOCUMENT_INDEX_NAME
 from danswer.configs.app_configs import EDIT_KEYWORD_QUERY
@@ -47,14 +45,16 @@ from danswer.configs.constants import SOURCE_LINKS
 from danswer.configs.constants import SOURCE_TYPE
 from danswer.configs.constants import TITLE
 from danswer.configs.model_configs import SEARCH_DISTANCE_CUTOFF
-from danswer.datastores.datastore_utils import get_uuid_from_chunk
-from danswer.datastores.interfaces import DocumentIndex
-from danswer.datastores.interfaces import DocumentInsertionRecord
-from danswer.datastores.interfaces import IndexFilters
-from danswer.datastores.interfaces import UpdateRequest
-from danswer.datastores.vespa.utils import remove_invalid_unicode_chars
-from danswer.search.keyword_search import remove_stop_words
-from danswer.search.semantic_search import embed_query
+from danswer.document_index.document_index_utils import get_uuid_from_chunk
+from danswer.document_index.interfaces import DocumentIndex
+from danswer.document_index.interfaces import DocumentInsertionRecord
+from danswer.document_index.interfaces import IndexFilters
+from danswer.document_index.interfaces import UpdateRequest
+from danswer.document_index.vespa.utils import remove_invalid_unicode_chars
+from danswer.indexing.models import DocMetadataAwareIndexChunk
+from danswer.indexing.models import InferenceChunk
+from danswer.search.search_runner import embed_query
+from danswer.search.search_runner import remove_stop_words
 from danswer.utils.batching import batch_generator
 from danswer.utils.logger import setup_logger
diff --git a/backend/danswer/datastores/vespa/utils.py b/backend/danswer/document_index/vespa/utils.py
similarity index 100%
rename from backend/danswer/datastores/vespa/utils.py
rename to backend/danswer/document_index/vespa/utils.py
diff --git a/backend/danswer/datastores/__init__.py b/backend/danswer/indexing/__init__.py
similarity index 100%
rename from backend/danswer/datastores/__init__.py
rename to backend/danswer/indexing/__init__.py
diff --git a/backend/danswer/chunking/chunk.py b/backend/danswer/indexing/chunker.py
similarity index 97%
rename from backend/danswer/chunking/chunk.py
rename to backend/danswer/indexing/chunker.py
index 3dad17f8371..c3d5b6e72d2 100644
--- a/backend/danswer/chunking/chunk.py
+++ b/backend/danswer/indexing/chunker.py
@@ -4,14 +4,14 @@ from collections.abc import Callable
 from llama_index.text_splitter import SentenceSplitter
 from transformers import AutoTokenizer  # type:ignore
 
-from danswer.chunking.models import DocAwareChunk
 from danswer.configs.app_configs import BLURB_SIZE
 from danswer.configs.app_configs import CHUNK_OVERLAP
 from danswer.configs.app_configs import CHUNK_SIZE
 from danswer.configs.app_configs import MINI_CHUNK_SIZE
 from danswer.connectors.models import Document
 from danswer.connectors.models import Section
-from danswer.search.search_utils import get_default_tokenizer
+from danswer.indexing.models import DocAwareChunk
+from danswer.search.search_nlp_models import get_default_tokenizer
 from danswer.utils.text_processing import shared_precompare_cleanup
 
diff --git a/backend/danswer/indexing/embedder.py b/backend/danswer/indexing/embedder.py
new file mode 100644
index 00000000000..8aa3471f6b3
--- /dev/null
+++ b/backend/danswer/indexing/embedder.py
@@ -0,0 +1,77 @@
+import numpy
+from sentence_transformers import SentenceTransformer  # type: ignore
+
+from danswer.configs.app_configs import ENABLE_MINI_CHUNK
+from danswer.configs.model_configs import ASYM_PASSAGE_PREFIX
+from danswer.configs.model_configs import BATCH_SIZE_ENCODE_CHUNKS
+from danswer.configs.model_configs import NORMALIZE_EMBEDDINGS
+from danswer.indexing.chunker import split_chunk_text_into_mini_chunks
+from danswer.indexing.models import ChunkEmbedding
+from danswer.indexing.models import DocAwareChunk
+from danswer.indexing.models import IndexChunk
+from danswer.search.models import Embedder
+from danswer.search.search_nlp_models import get_default_embedding_model
+from danswer.utils.timing import log_function_time
+
+
+@log_function_time()
+def encode_chunks(
+    chunks: list[DocAwareChunk],
+    embedding_model: SentenceTransformer | None = None,
+    batch_size: int = BATCH_SIZE_ENCODE_CHUNKS,
+    enable_mini_chunk: bool = ENABLE_MINI_CHUNK,
+    passage_prefix: str = ASYM_PASSAGE_PREFIX,
+) -> list[IndexChunk]:
+    embedded_chunks: list[IndexChunk] = []
+    if embedding_model is None:
+        embedding_model = get_default_embedding_model()
+
+    chunk_texts = []
+    chunk_mini_chunks_count = {}
+    for chunk_ind, chunk in enumerate(chunks):
+        chunk_texts.append(passage_prefix + chunk.content)
+        mini_chunk_texts = (
+            split_chunk_text_into_mini_chunks(chunk.content)
+            if enable_mini_chunk
+            else []
+        )
+        prefixed_mini_chunk_texts = [passage_prefix + text for text in mini_chunk_texts]
+        chunk_texts.extend(prefixed_mini_chunk_texts)
+        chunk_mini_chunks_count[chunk_ind] = 1 + len(prefixed_mini_chunk_texts)
+
+    text_batches = [
+        chunk_texts[i : i + batch_size] for i in range(0, len(chunk_texts), batch_size)
+    ]
+
+    embeddings_np: list[numpy.ndarray] = []
+    for text_batch in text_batches:
+        # Whether to normalize embeddings is configured in model_configs.py; use the value matching the model's training loss
+        embeddings_np.extend(
+            embedding_model.encode(
+                text_batch, normalize_embeddings=NORMALIZE_EMBEDDINGS
+            )
+        )
+    embeddings: list[list[float]] = [embedding.tolist() for embedding in embeddings_np]
+
+    embedding_ind_start = 0
+    for chunk_ind, chunk in enumerate(chunks):
+        num_embeddings = chunk_mini_chunks_count[chunk_ind]
+        chunk_embeddings = embeddings[
+            embedding_ind_start : embedding_ind_start + num_embeddings
+        ]
+        new_embedded_chunk = IndexChunk(
+            **{k: getattr(chunk, k) for k in chunk.__dataclass_fields__},
+            embeddings=ChunkEmbedding(
+                full_embedding=chunk_embeddings[0],
+                mini_chunk_embeddings=chunk_embeddings[1:],
+            ),
+        )
+        embedded_chunks.append(new_embedded_chunk)
+        embedding_ind_start += num_embeddings
+
+    return embedded_chunks
+
+
+class DefaultEmbedder(Embedder):
+    def embed(self, chunks: list[DocAwareChunk]) -> list[IndexChunk]:
+        return encode_chunks(chunks)
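A note on the bookkeeping in encode_chunks above: each chunk contributes one full-text embedding plus zero or more mini-chunk embeddings to a single flat list, and chunk_mini_chunks_count is what lets the second loop slice that list back per chunk. A toy walkthrough (hypothetical chunk contents and split points, invented for the example; "passage: " stands in for ASYM_PASSAGE_PREFIX):

    # Toy illustration of the flatten/unflatten logic in encode_chunks.
    chunks = ["alpha beta gamma delta", "epsilon"]
    mini_chunks = [["alpha beta", "gamma delta"], []]  # pretend splitter output

    chunk_texts: list[str] = []
    counts: dict[int, int] = {}
    for i, (full, minis) in enumerate(zip(chunks, mini_chunks)):
        chunk_texts.append("passage: " + full)            # full chunk first
        chunk_texts.extend("passage: " + m for m in minis)
        counts[i] = 1 + len(minis)

    assert counts == {0: 3, 1: 1}  # 3 texts for chunk 0, 1 for chunk 1

    # After batch-encoding chunk_texts, the vector at each chunk's start
    # index is the full_embedding and the rest are mini_chunk_embeddings.
    start = 0
    for i in range(len(chunks)):
        print("chunk", i, "full vector at", start,
              "mini vectors at", list(range(start + 1, start + counts[i])))
        start += counts[i]

This keeps the model call fully batched across chunks and mini-chunks while still reassembling a per-chunk ChunkEmbedding afterwards.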
diff --git a/backend/danswer/datastores/indexing_pipeline.py b/backend/danswer/indexing/indexing_pipeline.py
similarity index 91%
rename from backend/danswer/datastores/indexing_pipeline.py
rename to backend/danswer/indexing/indexing_pipeline.py
index e9064f50eb2..1d41c681c5f 100644
--- a/backend/danswer/datastores/indexing_pipeline.py
+++ b/backend/danswer/indexing/indexing_pipeline.py
@@ -5,21 +5,21 @@ from typing import Protocol
 
 from sqlalchemy.orm import Session
 
 from danswer.access.access import get_access_for_documents
-from danswer.chunking.chunk import Chunker
-from danswer.chunking.chunk import DefaultChunker
-from danswer.chunking.models import DocAwareChunk
-from danswer.chunking.models import DocMetadataAwareIndexChunk
 from danswer.connectors.models import Document
 from danswer.connectors.models import IndexAttemptMetadata
-from danswer.datastores.document_index import get_default_document_index
-from danswer.datastores.interfaces import DocumentIndex
-from danswer.datastores.interfaces import DocumentMetadata
 from danswer.db.document import prepare_to_modify_documents
 from danswer.db.document import upsert_documents_complete
 from danswer.db.document_set import fetch_document_sets_for_documents
 from danswer.db.engine import get_sqlalchemy_engine
+from danswer.document_index import get_default_document_index
+from danswer.document_index.interfaces import DocumentIndex
+from danswer.document_index.interfaces import DocumentMetadata
+from danswer.indexing.chunker import Chunker
+from danswer.indexing.chunker import DefaultChunker
+from danswer.indexing.embedder import DefaultEmbedder
+from danswer.indexing.models import DocAwareChunk
+from danswer.indexing.models import DocMetadataAwareIndexChunk
 from danswer.search.models import Embedder
-from danswer.search.semantic_search import DefaultEmbedder
 from danswer.utils.logger import setup_logger
 
 logger = setup_logger()
diff --git a/backend/danswer/chunking/models.py b/backend/danswer/indexing/models.py
similarity index 100%
rename from backend/danswer/chunking/models.py
rename to backend/danswer/indexing/models.py
diff --git a/backend/danswer/main.py b/backend/danswer/main.py
index dab059d6c51..b0b452ec1eb 100644
--- a/backend/danswer/main.py
+++ b/backend/danswer/main.py
@@ -30,9 +30,9 @@ from danswer.configs.model_configs import DOCUMENT_ENCODER_MODEL
 from danswer.configs.model_configs import GEN_AI_MODEL_VERSION
 from danswer.configs.model_configs import INTERNAL_MODEL_VERSION
 from danswer.configs.model_configs import SKIP_RERANKING
-from danswer.datastores.document_index import get_default_document_index
 from danswer.db.credentials import create_initial_public_credential
 from danswer.direct_qa.llm_utils import get_default_qa_model
+from danswer.document_index import get_default_document_index
 from danswer.server.cc_pair.api import router as cc_pair_router
 from danswer.server.chat_backend import router as chat_router
 from danswer.server.connector import router as connector_router
@@ -148,7 +148,7 @@ def get_application() -> FastAPI:
 
     @application.on_event("startup")
     def startup_event() -> None:
         # To avoid circular imports
-        from danswer.search.search_utils import (
+        from danswer.search.search_nlp_models import (
             warm_up_models,
         )
diff --git a/backend/danswer/search/danswer_helper.py b/backend/danswer/search/danswer_helper.py
index 3d5d788e3d2..d2a88f1f5f7 100644
--- a/backend/danswer/search/danswer_helper.py
+++ b/backend/danswer/search/danswer_helper.py
@@ -2,12 +2,12 @@ import numpy as np
 import tensorflow as tf  # type:ignore
 from transformers import AutoTokenizer  # type:ignore
 
-from danswer.search.keyword_search import remove_stop_words
 from danswer.search.models import QueryFlow
 from danswer.search.models import SearchType
-from danswer.search.search_utils import get_default_intent_model
-from danswer.search.search_utils import get_default_intent_model_tokenizer
-from danswer.search.search_utils import get_default_tokenizer
+from danswer.search.search_nlp_models import get_default_intent_model
+from danswer.search.search_nlp_models import get_default_intent_model_tokenizer
+from danswer.search.search_nlp_models import get_default_tokenizer
+from danswer.search.search_runner import remove_stop_words
 from danswer.server.models import HelperResponse
 from danswer.utils.logger import setup_logger
 from danswer.utils.timing import log_function_time
diff --git a/backend/danswer/search/keyword_search.py b/backend/danswer/search/keyword_search.py
deleted file mode 100644
index 25ad1c91143..00000000000
--- a/backend/danswer/search/keyword_search.py
+++ /dev/null
@@ -1,79 +0,0 @@
-from collections.abc import Callable
-
-from nltk.corpus import stopwords  # type:ignore
-from nltk.stem import WordNetLemmatizer  # type:ignore
-from nltk.tokenize import word_tokenize  # type:ignore
-
-from danswer.chunking.models import InferenceChunk
-from danswer.configs.app_configs import EDIT_KEYWORD_QUERY
-from danswer.configs.app_configs import NUM_RETURNED_HITS
-from danswer.datastores.interfaces import DocumentIndex
-from danswer.datastores.interfaces import IndexFilters
-from danswer.search.models import ChunkMetric
-from danswer.search.models import MAX_METRICS_CONTENT
-from danswer.search.models import RetrievalMetricsContainer
-from danswer.utils.logger import setup_logger
-from danswer.utils.timing import log_function_time
-
-logger = setup_logger()
-
-
-def lemmatize_text(text: str) -> list[str]:
-    lemmatizer = WordNetLemmatizer()
-    word_tokens = word_tokenize(text)
-    return [lemmatizer.lemmatize(word) for word in word_tokens]
-
-
-def remove_stop_words(text: str) -> list[str]:
-    stop_words = set(stopwords.words("english"))
-    word_tokens = word_tokenize(text)
-    text_trimmed = [word for word in word_tokens if word.casefold() not in stop_words]
-    return text_trimmed or word_tokens
-
-
-def query_processing(
-    query: str,
-) -> str:
-    query = " ".join(remove_stop_words(query))
-    query = " ".join(lemmatize_text(query))
-    return query
-
-
-@log_function_time()
-def retrieve_keyword_documents(
-    query: str,
-    filters: IndexFilters,
-    favor_recent: bool,
-    datastore: DocumentIndex,
-    num_hits: int = NUM_RETURNED_HITS,
-    edit_query: bool = EDIT_KEYWORD_QUERY,
-    retrieval_metrics_callback: Callable[[RetrievalMetricsContainer], None]
-    | None = None,
-) -> list[InferenceChunk] | None:
-    edited_query = query_processing(query) if edit_query else query
-
-    top_chunks = datastore.keyword_retrieval(
-        edited_query, filters, favor_recent, num_hits
-    )
-
-    if not top_chunks:
-        logger.warning(
-            f"Keyword search returned no results - Filters: {filters}\tEdited Query: {edited_query}"
-        )
-        return None
-
-    if retrieval_metrics_callback is not None:
-        chunk_metrics = [
-            ChunkMetric(
-                document_id=chunk.document_id,
-                chunk_content_start=chunk.content[:MAX_METRICS_CONTENT],
-                first_link=chunk.source_links[0] if chunk.source_links else None,
-                score=chunk.score if chunk.score is not None else 0,
-            )
-            for chunk in top_chunks
-        ]
-        retrieval_metrics_callback(
-            RetrievalMetricsContainer(keyword_search=True, metrics=chunk_metrics)
-        )
-
-    return top_chunks
diff --git a/backend/danswer/search/models.py b/backend/danswer/search/models.py
index d11ba5d6c5c..aacbf67679a 100644
--- a/backend/danswer/search/models.py
+++ b/backend/danswer/search/models.py
@@ -2,8 +2,8 @@ from enum import Enum
 
 from pydantic import BaseModel
 
-from danswer.chunking.models import DocAwareChunk
-from danswer.chunking.models import IndexChunk
+from danswer.indexing.models import DocAwareChunk
+from danswer.indexing.models import IndexChunk
 
 
 MAX_METRICS_CONTENT = (
@@ -12,8 +12,9 @@ MAX_METRICS_CONTENT = (
 
 
 class SearchType(str, Enum):
-    KEYWORD = "keyword"  # May be better to also try keyword search if Semantic (AI Search) is on
-    SEMANTIC = "semantic"  # Really should try Semantic (AI Search) if keyword is on
+    KEYWORD = "keyword"
+    SEMANTIC = "semantic"
+    HYBRID = "hybrid"
 
 
 class QueryFlow(str, Enum):
diff --git a/backend/danswer/search/search_utils.py b/backend/danswer/search/search_nlp_models.py
similarity index 100%
rename from backend/danswer/search/search_utils.py
rename to backend/danswer/search/search_nlp_models.py
diff --git a/backend/danswer/search/semantic_search.py b/backend/danswer/search/search_runner.py
similarity index 76%
rename from backend/danswer/search/semantic_search.py
rename to backend/danswer/search/search_runner.py
index 5249d02c964..29b5846c855 100644
--- a/backend/danswer/search/semantic_search.py
+++ b/backend/danswer/search/search_runner.py
@@ -1,35 +1,33 @@
 from collections.abc import Callable
 
 import numpy
+from nltk.corpus import stopwords  # type:ignore
+from nltk.stem import WordNetLemmatizer  # type:ignore
+from nltk.tokenize import word_tokenize  # type:ignore
 from sentence_transformers import SentenceTransformer  # type: ignore
 
-from danswer.chunking.chunk import split_chunk_text_into_mini_chunks
-from danswer.chunking.models import ChunkEmbedding
-from danswer.chunking.models import DocAwareChunk
-from danswer.chunking.models import IndexChunk
-from danswer.chunking.models import InferenceChunk
-from danswer.configs.app_configs import ENABLE_MINI_CHUNK
+from danswer.configs.app_configs import EDIT_KEYWORD_QUERY
 from danswer.configs.app_configs import NUM_RERANKED_RESULTS
 from danswer.configs.app_configs import NUM_RETURNED_HITS
-from danswer.configs.model_configs import ASYM_PASSAGE_PREFIX
 from danswer.configs.model_configs import ASYM_QUERY_PREFIX
-from danswer.configs.model_configs import BATCH_SIZE_ENCODE_CHUNKS
 from danswer.configs.model_configs import CROSS_ENCODER_RANGE_MAX
 from danswer.configs.model_configs import CROSS_ENCODER_RANGE_MIN
 from danswer.configs.model_configs import NORMALIZE_EMBEDDINGS
 from danswer.configs.model_configs import SIM_SCORE_RANGE_HIGH
 from danswer.configs.model_configs import SIM_SCORE_RANGE_LOW
 from danswer.configs.model_configs import SKIP_RERANKING
-from danswer.datastores.datastore_utils import translate_boost_count_to_multiplier
-from danswer.datastores.interfaces import DocumentIndex
-from danswer.datastores.interfaces import IndexFilters
+from danswer.document_index.document_index_utils import (
+    translate_boost_count_to_multiplier,
+)
+from danswer.document_index.interfaces import DocumentIndex
+from danswer.document_index.interfaces import IndexFilters
+from danswer.indexing.models import InferenceChunk
 from danswer.search.models import ChunkMetric
-from danswer.search.models import Embedder
 from danswer.search.models import MAX_METRICS_CONTENT
 from danswer.search.models import RerankMetricsContainer
 from danswer.search.models import RetrievalMetricsContainer
-from danswer.search.search_utils import get_default_embedding_model
-from danswer.search.search_utils import get_default_reranking_model_ensemble
+from danswer.search.search_nlp_models import get_default_embedding_model
+from danswer.search.search_nlp_models import get_default_reranking_model_ensemble
 from danswer.server.models import SearchDoc
 from danswer.utils.logger import setup_logger
 from danswer.utils.timing import log_function_time
@@ -37,6 +35,27 @@ from danswer.utils.timing import log_function_time
 
 logger = setup_logger()
 
+def lemmatize_text(text: str) -> list[str]:
+    lemmatizer = WordNetLemmatizer()
+    word_tokens = word_tokenize(text)
+    return [lemmatizer.lemmatize(word) for word in word_tokens]
+
+
+def remove_stop_words(text: str) -> list[str]:
+    stop_words = set(stopwords.words("english"))
+    word_tokens = word_tokenize(text)
+    text_trimmed = [word for word in word_tokens if word.casefold() not in stop_words]
+    return text_trimmed or word_tokens
+
+
+def query_processing(
+    query: str,
+) -> str:
+    query = " ".join(remove_stop_words(query))
+    query = " ".join(lemmatize_text(query))
+    return query
+
+
 def chunks_to_search_docs(chunks: list[InferenceChunk] | None) -> list[SearchDoc]:
     search_docs = (
         [
@@ -214,6 +233,46 @@ def apply_boost(
     return final_chunks
 
 
+@log_function_time()
+def retrieve_keyword_documents(
+    query: str,
+    filters: IndexFilters,
+    favor_recent: bool,
+    datastore: DocumentIndex,
+    num_hits: int = NUM_RETURNED_HITS,
+    edit_query: bool = EDIT_KEYWORD_QUERY,
+    retrieval_metrics_callback: Callable[[RetrievalMetricsContainer], None]
+    | None = None,
+) -> list[InferenceChunk] | None:
+    edited_query = query_processing(query) if edit_query else query
+
+    top_chunks = datastore.keyword_retrieval(
+        edited_query, filters, favor_recent, num_hits
+    )
+
+    if not top_chunks:
+        logger.warning(
+            f"Keyword search returned no results - Filters: {filters}\tEdited Query: {edited_query}"
+        )
+        return None
+
+    if retrieval_metrics_callback is not None:
+        chunk_metrics = [
+            ChunkMetric(
+                document_id=chunk.document_id,
+                chunk_content_start=chunk.content[:MAX_METRICS_CONTENT],
+                first_link=chunk.source_links[0] if chunk.source_links else None,
+                score=chunk.score if chunk.score is not None else 0,
+            )
+            for chunk in top_chunks
+        ]
+        retrieval_metrics_callback(
+            RetrievalMetricsContainer(keyword_search=True, metrics=chunk_metrics)
+        )
+
+    return top_chunks
+
+
 @log_function_time()
 def retrieve_ranked_documents(
     query: str,
@@ -277,64 +336,6 @@ def retrieve_ranked_documents(
     return ranked_chunks, top_chunks[num_rerank:]
 
 
-@log_function_time()
-def encode_chunks(
-    chunks: list[DocAwareChunk],
-    embedding_model: SentenceTransformer | None = None,
-    batch_size: int = BATCH_SIZE_ENCODE_CHUNKS,
-    enable_mini_chunk: bool = ENABLE_MINI_CHUNK,
-    passage_prefix: str = ASYM_PASSAGE_PREFIX,
-) -> list[IndexChunk]:
-    embedded_chunks: list[IndexChunk] = []
-    if embedding_model is None:
-        embedding_model = get_default_embedding_model()
-
-    chunk_texts = []
-    chunk_mini_chunks_count = {}
-    for chunk_ind, chunk in enumerate(chunks):
-        chunk_texts.append(passage_prefix + chunk.content)
-        mini_chunk_texts = (
-            split_chunk_text_into_mini_chunks(chunk.content)
-            if enable_mini_chunk
-            else []
-        )
-        prefixed_mini_chunk_texts = [passage_prefix + text for text in mini_chunk_texts]
-        chunk_texts.extend(prefixed_mini_chunk_texts)
-        chunk_mini_chunks_count[chunk_ind] = 1 + len(prefixed_mini_chunk_texts)
-
-    text_batches = [
-        chunk_texts[i : i + batch_size] for i in range(0, len(chunk_texts), batch_size)
-    ]
-
-    embeddings_np: list[numpy.ndarray] = []
-    for text_batch in text_batches:
-        # Normalize embeddings is only configured via model_configs.py, be sure to use right value for the set loss
-        embeddings_np.extend(
-            embedding_model.encode(
-                text_batch, normalize_embeddings=NORMALIZE_EMBEDDINGS
-            )
-        )
-    embeddings: list[list[float]] = [embedding.tolist() for embedding in embeddings_np]
-
-    embedding_ind_start = 0
-    for chunk_ind, chunk in enumerate(chunks):
-        num_embeddings = chunk_mini_chunks_count[chunk_ind]
-        chunk_embeddings = embeddings[
-            embedding_ind_start : embedding_ind_start + num_embeddings
-        ]
-        new_embedded_chunk = IndexChunk(
-            **{k: getattr(chunk, k) for k in chunk.__dataclass_fields__},
-            embeddings=ChunkEmbedding(
-                full_embedding=chunk_embeddings[0],
-                mini_chunk_embeddings=chunk_embeddings[1:],
-            ),
-        )
-        embedded_chunks.append(new_embedded_chunk)
-        embedding_ind_start += num_embeddings
-
-    return embedded_chunks
-
-
 def embed_query(
     query: str,
     embedding_model: SentenceTransformer | None = None,
@@ -351,8 +352,3 @@ def embed_query(
     query_embedding = query_embedding.tolist()
 
     return query_embedding
-
-
-class DefaultEmbedder(Embedder):
-    def embed(self, chunks: list[DocAwareChunk]) -> list[IndexChunk]:
-        return encode_chunks(chunks)
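The keyword path now consolidated in search_runner.py edits the query with NLTK before hitting the index: stopword removal, then lemmatization. Both helpers need NLTK corpora present at runtime, otherwise word_tokenize and WordNetLemmatizer raise LookupError. A minimal local check (a sketch; assumes it is run inside the backend environment, and the exact output depends on the installed corpora versions):

    import nltk

    # One-time corpus downloads used by remove_stop_words / lemmatize_text
    for pkg in ("stopwords", "punkt", "wordnet"):
        nltk.download(pkg, quiet=True)

    from danswer.search.search_runner import query_processing

    # Stopwords drop first, then lemmatization (e.g. "indexes" -> "index")
    print(query_processing("What are the steps for resetting the indexes?"))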
diff --git a/backend/danswer/server/models.py b/backend/danswer/server/models.py
index d945e15d2b5..8037ddfd292 100644
--- a/backend/danswer/server/models.py
+++ b/backend/danswer/server/models.py
@@ -202,11 +202,12 @@ class IndexFilters(RequestFilters):
 
 
 class QuestionRequest(BaseModel):
     query: str
     collection: str
-    use_keyword: bool | None
     filters: RequestFilters
     offset: int | None
     enable_auto_detect_filters: bool
     favor_recent: bool | None = None
+    use_keyword: bool = False  # TODO remove this for hybrid search
+    search_flow: SearchType | None = None  # Default hybrid
 
 
 class QAFeedbackRequest(BaseModel):
diff --git a/backend/danswer/server/search_backend.py b/backend/danswer/server/search_backend.py
index 86656e1a5e4..4e3e75d620b 100644
--- a/backend/danswer/server/search_backend.py
+++ b/backend/danswer/server/search_backend.py
@@ -9,12 +9,9 @@ from sqlalchemy.orm import Session
 
 from danswer.auth.users import current_admin_user
 from danswer.auth.users import current_user
-from danswer.chunking.models import InferenceChunk
 from danswer.configs.app_configs import DISABLE_GENERATIVE_AI
 from danswer.configs.app_configs import NUM_DOCUMENT_TOKENS_FED_TO_GENERATIVE_MODEL
 from danswer.configs.constants import IGNORE_FOR_QA
-from danswer.datastores.document_index import get_default_document_index
-from danswer.datastores.vespa.store import VespaIndex
 from danswer.db.engine import get_session
 from danswer.db.feedback import create_doc_retrieval_feedback
 from danswer.db.feedback import create_query_event
@@ -28,14 +25,17 @@ from danswer.direct_qa.interfaces import DanswerAnswerPiece
 from danswer.direct_qa.interfaces import StreamingError
 from danswer.direct_qa.llm_utils import get_default_qa_model
 from danswer.direct_qa.qa_utils import get_usable_chunks
+from danswer.document_index import get_default_document_index
+from danswer.document_index.vespa.index import VespaIndex
+from danswer.indexing.models import InferenceChunk
 from danswer.search.access_filters import build_access_filters_for_user
 from danswer.search.danswer_helper import query_intent
 from danswer.search.danswer_helper import recommend_search_flow
-from danswer.search.keyword_search import retrieve_keyword_documents
 from danswer.search.models import QueryFlow
 from danswer.search.models import SearchType
-from danswer.search.semantic_search import chunks_to_search_docs
-from danswer.search.semantic_search import retrieve_ranked_documents
+from danswer.search.search_runner import chunks_to_search_docs
+from danswer.search.search_runner import retrieve_keyword_documents
+from danswer.search.search_runner import retrieve_ranked_documents
 from danswer.secondary_llm_flows.extract_filters import extract_question_time_filters
 from danswer.secondary_llm_flows.query_validation import get_query_answerability
 from danswer.secondary_llm_flows.query_validation import stream_query_answerability
@@ -137,6 +137,67 @@ def stream_query_validation(
     )
 
 
+@router.post("/keyword-search")
+def keyword_search(
+    question: QuestionRequest,
+    user: User = Depends(current_user),
+    db_session: Session = Depends(get_session),
+) -> SearchResponse:
+    query = question.query
+    logger.info(f"Received keyword search query: {query}")
+
+    time_cutoff, favor_recent = extract_question_time_filters(question)
+    question.filters.time_cutoff = time_cutoff
+    filters = question.filters
+
+    query_event_id = create_query_event(
+        query=query,
+        selected_flow=SearchType.KEYWORD,
+        llm_answer=None,
+        user_id=user.id,
+        db_session=db_session,
+    )
+
+    user_id = None if user is None else user.id
+    user_acl_filters = build_access_filters_for_user(user, db_session)
+    final_filters = IndexFilters(
+        source_type=filters.source_type,
+        document_set=filters.document_set,
+        time_cutoff=filters.time_cutoff,
+        access_control_list=user_acl_filters,
+    )
+    ranked_chunks = retrieve_keyword_documents(
+        query=query,
+        filters=final_filters,
+        favor_recent=favor_recent,
+        datastore=get_default_document_index(),
+    )
+    if not ranked_chunks:
+        return SearchResponse(
+            top_ranked_docs=None,
+            lower_ranked_docs=None,
+            query_event_id=query_event_id,
+            time_cutoff=time_cutoff,
+            favor_recent=favor_recent,
+        )
+
+    top_docs = chunks_to_search_docs(ranked_chunks)
+    update_query_event_retrieved_documents(
+        db_session=db_session,
+        retrieved_document_ids=[doc.document_id for doc in top_docs],
+        query_id=query_event_id,
+        user_id=user_id,
+    )
+
+    return SearchResponse(
+        top_ranked_docs=top_docs,
+        lower_ranked_docs=None,
+        query_event_id=query_event_id,
+        time_cutoff=time_cutoff,
+        favor_recent=favor_recent,
+    )
+
+
 @router.post("/semantic-search")
 def semantic_search(
     question: QuestionRequest,
@@ -199,14 +260,15 @@ def semantic_search(
     )
 
 
-@router.post("/keyword-search")
-def keyword_search(
+# TODO don't use this, not done yet
+@router.post("/hybrid-search")
+def hybrid_search(
     question: QuestionRequest,
     user: User = Depends(current_user),
     db_session: Session = Depends(get_session),
 ) -> SearchResponse:
     query = question.query
-    logger.info(f"Received keyword search query: {query}")
+    logger.info(f"Received hybrid search query: {query}")
 
     time_cutoff, favor_recent = extract_question_time_filters(question)
     question.filters.time_cutoff = time_cutoff
@@ -214,7 +276,7 @@ def keyword_search(
 
     query_event_id = create_query_event(
         query=query,
-        selected_flow=SearchType.KEYWORD,
+        selected_flow=SearchType.HYBRID,
         llm_answer=None,
         user_id=user.id,
         db_session=db_session,
@@ -228,7 +290,7 @@ def keyword_search(
         time_cutoff=filters.time_cutoff,
         access_control_list=user_acl_filters,
     )
-    ranked_chunks = retrieve_keyword_documents(
+    ranked_chunks, unranked_chunks = retrieve_ranked_documents(
         query=query,
         filters=final_filters,
         favor_recent=favor_recent,
@@ -244,6 +306,7 @@ def keyword_search(
     )
 
     top_docs = chunks_to_search_docs(ranked_chunks)
+    other_top_docs = chunks_to_search_docs(unranked_chunks)
     update_query_event_retrieved_documents(
         db_session=db_session,
         retrieved_document_ids=[doc.document_id for doc in top_docs],
@@ -253,7 +316,7 @@ def keyword_search(
 
     return SearchResponse(
         top_ranked_docs=top_docs,
-        lower_ranked_docs=None,
+        lower_ranked_docs=other_top_docs,
         query_event_id=query_event_id,
         time_cutoff=time_cutoff,
         favor_recent=favor_recent,
diff --git a/backend/danswer/utils/acl.py b/backend/danswer/utils/acl.py
index bed2683f8c2..aa6576ceb2e 100644
--- a/backend/danswer/utils/acl.py
+++ b/backend/danswer/utils/acl.py
@@ -4,12 +4,12 @@ from sqlalchemy import select
 from sqlalchemy.orm import Session
 
 from danswer.access.models import DocumentAccess
-from danswer.datastores.document_index import get_default_document_index
-from danswer.datastores.interfaces import UpdateRequest
-from danswer.datastores.vespa.store import VespaIndex
 from danswer.db.document import get_acccess_info_for_documents
 from danswer.db.engine import get_sqlalchemy_engine
 from danswer.db.models import Document
+from danswer.document_index import get_default_document_index
+from danswer.document_index.interfaces import UpdateRequest
+from danswer.document_index.vespa.index import VespaIndex
 from danswer.dynamic_configs import get_dynamic_config_store
 from danswer.dynamic_configs.interface import ConfigNotFoundError
 from danswer.utils.logger import setup_logger
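With the routes above in place, the keyword endpoint can be exercised directly. A rough request sketch (the host, port, route mount point, and collection value are assumptions, not taken from this patch; the body fields follow QuestionRequest as changed in server/models.py):

    import requests

    # Illustrative call to the new /keyword-search route.
    resp = requests.post(
        "http://localhost:8080/keyword-search",
        json={
            "query": "how do I reset the vespa index",
            "collection": "danswer_index",  # assumed collection name
            "filters": {
                "source_type": None,
                "document_set": None,
                "time_cutoff": None,
            },
            "offset": None,
            "enable_auto_detect_filters": False,
            "favor_recent": None,
            "use_keyword": True,  # legacy flag kept on QuestionRequest for now
        },
    )
    resp.raise_for_status()
    print(resp.json()["top_ranked_docs"])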
diff --git a/backend/scripts/reset_indexes.py b/backend/scripts/reset_indexes.py
index 9e8c86aad6f..c8ffe5620b8 100644
--- a/backend/scripts/reset_indexes.py
+++ b/backend/scripts/reset_indexes.py
@@ -2,7 +2,7 @@ import requests
 
 from danswer.configs.app_configs import DOCUMENT_INDEX_NAME
-from danswer.datastores.vespa.store import DOCUMENT_ID_ENDPOINT
+from danswer.document_index.vespa.index import DOCUMENT_ID_ENDPOINT
 from danswer.utils.logger import setup_logger
 
 logger = setup_logger()
diff --git a/backend/scripts/save_load_state.py b/backend/scripts/save_load_state.py
index cfb0c3f7e3c..eeb57323bd7 100644
--- a/backend/scripts/save_load_state.py
+++ b/backend/scripts/save_load_state.py
@@ -13,7 +13,7 @@ from danswer.configs.app_configs import POSTGRES_HOST
 from danswer.configs.app_configs import POSTGRES_PASSWORD
 from danswer.configs.app_configs import POSTGRES_PORT
 from danswer.configs.app_configs import POSTGRES_USER
-from danswer.datastores.vespa.store import DOCUMENT_ID_ENDPOINT
+from danswer.document_index.vespa.index import DOCUMENT_ID_ENDPOINT
 from danswer.utils.logger import setup_logger
 
 logger = setup_logger()
diff --git a/backend/tests/unit/danswer/direct_qa/test_qa_utils.py b/backend/tests/unit/danswer/direct_qa/test_qa_utils.py
index d6b56bb4653..e2551560ce0 100644
--- a/backend/tests/unit/danswer/direct_qa/test_qa_utils.py
+++ b/backend/tests/unit/danswer/direct_qa/test_qa_utils.py
@@ -1,9 +1,9 @@
 import textwrap
 import unittest
 
-from danswer.chunking.models import InferenceChunk
 from danswer.direct_qa.qa_utils import match_quotes_to_docs
 from danswer.direct_qa.qa_utils import separate_answer_quotes
+from danswer.indexing.models import InferenceChunk
 
 
 class TestQAPostprocessing(unittest.TestCase):