commit aa8cb44a33
parent fc60fd0322
committed by Evan Lohn

    taking out Extraction for now
@@ -27,7 +27,7 @@ def answer_check(state: AnswerQuestionState, config: RunnableConfig) -> QACheckUpdate:
         return QACheckUpdate(
             answer_quality=SUB_CHECK_NO,
             log_messages=[
-                f"{now_end} -- Answer check SQ-{level}-{question_num} - unknown answer, Time taken: {now_end - now_start}"
+                f"{now_start} -- Answer check SQ-{level}-{question_num} - unknown answer, Time taken: {now_end - now_start}"
             ],
         )
     msg = [
@@ -53,7 +53,7 @@ def answer_check(state: AnswerQuestionState, config: RunnableConfig) -> QACheckUpdate:
     return QACheckUpdate(
         answer_quality=quality_str,
         log_messages=[
-            f"""{now_end} -- Answer check SQ-{level}-{question_num} - Answer quality: {quality_str},
+            f"""{now_start} -- Answer check SQ-{level}-{question_num} - Answer quality: {quality_str},
             Time taken: {now_end - now_start}"""
         ],
     )
@@ -32,9 +32,14 @@ from onyx.agents.agent_search.deep_search_a.main.nodes.agent_search_start import
 from onyx.agents.agent_search.deep_search_a.main.nodes.answer_comparison import (
     answer_comparison,
 )
 from onyx.agents.agent_search.deep_search_a.main.nodes.entity_term_extraction_llm import (
     entity_term_extraction_llm,
 )
+from onyx.agents.agent_search.deep_search_a.main.nodes.direct_llm_handling import (
+    direct_llm_handling,
+)
 from onyx.agents.agent_search.deep_search_a.main.nodes.generate_initial_answer import (
     generate_initial_answer,
 )
@@ -197,10 +202,10 @@ def main_graph_builder(test_mode: bool = False) -> StateGraph:
         action=initial_answer_quality_check,
     )

-    graph.add_node(
-        node="entity_term_extraction_llm",
-        action=entity_term_extraction_llm,
-    )
+    # graph.add_node(
+    #     node="entity_term_extraction_llm",
+    #     action=entity_term_extraction_llm,
+    # )
     graph.add_node(
         node="refined_answer_decision",
         action=refined_answer_decision,
@@ -259,10 +264,10 @@ def main_graph_builder(test_mode: bool = False) -> StateGraph:
         end_key="base_raw_search_subgraph",
     )

-    graph.add_edge(
-        start_key="agent_search_start",
-        end_key="entity_term_extraction_llm",
-    )
+    # graph.add_edge(
+    #     start_key="agent_search_start",
+    #     end_key="entity_term_extraction_llm",
+    # )

     graph.add_edge(
         start_key="agent_search_start",
@@ -319,8 +324,12 @@ def main_graph_builder(test_mode: bool = False) -> StateGraph:
         end_key="initial_answer_quality_check",
     )

+    # graph.add_edge(
+    #     start_key=["initial_answer_quality_check", "entity_term_extraction_llm"],
+    #     end_key="refined_answer_decision",
+    # )
     graph.add_edge(
-        start_key=["initial_answer_quality_check", "entity_term_extraction_llm"],
+        start_key="initial_answer_quality_check",
         end_key="refined_answer_decision",
     )

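The hunks above rewire the main graph once the extraction step is dropped: the entity_term_extraction_llm node and its incoming edge are commented out, and the fan-in edge into refined_answer_decision shrinks from a list of start keys to a single key. A minimal sketch of this add_node/add_edge pattern, assuming LangGraph's StateGraph API, with an illustrative state type and placeholder node actions rather than the repository's real ones:

```python
# Minimal sketch of the node/edge wiring pattern used above. Assumes the
# `langgraph` package; the state schema and node bodies are illustrative.
from typing import TypedDict

from langgraph.graph import END, START, StateGraph


class MainState(TypedDict):
    answer_quality: str


def initial_answer_quality_check(state: MainState) -> MainState:
    # placeholder node action
    return state


def refined_answer_decision(state: MainState) -> MainState:
    # placeholder node action
    return state


graph = StateGraph(MainState)
graph.add_node(node="initial_answer_quality_check", action=initial_answer_quality_check)
graph.add_node(node="refined_answer_decision", action=refined_answer_decision)

graph.add_edge(start_key=START, end_key="initial_answer_quality_check")
# With entity_term_extraction_llm removed, this edge takes a single start_key
# instead of a list such as
# ["initial_answer_quality_check", "entity_term_extraction_llm"].
graph.add_edge(start_key="initial_answer_quality_check", end_key="refined_answer_decision")
graph.add_edge(start_key="refined_answer_decision", end_key=END)

compiled = graph.compile()
```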
@@ -60,6 +60,6 @@ def agent_search_start(
     return ExploratorySearchUpdate(
         exploratory_search_results=exploratory_search_results,
         log_messages=[
-            f"--------{now_end}--{now_end - now_start}--------EXPLORATORY SEARCH END---"
+            f"{now_start} -- Main - Exploratory Search, Time taken: {now_end - now_start}"
         ],
     )
@@ -4,7 +4,6 @@ from datetime import datetime
 from typing import cast

 from langchain_core.messages import HumanMessage
-from langchain_core.messages import merge_message_runs
 from langchain_core.runnables import RunnableConfig

 from onyx.agents.agent_search.deep_search_a.main.operations import logger
@@ -32,12 +31,15 @@ def entity_term_extraction_llm(
     now_start = datetime.now()

     logger.debug(f"--------{now_start}--------GENERATE ENTITIES & TERMS---")
+    logger.debug(
+        f"--------{now_start}--------GAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"
+    )

     agent_a_config = cast(AgentSearchConfig, config["metadata"]["config"])
     if not agent_a_config.allow_refinement:
         now_end = datetime.now()
         return EntityTermExtractionUpdate(
-            entity_retlation_term_extractions=EntityRelationshipTermExtraction(
+            entity_relation_term_extractions=EntityRelationshipTermExtraction(
                 entities=[],
                 relationships=[],
                 terms=[],
@@ -64,14 +66,11 @@ def entity_term_extraction_llm(
     ]
     fast_llm = agent_a_config.fast_llm
     # Grader
-    llm_response_list = list(
-        fast_llm.stream(
-            prompt=msg,
-        )
-    )
-    llm_response = merge_message_runs(llm_response_list, chunk_separator="")[0].content
+    llm_response = fast_llm.invoke(
+        prompt=msg,
+    )

-    cleaned_response = re.sub(r"```json\n|\n```", "", llm_response)
+    cleaned_response = re.sub(r"```json\n|\n```", "", str(llm_response.content))
     parsed_response = json.loads(cleaned_response)

     entities = []
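This hunk swaps the stream-and-merge call for a single blocking call, then strips a possible Markdown json code fence from the reply before json.loads. A small sketch contrasting the two access patterns, assuming a LangChain-style chat model (the repository's own fast_llm wrapper is called with prompt=msg above; the positional form and helper names below are illustrative):

```python
# Minimal sketch contrasting the old stream-and-merge pattern with the new
# single invoke() call, assuming a langchain_core chat model. The fence
# cleanup mirrors the cleaned_response handling in the diff above.
import json
import re

from langchain_core.language_models.chat_models import BaseChatModel
from langchain_core.messages import HumanMessage, merge_message_runs


def extract_json_old(llm: BaseChatModel, msg: list[HumanMessage]) -> dict:
    # Old pattern: collect streamed chunks, then merge them into one message.
    chunks = list(llm.stream(msg))
    merged = merge_message_runs(chunks, chunk_separator="")[0].content
    cleaned = re.sub(r"```json\n|\n```", "", str(merged))
    return json.loads(cleaned)


def extract_json_new(llm: BaseChatModel, msg: list[HumanMessage]) -> dict:
    # New pattern: one blocking call; the response object carries .content.
    response = llm.invoke(msg)
    cleaned = re.sub(r"```json\n|\n```", "", str(response.content))
    return json.loads(cleaned)
```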
@@ -117,14 +116,17 @@ def entity_term_extraction_llm(
     logger.debug(
         f"--------{now_end}--{now_end - now_start}--------ENTITY TERM EXTRACTION END---"
     )
+    logger.debug(
+        f"--------{now_end}--------GBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB"
+    )

     return EntityTermExtractionUpdate(
-        entity_retlation_term_extractions=EntityRelationshipTermExtraction(
+        entity_relation_term_extractions=EntityRelationshipTermExtraction(
             entities=entities,
             relationships=relationships,
             terms=terms,
         ),
         log_messages=[
-            f"{now_end} -- Main - ETR Extraction, Time taken: {now_end - now_start}"
+            f"{now_start} -- Main - ETR Extraction, Time taken: {now_end - now_start}"
         ],
     )
@@ -19,9 +19,7 @@ from onyx.agents.agent_search.shared_graph_utils.agent_prompt_ops import (
 )
 from onyx.agents.agent_search.shared_graph_utils.prompts import DEEP_DECOMPOSE_PROMPT
 from onyx.agents.agent_search.shared_graph_utils.utils import dispatch_separated
-from onyx.agents.agent_search.shared_graph_utils.utils import (
-    format_entity_term_extraction,
-)
+from onyx.agents.agent_search.shared_graph_utils.utils import format_docs
 from onyx.agents.agent_search.shared_graph_utils.utils import make_question_id
 from onyx.tools.models import ToolCallKickoff

@@ -52,11 +50,13 @@ def refined_sub_question_creation(
     base_answer = state.initial_answer
     history = build_history_prompt(agent_a_config.prompt_builder)
     # get the entity term extraction dict and properly format it
-    entity_retlation_term_extractions = state.entity_retlation_term_extractions
+    # entity_retlation_term_extractions = state.entity_relation_term_extractions

-    entity_term_extraction_str = format_entity_term_extraction(
-        entity_retlation_term_extractions
-    )
+    # entity_term_extraction_str = format_entity_term_extraction(
+    #     entity_retlation_term_extractions
+    # )

+    docs_str = format_docs(state.all_original_question_documents[:10])
+
     initial_question_answers = state.decomp_answer_results

@@ -73,7 +73,7 @@ def refined_sub_question_creation(
         content=DEEP_DECOMPOSE_PROMPT.format(
             question=question,
             history=history,
-            entity_term_extraction_str=entity_term_extraction_str,
+            docs_str=docs_str,
             base_answer=base_answer,
             answered_sub_questions="\n - ".join(addressed_question_list),
             failed_sub_questions="\n - ".join(failed_question_list),
@@ -117,7 +117,7 @@ class ExpandedRetrievalUpdate(LoggerUpdate):


 class EntityTermExtractionUpdate(LoggerUpdate):
-    entity_retlation_term_extractions: EntityRelationshipTermExtraction = (
+    entity_relation_term_extractions: EntityRelationshipTermExtraction = (
         EntityRelationshipTermExtraction()
     )

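The rename above fixes the misspelled entity_retlation_term_extractions field on the state update. A minimal sketch of the resulting shape, using stand-in Pydantic base models since LoggerUpdate and EntityRelationshipTermExtraction are defined elsewhere; only the field name, annotation, and default mirror the diff, and the element types are assumptions:

```python
# Minimal sketch of the state-update shape after the rename. The base classes
# and the list element types are stand-ins, not the repository's definitions.
from pydantic import BaseModel


class EntityRelationshipTermExtraction(BaseModel):
    entities: list[str] = []
    relationships: list[str] = []
    terms: list[str] = []


class LoggerUpdate(BaseModel):
    log_messages: list[str] = []


class EntityTermExtractionUpdate(LoggerUpdate):
    entity_relation_term_extractions: EntityRelationshipTermExtraction = (
        EntityRelationshipTermExtraction()
    )
```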
@@ -52,7 +52,7 @@ class AgentSearchConfig:
     db_session: Session | None = None

     # Whether to perform initial search to inform decomposition
-    perform_initial_search_path_decision: bool = True
+    # perform_initial_search_path_decision: bool = True

     # Whether to perform initial search to inform decomposition
     perform_initial_search_decomposition: bool = True
@@ -138,7 +138,7 @@ def run_graph(
     input: BasicInput | MainInput_a,
 ) -> AnswerStream:
     # TODO: add these to the environment
-    config.perform_initial_search_path_decision = False
+    # config.perform_initial_search_path_decision = False
     config.perform_initial_search_decomposition = True
     config.allow_refinement = True

@@ -212,7 +212,8 @@ if __name__ == "__main__":
         # query="What are the guiding principles behind the development of cockroachDB",
         # query="What are the temperatures in Munich, Hawaii, and New York?",
         # query="When was Washington born?",
-        query="What is Onyx?",
+        # query="What is Onyx?",
+        query="What is the difference between astronomy and astrology?",
     )
     # Joachim custom persona

@@ -222,7 +223,7 @@ if __name__ == "__main__":
     )
     # search_request.persona = get_persona_by_id(1, None, db_session)
     config.use_persistence = True
-    config.perform_initial_search_path_decision = False
+    # config.perform_initial_search_path_decision = False
     config.perform_initial_search_decomposition = True
     if GRAPH_NAME == "a":
         input = MainInput_a(
@@ -138,7 +138,7 @@ BASE_CHECK_PROMPT = """ \n
 VERIFIER_PROMPT = """
 You are supposed to judge whether a document text contains data or information that is potentially relevant
 for a question. It does not have to be fully relevant, but check whether it has some information that
-could help to address the question.
+would help - possibly in conjunction with other documents - to address the question.

 Here is a document text that you can take as a fact:
 --
@@ -147,8 +147,7 @@ DOCUMENT INFORMATION:
 --

 Do you think that this document text is useful and relevant to answer the following question?
-(Other documents may supply additional information, so do not worry if the provided information
-is not enough to answer the question, but it needs to be relevant to the question.)
 --
 QUESTION:
 {question}
@@ -295,6 +294,92 @@ DEEP_DECOMPOSE_PROMPT = """ \n
 Your role is to generate 2-4 new sub-questions that would help to answer the initial question,
 considering:

+1) The initial question
+2) The initial answer that was found to be unsatisfactory
+3) The sub-questions that were answered
+4) The sub-questions that were suggested but not answered
+5) A sample of the TYPE of documents that may be in the database in order to inform
+you what type of entities, relationships, and terms you may want to consider asking about.
+(But do not build the questions strictly on these documents! They are only examples!
+Take them as illustrations.)
+
+The individual questions should be answerable by a good RAG system.
+So a good idea would be to use the sub-questions to resolve ambiguities and/or to separate the
+question for different entities that may be involved in the original question, but in a way that does
+not duplicate questions that were already tried.
+
+Additional Guidelines:
+- The sub-questions should be specific to the question and provide richer context for the question,
+resolve ambiguities, or address shortcomings of the initial answer
+- Each sub-question - when answered - should be relevant for the answer to the original question
+- The sub-questions should be free from comparisons, ambiguities, judgements, aggregations, or any
+other complications that may require extra context.
+- The sub-questions MUST have the full context of the original question so that they can be executed by
+a RAG system independently without the original question available
+(Example:
+- initial question: "What is the capital of France?"
+- bad sub-question: "What is the name of the river there?"
+- good sub-question: "What is the name of the river that flows through Paris?")
+- For each sub-question, please also provide a search term that can be used to retrieve relevant
+documents from a document store.
+- Consider specifically the sub-questions that were suggested but not answered. This is a sign that they are not
+answerable with the available context, and you should not ask similar questions.
+\n\n
+Here is the initial question:
+\n ------- \n
+{question}
+\n ------- \n
+{history}
+
+Here is the initial sub-optimal answer:
+\n ------- \n
+{base_answer}
+\n ------- \n
+
+Here are the sub-questions that were answered:
+\n ------- \n
+{answered_sub_questions}
+\n ------- \n
+
+Here are the sub-questions that were suggested but not answered:
+\n ------- \n
+{failed_sub_questions}
+\n ------- \n
+
+And here are some reference documents that show you what type of entities, relationships,
+and terms you may want to consider to ask about as relevant to your initial question.
+\n ------- \n
+{docs_str}
+\n ------- \n
+
+Please generate the list of good, fully contextualized sub-questions that would help to address the
+main question.
+
+Specifically pay attention also to the entities, relationships and terms extracted, as these indicate what type of
+objects/relationships/terms you can ask about! Do not ask about entities, terms or relationships that are not
+mentioned in the 'entities, relationships and terms' section.
+
+Again, please find questions that are NOT overlapping too much with the already answered
+sub-questions or those that already were suggested and failed.
+In other words - what can we try in addition to what has been tried so far?
+
+Generate the list of questions separated by one new line like this:
+<sub-question 1>
+<sub-question 2>
+<sub-question 3>
+...
+"""
+
+DEEP_DECOMPOSE_PROMPT_WITH_ENTITIES = """ \n
+An initial user question needs to be answered. An initial answer has been provided but it wasn't quite
+good enough. Also, some sub-questions had been answered and this information has been used to provide
+the initial answer. Some other sub-questions may have been suggested based on little knowledge, but they
+were not directly answerable. Also, some entities, relationships and terms are given to you so that
+you have an idea of what the available data looks like.
+
+Your role is to generate 2-4 new sub-questions that would help to answer the initial question,
+considering:
+
 1) The initial question
 2) The initial answer that was found to be unsatisfactory
 3) The sub-questions that were answered
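The new DEEP_DECOMPOSE_PROMPT asks the model for 2-4 fully contextualized sub-questions, returned one per line, and is filled with the question, history, base answer, answered and failed sub-questions, and a sample of documents. A minimal sketch of filling such a template and splitting the reply back into individual questions; the helpers below are illustrative, not the repository's implementation, and assume the placeholders shown in the prompt:

```python
# Minimal sketch: fill the decomposition prompt and split the newline-separated
# reply into sub-questions. Placeholder names match the prompt above; the
# helpers themselves are illustrative.
def build_decompose_prompt(
    template: str,
    question: str,
    history: str,
    base_answer: str,
    answered: list[str],
    failed: list[str],
    docs_str: str,
) -> str:
    return template.format(
        question=question,
        history=history,
        base_answer=base_answer,
        answered_sub_questions="\n - ".join(answered),
        failed_sub_questions="\n - ".join(failed),
        docs_str=docs_str,
    )


def parse_sub_questions(llm_output: str) -> list[str]:
    # The prompt requests one sub-question per line, so split on newlines
    # and drop empty lines.
    return [line.strip() for line in llm_output.splitlines() if line.strip()]
```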