taking out Extraction for now

This commit is contained in:
joachim-danswer
2025-01-24 15:37:48 -08:00
committed by Evan Lohn
parent fc60fd0322
commit aa8cb44a33
9 changed files with 135 additions and 38 deletions

View File

@@ -27,7 +27,7 @@ def answer_check(state: AnswerQuestionState, config: RunnableConfig) -> QACheckU
return QACheckUpdate(
answer_quality=SUB_CHECK_NO,
log_messages=[
f"{now_end} -- Answer check SQ-{level}-{question_num} - unknown answer, Time taken: {now_end - now_start}"
f"{now_start} -- Answer check SQ-{level}-{question_num} - unknown answer, Time taken: {now_end - now_start}"
],
)
msg = [
@@ -53,7 +53,7 @@ def answer_check(state: AnswerQuestionState, config: RunnableConfig) -> QACheckU
return QACheckUpdate(
answer_quality=quality_str,
log_messages=[
f"""{now_end} -- Answer check SQ-{level}-{question_num} - Answer quality: {quality_str},
f"""{now_start} -- Answer check SQ-{level}-{question_num} - Answer quality: {quality_str},
Time taken: {now_end - now_start}"""
],
)

View File

@@ -32,9 +32,14 @@ from onyx.agents.agent_search.deep_search_a.main.nodes.agent_search_start import
from onyx.agents.agent_search.deep_search_a.main.nodes.answer_comparison import (
answer_comparison,
)
from onyx.agents.agent_search.deep_search_a.main.nodes.entity_term_extraction_llm import (
entity_term_extraction_llm,
)
from onyx.agents.agent_search.deep_search_a.main.nodes.direct_llm_handling import (
direct_llm_handling,
)
from onyx.agents.agent_search.deep_search_a.main.nodes.generate_initial_answer import (
generate_initial_answer,
)
@@ -197,10 +202,10 @@ def main_graph_builder(test_mode: bool = False) -> StateGraph:
action=initial_answer_quality_check,
)
graph.add_node(
node="entity_term_extraction_llm",
action=entity_term_extraction_llm,
)
# graph.add_node(
# node="entity_term_extraction_llm",
# action=entity_term_extraction_llm,
# )
graph.add_node(
node="refined_answer_decision",
action=refined_answer_decision,
@@ -259,10 +264,10 @@ def main_graph_builder(test_mode: bool = False) -> StateGraph:
end_key="base_raw_search_subgraph",
)
graph.add_edge(
start_key="agent_search_start",
end_key="entity_term_extraction_llm",
)
# graph.add_edge(
# start_key="agent_search_start",
# end_key="entity_term_extraction_llm",
# )
graph.add_edge(
start_key="agent_search_start",
@@ -319,8 +324,12 @@ def main_graph_builder(test_mode: bool = False) -> StateGraph:
end_key="initial_answer_quality_check",
)
# graph.add_edge(
# start_key=["initial_answer_quality_check", "entity_term_extraction_llm"],
# end_key="refined_answer_decision",
# )
graph.add_edge(
start_key=["initial_answer_quality_check", "entity_term_extraction_llm"],
start_key="initial_answer_quality_check",
end_key="refined_answer_decision",
)

View File

@@ -60,6 +60,6 @@ def agent_search_start(
return ExploratorySearchUpdate(
exploratory_search_results=exploratory_search_results,
log_messages=[
f"--------{now_end}--{now_end - now_start}--------EXPLORATORY SEARCH END---"
f"{now_start} -- Main - Exploratory Search, Time taken: {now_end - now_start}"
],
)

View File

@@ -4,7 +4,6 @@ from datetime import datetime
from typing import cast
from langchain_core.messages import HumanMessage
from langchain_core.messages import merge_message_runs
from langchain_core.runnables import RunnableConfig
from onyx.agents.agent_search.deep_search_a.main.operations import logger
@@ -32,12 +31,15 @@ def entity_term_extraction_llm(
now_start = datetime.now()
logger.debug(f"--------{now_start}--------GENERATE ENTITIES & TERMS---")
logger.debug(
f"--------{now_start}--------GAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"
)
agent_a_config = cast(AgentSearchConfig, config["metadata"]["config"])
if not agent_a_config.allow_refinement:
now_end = datetime.now()
return EntityTermExtractionUpdate(
entity_retlation_term_extractions=EntityRelationshipTermExtraction(
entity_relation_term_extractions=EntityRelationshipTermExtraction(
entities=[],
relationships=[],
terms=[],
@@ -64,14 +66,11 @@ def entity_term_extraction_llm(
]
fast_llm = agent_a_config.fast_llm
# Grader
llm_response_list = list(
fast_llm.stream(
prompt=msg,
)
llm_response = fast_llm.invoke(
prompt=msg,
)
llm_response = merge_message_runs(llm_response_list, chunk_separator="")[0].content
cleaned_response = re.sub(r"```json\n|\n```", "", llm_response)
cleaned_response = re.sub(r"```json\n|\n```", "", str(llm_response.content))
parsed_response = json.loads(cleaned_response)
entities = []
@@ -117,14 +116,17 @@ def entity_term_extraction_llm(
logger.debug(
f"--------{now_end}--{now_end - now_start}--------ENTITY TERM EXTRACTION END---"
)
logger.debug(
f"--------{now_end}--------GBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB"
)
return EntityTermExtractionUpdate(
entity_retlation_term_extractions=EntityRelationshipTermExtraction(
entity_relation_term_extractions=EntityRelationshipTermExtraction(
entities=entities,
relationships=relationships,
terms=terms,
),
log_messages=[
f"{now_end} -- Main - ETR Extraction, Time taken: {now_end - now_start}"
f"{now_start} -- Main - ETR Extraction, Time taken: {now_end - now_start}"
],
)

View File

@@ -19,9 +19,7 @@ from onyx.agents.agent_search.shared_graph_utils.agent_prompt_ops import (
)
from onyx.agents.agent_search.shared_graph_utils.prompts import DEEP_DECOMPOSE_PROMPT
from onyx.agents.agent_search.shared_graph_utils.utils import dispatch_separated
from onyx.agents.agent_search.shared_graph_utils.utils import (
format_entity_term_extraction,
)
from onyx.agents.agent_search.shared_graph_utils.utils import format_docs
from onyx.agents.agent_search.shared_graph_utils.utils import make_question_id
from onyx.tools.models import ToolCallKickoff
@@ -52,11 +50,13 @@ def refined_sub_question_creation(
base_answer = state.initial_answer
history = build_history_prompt(agent_a_config.prompt_builder)
# get the entity term extraction dict and properly format it
entity_retlation_term_extractions = state.entity_retlation_term_extractions
# entity_retlation_term_extractions = state.entity_relation_term_extractions
entity_term_extraction_str = format_entity_term_extraction(
entity_retlation_term_extractions
)
# entity_term_extraction_str = format_entity_term_extraction(
# entity_retlation_term_extractions
# )
docs_str = format_docs(state.all_original_question_documents[:10])
initial_question_answers = state.decomp_answer_results
@@ -73,7 +73,7 @@ def refined_sub_question_creation(
content=DEEP_DECOMPOSE_PROMPT.format(
question=question,
history=history,
entity_term_extraction_str=entity_term_extraction_str,
docs_str=docs_str,
base_answer=base_answer,
answered_sub_questions="\n - ".join(addressed_question_list),
failed_sub_questions="\n - ".join(failed_question_list),

View File

@@ -117,7 +117,7 @@ class ExpandedRetrievalUpdate(LoggerUpdate):
class EntityTermExtractionUpdate(LoggerUpdate):
entity_retlation_term_extractions: EntityRelationshipTermExtraction = (
entity_relation_term_extractions: EntityRelationshipTermExtraction = (
EntityRelationshipTermExtraction()
)

View File

@@ -52,7 +52,7 @@ class AgentSearchConfig:
db_session: Session | None = None
# Whether to perform initial search to inform decomposition
perform_initial_search_path_decision: bool = True
# perform_initial_search_path_decision: bool = True
# Whether to perform initial search to inform decomposition
perform_initial_search_decomposition: bool = True

View File

@@ -138,7 +138,7 @@ def run_graph(
input: BasicInput | MainInput_a,
) -> AnswerStream:
# TODO: add these to the environment
config.perform_initial_search_path_decision = False
# config.perform_initial_search_path_decision = False
config.perform_initial_search_decomposition = True
config.allow_refinement = True
@@ -212,7 +212,8 @@ if __name__ == "__main__":
# query="What are the guiding principles behind the development of cockroachDB",
# query="What are the temperatures in Munich, Hawaii, and New York?",
# query="When was Washington born?",
query="What is Onyx?",
# query="What is Onyx?",
query="What is the difference between astronomy and astrology?",
)
# Joachim custom persona
@@ -222,7 +223,7 @@ if __name__ == "__main__":
)
# search_request.persona = get_persona_by_id(1, None, db_session)
config.use_persistence = True
config.perform_initial_search_path_decision = False
# config.perform_initial_search_path_decision = False
config.perform_initial_search_decomposition = True
if GRAPH_NAME == "a":
input = MainInput_a(

View File

@@ -138,7 +138,7 @@ BASE_CHECK_PROMPT = """ \n
VERIFIER_PROMPT = """
You are supposed to judge whether a document text contains data or information that is potentially relevant
for a question. It does not have to be fully relevant, but check whether it has some information that
could help to address the question.
would help - possibly in conjunction with other documents - to address the question.
Here is a document text that you can take as a fact:
--
@@ -147,8 +147,7 @@ DOCUMENT INFORMATION:
--
Do you think that this document text is useful and relevant to answer the following question?
(Other documents may supply additional information, so do not worry if the provided information
is not enough to answer the question, but it needs to be relevant to the question.)
--
QUESTION:
{question}
@@ -295,6 +294,92 @@ DEEP_DECOMPOSE_PROMPT = """ \n
Your role is to generate 2-4 new sub-questions that would help to answer the initial question,
considering:
1) The initial question
2) The initial answer that was found to be unsatisfactory
3) The sub-questions that were answered
4) The sub-questions that were suggested but not answered
5) A sample of the TYPE of documents that may be in the database in order to inform
you what type of entities, relationships, and terms you may want to consider asking about.
(But do not build the questions strictly on these documents! They are only examples!
Take them as illustrations.)
The individual questions should be answerable by a good RAG system.
So a good idea would be to use the sub-questions to resolve ambiguities and/or to separate the
question for different entities that may be involved in the original question, but in a way that does
not duplicate questions that were already tried.
Additional Guidelines:
- The sub-questions should be specific to the question and provide richer context for the question,
resolve ambiguities, or address shortcomings of the initial answer
- Each sub-question - when answered - should be relevant for the answer to the original question
- The sub-questions should be free from comparisons, ambiguities, judgements, aggregations, or any
other complications that may require extra context.
- The sub-questions MUST have the full context of the original question so that it can be executed by
a RAG system independently without the original question available
(Example:
- initial question: "What is the capital of France?"
- bad sub-question: "What is the name of the river there?"
- good sub-question: "What is the name of the river that flows through Paris?"
- For each sub-question, please also provide a search term that can be used to retrieve relevant
documents from a document store.
- Consider specifically the sub-questions that were suggested but not answered. This is a sign that they are not
answerable with the available context, and you should not ask similar questions.
\n\n
Here is the initial question:
\n ------- \n
{question}
\n ------- \n
{history}
Here is the initial sub-optimal answer:
\n ------- \n
{base_answer}
\n ------- \n
Here are the sub-questions that were answered:
\n ------- \n
{answered_sub_questions}
\n ------- \n
Here are the sub-questions that were suggested but not answered:
\n ------- \n
{failed_sub_questions}
\n ------- \n
And here are some reference documents that show you what type of entities, relationships,
and terms you may want to consider to ask about as relevant to your initial question.
\n ------- \n
{docs_str}
\n ------- \n
Please generate the list of good, fully contextualized sub-questions that would help to address the
main question.
Specifically pay attention also to the entities, relationships and terms extracted, as these indicate what type of
objects/relationships/terms you can ask about! Do not ask about entities, terms or relationships that are not
mentioned in the 'entities, relationships and terms' section.
Again, please find questions that are NOT overlapping too much with the already answered
sub-questions or those that already were suggested and failed.
In other words - what can we try in addition to what has been tried so far?
Generate the list of questions separated by one new line like this:
<sub-question 1>
<sub-question 2>
<sub-question 3>
...
"""
DEEP_DECOMPOSE_PROMPT_WITH_ENTITIES = """ \n
An initial user question needs to be answered. An initial answer has been provided but it wasn't quite
good enough. Also, some sub-questions had been answered and this information has been used to provide
the initial answer. Some other sub-questions may have been suggested based on little knowledge, but they
were not directly answerable. Also, some entities, relationships and terms are given to you so that
you have an idea of what the available data looks like.
Your role is to generate 2-4 new sub-questions that would help to answer the initial question,
considering:
1) The initial question
2) The initial answer that was found to be unsatisfactory
3) The sub-questions that were answered