Provide Metadata to the LLM (#740)

Yuhong Sun committed 2023-11-19 12:28:45 -08:00 (committed by GitHub)
parent 6fb07d20cc
commit f72825cd46
5 changed files with 48 additions and 15 deletions


@@ -65,6 +65,38 @@ class QAHandler(abc.ABC):
)
+# Maps connector enum string to a more natural language representation for the LLM
+# If not on the list, uses the original but slightly cleaned up, see below
+CONNECTOR_NAME_MAP = {
+    "web": "Website",
+    "requesttracker": "Request Tracker",
+    "github": "GitHub",
+    "file": "File Upload",
+}
+
+
+def clean_up_source(source_str: str) -> str:
+    if source_str in CONNECTOR_NAME_MAP:
+        return CONNECTOR_NAME_MAP[source_str]
+    return source_str.replace("_", " ").title()
+
+
+def build_context_str(
+    context_chunks: list[InferenceChunk],
+    include_metadata: bool = True,
+) -> str:
+    context = ""
+    for chunk in context_chunks:
+        if include_metadata:
+            context += f"NEW DOCUMENT: {chunk.semantic_identifier}\n"
+            context += f"Source: {clean_up_source(chunk.source_type)}\n"
+            if chunk.updated_at:
+                update_str = chunk.updated_at.strftime("%B %d, %Y %H:%M")
+                context += f"Updated: {update_str}\n"
+        context += f"{CODE_BLOCK_PAT.format(chunk.content.strip())}\n\n\n"
+    return context.strip()


class WeakLLMQAHandler(QAHandler):
    """Since Danswer supports a variety of LLMs, this less demanding prompt is provided
    as an option to use with weaker LLMs such as small version, low float precision, quantized,
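
Worth pinning down is the output shape of the new `build_context_str`. The sketch below is hypothetical: a `FakeChunk` dataclass stands in for `InferenceChunk`, and the sample document values are invented; only `clean_up_source`, `CODE_BLOCK_PAT`, and the header layout come from the diff above.

from dataclasses import dataclass
from datetime import datetime

CODE_BLOCK_PAT = "```\n{}\n```"  # as defined in the constants file further down

CONNECTOR_NAME_MAP = {
    "web": "Website",
    "requesttracker": "Request Tracker",
    "github": "GitHub",
    "file": "File Upload",
}


def clean_up_source(source_str: str) -> str:
    if source_str in CONNECTOR_NAME_MAP:
        return CONNECTOR_NAME_MAP[source_str]
    return source_str.replace("_", " ").title()


@dataclass
class FakeChunk:  # hypothetical stand-in for InferenceChunk
    semantic_identifier: str
    source_type: str
    updated_at: datetime | None
    content: str


chunk = FakeChunk(
    semantic_identifier="Onboarding Guide",
    source_type="google_drive",  # not in the map -> title-cased to "Google Drive"
    updated_at=datetime(2023, 11, 1, 9, 30),
    content="Step 1: request access to the wiki...",
)

context = (
    f"NEW DOCUMENT: {chunk.semantic_identifier}\n"
    f"Source: {clean_up_source(chunk.source_type)}\n"
    f"Updated: {chunk.updated_at.strftime('%B %d, %Y %H:%M')}\n"
    f"{CODE_BLOCK_PAT.format(chunk.content.strip())}"
)
print(context)
# NEW DOCUMENT: Onboarding Guide
# Source: Google Drive
# Updated: November 01, 2023 09:30
# ```
# Step 1: request access to the wiki...
# ```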
@@ -95,9 +127,7 @@ class SingleMessageQAHandler(QAHandler):
        context_chunks: list[InferenceChunk],
        use_language_hint: bool = bool(MULTILINGUAL_QUERY_EXPANSION),
    ) -> list[BaseMessage]:
-        context_docs_str = "\n".join(
-            f"\n{CODE_BLOCK_PAT.format(c.content)}\n" for c in context_chunks
-        )
+        context_docs_str = build_context_str(context_chunks)

        single_message = JSON_PROMPT.format(
            context_docs_str=context_docs_str,
@@ -123,9 +153,7 @@ class SingleMessageScratchpadHandler(QAHandler):
        context_chunks: list[InferenceChunk],
        use_language_hint: bool = bool(MULTILINGUAL_QUERY_EXPANSION),
    ) -> list[BaseMessage]:
-        context_docs_str = "\n".join(
-            f"\n{CODE_BLOCK_PAT.format(c.content)}\n" for c in context_chunks
-        )
+        context_docs_str = build_context_str(context_chunks)

        single_message = COT_PROMPT.format(
            context_docs_str=context_docs_str,


@@ -349,7 +349,8 @@ def get_chunks_for_qa(
        # We calculate it live in case the user uses a different LLM + tokenizer
        chunk_token = check_number_of_tokens(chunk.content)
-        token_count += chunk_token
+        # 50 for an approximate/slight overestimate for # tokens for metadata for the chunk
+        token_count += chunk_token + 50

        # Always use at least 1 chunk
        if token_count <= token_limit or not latest_batch_indices:
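
A note on the budget math: the flat +50 is a deliberate overestimate so the metadata header added per chunk can never push the assembled context past the token limit. A minimal sketch of the selection behavior, assuming a hypothetical `count_tokens` helper in place of `check_number_of_tokens`:

def select_chunks_for_qa(chunk_texts: list[str], token_limit: int = 512) -> list[str]:
    """Sketch of the greedy selection: stop once the budget is spent,
    but always keep at least one chunk, mirroring the diff above."""

    def count_tokens(text: str) -> int:
        # hypothetical stand-in for check_number_of_tokens;
        # a whitespace split is only a rough approximation
        return len(text.split())

    selected: list[str] = []
    token_count = 0
    for text in chunk_texts:
        # +50 approximates the metadata header prepended to each chunk
        token_count += count_tokens(text) + 50
        if token_count > token_limit and selected:
            break
        selected.append(text)
    return selected


print(select_chunks_for_qa(["alpha " * 300, "beta " * 300]))  # only the first fits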


@@ -80,7 +80,7 @@ class DocMetadataAwareIndexChunk(IndexChunk):
@dataclass
class InferenceChunk(BaseChunk):
    document_id: str
-    source_type: str
+    source_type: str  # This is the string value of the enum already like "web"
    semantic_identifier: str
    boost: int
    recency_bias: float
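
The added comment records a contract rather than a behavior change: by the time a chunk is read back out of the index, `source_type` already holds the enum's string value, which is exactly what `clean_up_source` above expects. A tiny illustration, with a hypothetical `DocumentSource` standing in for the real enum:

from enum import Enum


class DocumentSource(str, Enum):  # hypothetical stand-in for the real enum
    WEB = "web"
    GITHUB = "github"


# What gets persisted at indexing time is the value, so the InferenceChunk
# rebuilt from an index hit carries the plain string "web".
source_type = DocumentSource.WEB.value
assert source_type == "web"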


@@ -1,4 +1,4 @@
GENERAL_SEP_PAT = "-----"
GENERAL_SEP_PAT = "--------------" # Same length as Langchain's separator
CODE_BLOCK_PAT = "```\n{}\n```"
QUESTION_PAT = "Query:"
THOUGHT_PAT = "Thought:"
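
Two small things to note in this constants file: the widened separator matches the length of the separator LangChain emits (per the comment above), and `CODE_BLOCK_PAT` is the fence wrapper that `build_context_str` applies to every chunk body. A quick check of both patterns:

GENERAL_SEP_PAT = "--------------"  # 14 dashes after this commit
CODE_BLOCK_PAT = "```\n{}\n```"

assert len(GENERAL_SEP_PAT) == 14
print(CODE_BLOCK_PAT.format("chunk body goes here"))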


@@ -49,15 +49,17 @@ ANSWER_NOT_FOUND_RESPONSE = f'{{"answer": "{UNCERTAINTY_PAT}", "quotes": []}}'
JSON_PROMPT = f"""
{QA_HEADER}
{REQUIRE_JSON}
{GENERAL_SEP_PAT}
CONTEXT:
{GENERAL_SEP_PAT}
{{context_docs_str}}
{GENERAL_SEP_PAT}
SAMPLE_RESPONSE:
```
{{{json.dumps(EMPTY_SAMPLE_JSON)}}}
```
-{QUESTION_PAT} {{user_query}}
+{QUESTION_PAT.upper()} {{user_query}}
{JSON_HELPFUL_HINT}
{{language_hint_or_none}}
""".strip()
@@ -68,10 +70,12 @@ SAMPLE_RESPONSE:
# COT (chain-of-thought) flow basically
COT_PROMPT = f"""
{QA_HEADER}
{GENERAL_SEP_PAT}
CONTEXT:
{GENERAL_SEP_PAT}
{{context_docs_str}}
{GENERAL_SEP_PAT}
You MUST respond in the following format:
```
{THOUGHT_PAT} Use this section as a scratchpad to reason through the answer.
@@ -79,7 +83,7 @@ You MUST respond in the following format:
{{{json.dumps(EMPTY_SAMPLE_JSON)}}}
```
-{QUESTION_PAT} {{user_query}}
+{QUESTION_PAT.upper()} {{user_query}}
{JSON_HELPFUL_HINT}
{{language_hint_or_none}}
""".strip()
@@ -96,8 +100,8 @@ Answer the user query below based on the reference document above.
Respond with an "{ANSWER_PAT}" section and as many "{QUOTE_PAT}" sections as needed to support \
the answer.'
-{QUESTION_PAT} {{user_query}}
-{ANSWER_PAT}
+{QUESTION_PAT.upper()} {{user_query}}
+{ANSWER_PAT.upper()}
""".strip()