Document explorer admin page (#590)

2025-09-26 20:08:38 +02:00 · 2023-10-18 18:41:39 -07:00
parent a5d2759fbc
commit 1bd76f528f
20 changed files with 447 additions and 89 deletions
--- a/backend/danswer/chunking/models.py
+++ b/backend/danswer/chunking/models.py
@@ -93,6 +93,7 @@ class InferenceChunk(BaseChunk):
    semantic_identifier: str
    boost: int
    score: float | None
+    hidden: bool
    metadata: dict[str, Any]
    # Matched sections in the chunk. Uses Vespa syntax e.g. <hi>TEXT</hi>
    # to specify that a set of words should be highlighted. For example:
--- a/backend/danswer/configs/constants.py
+++ b/backend/danswer/configs/constants.py
@@ -8,6 +8,7 @@ SOURCE_TYPE = "source_type"
 SOURCE_LINKS = "source_links"
 SOURCE_LINK = "link"
 SEMANTIC_IDENTIFIER = "semantic_identifier"
+TITLE = "title"
 SECTION_CONTINUATION = "section_continuation"
 EMBEDDINGS = "embeddings"
 ALLOWED_USERS = "allowed_users"
--- a/backend/danswer/connectors/models.py
+++ b/backend/danswer/connectors/models.py
@@ -24,8 +24,14 @@ class Document:
    id: str  # This must be unique or during indexing/reindexing, chunks will be overwritten
    sections: list[Section]
    source: DocumentSource
-    semantic_identifier: str
+    semantic_identifier: str  # displayed in the UI as the main identifier for the doc
    metadata: dict[str, Any]
+    # `title` is used when computing best matches for a query
+    # if `None`, then we will use the `semantic_identifier` as the title in Vespa
+    title: str | None = None
+
+    def get_title_for_document_index(self) -> str:
+        return self.semantic_identifier if self.title is None else self.title

    def to_short_descriptor(self) -> str:
        """Used when logging the identity of a document"""
--- a/backend/danswer/connectors/slack/connector.py
+++ b/backend/danswer/connectors/slack/connector.py
@@ -148,6 +148,7 @@ def thread_to_doc(
        ],
        source=DocumentSource.SLACK,
        semantic_identifier=channel["name"],
+        title="",  # slack docs don't really have a "title"
        metadata={},
    )

@@ -302,6 +303,7 @@ class SlackLoadConnector(LoadConnector):
                    ],
                    source=matching_doc.source,
                    semantic_identifier=matching_doc.semantic_identifier,
+                    title="",  # slack docs don't really have a "title"
                    metadata=matching_doc.metadata,
                )

@@ -319,6 +321,7 @@ class SlackLoadConnector(LoadConnector):
                ],
                source=DocumentSource.SLACK,
                semantic_identifier=channel["name"],
+                title="",  # slack docs don't really have a "title"
                metadata={},
            )

--- a/backend/danswer/datastores/vespa/app_config/schemas/danswer_chunk.sd
+++ b/backend/danswer/datastores/vespa/app_config/schemas/danswer_chunk.sd
@@ -37,9 +37,21 @@ schema danswer_chunk {
        field source_links type string {
            indexing: summary | attribute
        }
+        # displayed in the UI as the main identifier for the doc
        field semantic_identifier type string {
            indexing: summary | attribute
        }
+        # this is used when computing best matches based on the title of the document.
+        # It may not always match the `semantic_identifier`, e.g. for Slack docs the
+        # `semantic_identifier` will be the channel name, but the `title` will be empty
+        field title type string {
+            indexing: summary | index
+            match {
+                gram
+                gram-size: 3
+            }
+            index: enable-bm25
+        }
        field section_continuation type bool {
            indexing: summary | attribute
        }
@@ -70,7 +82,7 @@ schema danswer_chunk {
    }

    fieldset default {
-        fields: content
+        fields: content, title
    }

    rank-profile keyword_search inherits default {
@@ -103,4 +115,11 @@ schema danswer_chunk {
        }
        match-features: closest(embeddings)
    }
+
+    # used when searching from the admin UI for a specific doc to hide / boost
+    rank-profile admin_search inherits default {
+        first-phase {
+            expression: bm25(content) + (100 * bm25(title))
+        }
+    }
 }
--- a/backend/danswer/datastores/vespa/store.py
+++ b/backend/danswer/datastores/vespa/store.py
@@ -38,6 +38,7 @@ from danswer.configs.constants import SECTION_CONTINUATION
 from danswer.configs.constants import SEMANTIC_IDENTIFIER
 from danswer.configs.constants import SOURCE_LINKS
 from danswer.configs.constants import SOURCE_TYPE
+from danswer.configs.constants import TITLE
 from danswer.configs.model_configs import SEARCH_DISTANCE_CUTOFF
 from danswer.datastores.datastore_utils import get_uuid_from_chunk
 from danswer.datastores.interfaces import DocumentIndex
@@ -166,6 +167,7 @@ def _index_vespa_chunk(
        SOURCE_TYPE: str(document.source.value),
        SOURCE_LINKS: json.dumps(chunk.source_links),
        SEMANTIC_IDENTIFIER: document.semantic_identifier,
+        TITLE: document.get_title_for_document_index(),
        SECTION_CONTINUATION: chunk.section_continuation,
        METADATA: json.dumps(document.metadata),
        EMBEDDINGS: embeddings_name_vector_map,
@@ -264,7 +266,9 @@ def _index_vespa_chunks(
    return insertion_records


-def _build_vespa_filters(filters: list[IndexFilter] | None) -> str:
+def _build_vespa_filters(
+    filters: list[IndexFilter] | None, include_hidden: bool = False
+) -> str:
    # NOTE: permissions filters are expected to be passed in directly via
    # the `filters` arg, which is why they are not considered explicitly here

@@ -272,8 +276,9 @@ def _build_vespa_filters(filters: list[IndexFilter] | None) -> str:
    # via the `filters` arg. These are set either in the Web UI or in the Slack
    # listener

-    # ignore hidden docs
-    filter_str = f"!({HIDDEN}=true) and "
+    # hidden docs are excluded unless `include_hidden` is set. The admin panel
+    # sets it so that hidden docs can still be found and un-hidden
+    filter_str = "" if include_hidden else f"!({HIDDEN}=true) and "

    # Handle provided query filters
    if filters:
@@ -389,6 +394,7 @@ class VespaIndex(DocumentIndex):
        f"{SEMANTIC_IDENTIFIER}, "
        f"{SECTION_CONTINUATION}, "
        f"{BOOST}, "
+        f"{HIDDEN}, "
        f"{METADATA} "
        f"{CONTENT_SUMMARY} "
        f"from {DOCUMENT_INDEX_NAME} where "
@@ -604,3 +610,32 @@ class VespaIndex(DocumentIndex):
        }

        return _query_vespa(params)
+
+    def admin_retrieval(
+        self,
+        query: str,
+        user_id: UUID | None,
+        filters: list[IndexFilter] | None,
+        num_to_retrieve: int = NUM_RETURNED_HITS,
+    ) -> list[InferenceChunk]:
+        vespa_where_clauses = _build_vespa_filters(filters, include_hidden=True)
+        yql = (
+            VespaIndex.yql_base
+            + vespa_where_clauses
+            + '({grammar: "weakAnd"}userInput(@query) '
+            # `({defaultIndex: "content_summary"}userInput(@query))` section is
+            # needed for highlighting while the N-gram highlighting is broken /
+            # not working as desired
+            + f'or ({{defaultIndex: "{CONTENT_SUMMARY}"}}userInput(@query)))'
+            + _build_vespa_limit(num_to_retrieve)
+        )
+
+        params: dict[str, str | int] = {
+            "yql": yql,
+            "query": query,
+            "hits": num_to_retrieve,
+            "num_to_rerank": 10 * num_to_retrieve,
+            "ranking.profile": "admin_search",
+        }
+
+        return _query_vespa(params)
--- a/backend/danswer/search/semantic_search.py
+++ b/backend/danswer/search/semantic_search.py
@@ -49,6 +49,7 @@ def chunks_to_search_docs(chunks: list[InferenceChunk] | None) -> list[SearchDoc
                blurb=chunk.blurb,
                source_type=chunk.source_type,
                boost=chunk.boost,
+                hidden=chunk.hidden,
                score=chunk.score,
                match_highlights=chunk.match_highlights,
            )
--- a/backend/danswer/server/models.py
+++ b/backend/danswer/server/models.py
@@ -147,6 +147,10 @@ class SearchDoc(BaseModel):
    blurb: str
    source_type: str
    boost: int
+    # whether the document is hidden when doing a standard search.
+    # Since a standard search will never find a hidden doc, this can only ever
+    # be `True` when doing an admin search
+    hidden: bool
    score: float | None
    # Matched sections in the doc. Uses Vespa syntax e.g. <hi>TEXT</hi>
    # to specify that a set of words should be highlighted. For example:
--- a/backend/danswer/server/search_backend.py
+++ b/backend/danswer/server/search_backend.py
@@ -2,15 +2,20 @@ from collections.abc import Generator

 from fastapi import APIRouter
 from fastapi import Depends
+from fastapi import HTTPException
 from fastapi.responses import StreamingResponse
+from pydantic import BaseModel
 from sqlalchemy.orm import Session

+from danswer.auth.users import current_admin_user
 from danswer.auth.users import current_user
 from danswer.chunking.models import InferenceChunk
 from danswer.configs.app_configs import DISABLE_GENERATIVE_AI
 from danswer.configs.app_configs import NUM_DOCUMENT_TOKENS_FED_TO_GENERATIVE_MODEL
 from danswer.configs.constants import IGNORE_FOR_QA
 from danswer.datastores.document_index import get_default_document_index
+from danswer.datastores.interfaces import IndexFilter
+from danswer.datastores.vespa.store import VespaIndex
 from danswer.db.engine import get_session
 from danswer.db.feedback import create_doc_retrieval_feedback
 from danswer.db.feedback import create_query_event
@@ -38,6 +43,7 @@ from danswer.server.models import QAResponse
 from danswer.server.models import QueryValidationResponse
 from danswer.server.models import QuestionRequest
 from danswer.server.models import RerankedRetrievalDocs
+from danswer.server.models import SearchDoc
 from danswer.server.models import SearchFeedbackRequest
 from danswer.server.models import SearchResponse
 from danswer.server.utils import get_json_line
@@ -49,6 +55,57 @@ logger = setup_logger()
 router = APIRouter()


+"""Admin-only search endpoints"""
+
+
+class AdminSearchRequest(BaseModel):
+    query: str
+    filters: list[IndexFilter] | None = None
+
+
+class AdminSearchResponse(BaseModel):
+    documents: list[SearchDoc]
+
+
+@router.post("/admin/search")
+def admin_search(
+    question: AdminSearchRequest,
+    user: User | None = Depends(current_admin_user),
+    db_session: Session = Depends(get_session),
+) -> AdminSearchResponse:
+    query = question.query
+    filters = question.filters
+    logger.info(f"Received admin search query: {query}")
+
+    user_id = None if user is None else user.id
+    user_acl_filters = build_access_filters_for_user(user, db_session)
+    final_filters = (filters or []) + user_acl_filters
+    document_index = get_default_document_index()
+    if not isinstance(document_index, VespaIndex):
+        raise HTTPException(
+            status_code=400,
+            detail="Cannot use admin-search when using a non-Vespa document index",
+        )
+
+    matching_chunks = document_index.admin_retrieval(
+        query=query, user_id=user_id, filters=final_filters
+    )
+
+    documents = chunks_to_search_docs(matching_chunks)
+
+    # deduplicate documents by id
+    deduplicated_documents: list[SearchDoc] = []
+    seen_documents: set[str] = set()
+    for document in documents:
+        if document.document_id not in seen_documents:
+            deduplicated_documents.append(document)
+            seen_documents.add(document.document_id)
+    return AdminSearchResponse(documents=deduplicated_documents)
+
+
+"""Search endpoints for all"""
+
+
@router.post("/search-intent")
 def get_search_type(
    question: QuestionRequest, _: User = Depends(current_user)
--- a/backend/tests/unit/danswer/direct_qa/test_qa_utils.py
+++ b/backend/tests/unit/danswer/direct_qa/test_qa_utils.py
@@ -113,6 +113,7 @@ class TestQAPostprocessing(unittest.TestCase):
            semantic_identifier="anything",
            section_continuation=False,
            boost=0,
+            hidden=False,
            score=1,
            metadata={},
            match_highlights=[],
@@ -127,6 +128,7 @@ class TestQAPostprocessing(unittest.TestCase):
            semantic_identifier="whatever",
            section_continuation=False,
            boost=0,
+            hidden=False,
            score=1,
            metadata={},
            match_highlights=[],