From 1bd76f528f5cc6dd7feb19d4d17b4976fbbb57cd Mon Sep 17 00:00:00 2001 From: Chris Weaver <25087905+Weves@users.noreply.github.com> Date: Wed, 18 Oct 2023 18:41:39 -0700 Subject: [PATCH] Document explorer admin page (#590) --- backend/danswer/chunking/models.py | 1 + backend/danswer/configs/constants.py | 1 + backend/danswer/connectors/models.py | 8 +- backend/danswer/connectors/slack/connector.py | 3 + .../vespa/app_config/schemas/danswer_chunk.sd | 21 +- backend/danswer/datastores/vespa/store.py | 41 +++- backend/danswer/search/semantic_search.py | 1 + backend/danswer/server/models.py | 4 + backend/danswer/server/search_backend.py | 57 +++++ .../unit/danswer/direct_qa/test_qa_utils.py | 2 + web/src/app/admin/documents/ScoreEditor.tsx | 89 ++++++++ web/src/app/admin/documents/explorer/lib.ts | 12 ++ web/src/app/admin/documents/explorer/page.tsx | 195 ++++++++++++++++++ .../feedback/DocumentFeedbackTable.tsx | 85 +------- .../app/admin/documents/{feedback => }/lib.ts | 0 web/src/components/CustomCheckbox.tsx | 1 + web/src/components/admin/Layout.tsx | 10 + web/src/components/search/DocumentDisplay.tsx | 2 +- web/src/components/search/SearchBar.tsx | 2 +- web/src/lib/search/interfaces.ts | 1 + 20 files changed, 447 insertions(+), 89 deletions(-) create mode 100644 web/src/app/admin/documents/ScoreEditor.tsx create mode 100644 web/src/app/admin/documents/explorer/lib.ts create mode 100644 web/src/app/admin/documents/explorer/page.tsx rename web/src/app/admin/documents/{feedback => }/lib.ts (100%) diff --git a/backend/danswer/chunking/models.py b/backend/danswer/chunking/models.py index cf8337284..71de59adb 100644 --- a/backend/danswer/chunking/models.py +++ b/backend/danswer/chunking/models.py @@ -93,6 +93,7 @@ class InferenceChunk(BaseChunk): semantic_identifier: str boost: int score: float | None + hidden: bool metadata: dict[str, Any] # Matched sections in the chunk. Uses Vespa syntax e.g. TEXT # to specify that a set of words should be highlighted. 
For example: diff --git a/backend/danswer/configs/constants.py b/backend/danswer/configs/constants.py index e5330cce4..1e338a528 100644 --- a/backend/danswer/configs/constants.py +++ b/backend/danswer/configs/constants.py @@ -8,6 +8,7 @@ SOURCE_TYPE = "source_type" SOURCE_LINKS = "source_links" SOURCE_LINK = "link" SEMANTIC_IDENTIFIER = "semantic_identifier" +TITLE = "title" SECTION_CONTINUATION = "section_continuation" EMBEDDINGS = "embeddings" ALLOWED_USERS = "allowed_users" diff --git a/backend/danswer/connectors/models.py b/backend/danswer/connectors/models.py index 65213fc95..41fc26060 100644 --- a/backend/danswer/connectors/models.py +++ b/backend/danswer/connectors/models.py @@ -24,8 +24,14 @@ class Document: id: str # This must be unique or during indexing/reindexing, chunks will be overwritten sections: list[Section] source: DocumentSource - semantic_identifier: str + semantic_identifier: str # displayed in the UI as the main identifier for the doc metadata: dict[str, Any] + # `title` is used when computing best matches for a query + # if `None`, then we will use the `semantic_identifier` as the title in Vespa + title: str | None = None + + def get_title_for_document_index(self) -> str: + return self.semantic_identifier if self.title is None else self.title def to_short_descriptor(self) -> str: """Used when logging the identity of a document""" diff --git a/backend/danswer/connectors/slack/connector.py b/backend/danswer/connectors/slack/connector.py index ed4c46e0e..53e0fee5a 100644 --- a/backend/danswer/connectors/slack/connector.py +++ b/backend/danswer/connectors/slack/connector.py @@ -148,6 +148,7 @@ def thread_to_doc( ], source=DocumentSource.SLACK, semantic_identifier=channel["name"], + title="", # slack docs don't really have a "title" metadata={}, ) @@ -302,6 +303,7 @@ class SlackLoadConnector(LoadConnector): ], source=matching_doc.source, semantic_identifier=matching_doc.semantic_identifier, + title="", # slack docs don't really have a "title" 
metadata=matching_doc.metadata, ) @@ -319,6 +321,7 @@ class SlackLoadConnector(LoadConnector): ], source=DocumentSource.SLACK, semantic_identifier=channel["name"], + title="", # slack docs don't really have a "title" metadata={}, ) diff --git a/backend/danswer/datastores/vespa/app_config/schemas/danswer_chunk.sd b/backend/danswer/datastores/vespa/app_config/schemas/danswer_chunk.sd index c0d026f9a..06d39ca23 100644 --- a/backend/danswer/datastores/vespa/app_config/schemas/danswer_chunk.sd +++ b/backend/danswer/datastores/vespa/app_config/schemas/danswer_chunk.sd @@ -37,9 +37,21 @@ schema danswer_chunk { field source_links type string { indexing: summary | attribute } + # displayed in the UI as the main identifier for the doc field semantic_identifier type string { indexing: summary | attribute } + # this is used when computing best matches based on the title of the document + # may not always match the `semantic_identifier` e.g. for Slack docs the + # `semantic_identifier` will be the channel name, but the `title` will be empty + field title type string { + indexing: summary | index + match { + gram + gram-size: 3 + } + index: enable-bm25 + } field section_continuation type bool { indexing: summary | attribute } @@ -70,7 +82,7 @@ schema danswer_chunk { } fieldset default { - fields: content + fields: content, title } rank-profile keyword_search inherits default { @@ -103,4 +115,11 @@ schema danswer_chunk { } match-features: closest(embeddings) } + + # used when searching from the admin UI for a specific doc to hide / boost + rank-profile admin_search inherits default { + first-phase { + expression: bm25(content) + (100 * bm25(title)) + } + } } diff --git a/backend/danswer/datastores/vespa/store.py b/backend/danswer/datastores/vespa/store.py index e6a4b73c4..689ae5c4a 100644 --- a/backend/danswer/datastores/vespa/store.py +++ b/backend/danswer/datastores/vespa/store.py @@ -38,6 +38,7 @@ from danswer.configs.constants import SECTION_CONTINUATION from 
danswer.configs.constants import SEMANTIC_IDENTIFIER from danswer.configs.constants import SOURCE_LINKS from danswer.configs.constants import SOURCE_TYPE +from danswer.configs.constants import TITLE from danswer.configs.model_configs import SEARCH_DISTANCE_CUTOFF from danswer.datastores.datastore_utils import get_uuid_from_chunk from danswer.datastores.interfaces import DocumentIndex @@ -166,6 +167,7 @@ def _index_vespa_chunk( SOURCE_TYPE: str(document.source.value), SOURCE_LINKS: json.dumps(chunk.source_links), SEMANTIC_IDENTIFIER: document.semantic_identifier, + TITLE: document.get_title_for_document_index(), SECTION_CONTINUATION: chunk.section_continuation, METADATA: json.dumps(document.metadata), EMBEDDINGS: embeddings_name_vector_map, @@ -264,7 +266,9 @@ def _index_vespa_chunks( return insertion_records -def _build_vespa_filters(filters: list[IndexFilter] | None) -> str: +def _build_vespa_filters( + filters: list[IndexFilter] | None, include_hidden: bool = False +) -> str: # NOTE: permissions filters are expected to be passed in directly via # the `filters` arg, which is why they are not considered explicitly here @@ -272,8 +276,9 @@ def _build_vespa_filters(filters: list[IndexFilter] | None) -> str: # via the `filters` arg. These are set either in the Web UI or in the Slack # listener - # ignore hidden docs - filter_str = f"!({HIDDEN}=true) and " + # usually ignore hidden docs unless explicitly requested. 
We may want to + # get hidden docs on the admin panel to allow for un-hiding + filter_str = f"!({HIDDEN}=true) and " if not include_hidden else "" # Handle provided query filters if filters: @@ -389,6 +394,7 @@ class VespaIndex(DocumentIndex): f"{SEMANTIC_IDENTIFIER}, " f"{SECTION_CONTINUATION}, " f"{BOOST}, " + f"{HIDDEN}, " f"{METADATA} " f"{CONTENT_SUMMARY} " f"from {DOCUMENT_INDEX_NAME} where " @@ -604,3 +610,32 @@ class VespaIndex(DocumentIndex): } return _query_vespa(params) + + def admin_retrieval( + self, + query: str, + user_id: UUID | None, + filters: list[IndexFilter] | None, + num_to_retrieve: int = NUM_RETURNED_HITS, + ) -> list[InferenceChunk]: + vespa_where_clauses = _build_vespa_filters(filters, include_hidden=True) + yql = ( + VespaIndex.yql_base + + vespa_where_clauses + + '({grammar: "weakAnd"}userInput(@query) ' + # `({defaultIndex: "content_summary"}userInput(@query))` section is + # needed for highlighting while the N-gram highlighting is broken / + # not working as desired + + f'or ({{defaultIndex: "{CONTENT_SUMMARY}"}}userInput(@query)))' + + _build_vespa_limit(num_to_retrieve) + ) + + params: dict[str, str | int] = { + "yql": yql, + "query": query, + "hits": num_to_retrieve, + "num_to_rerank": 10 * num_to_retrieve, + "ranking.profile": "admin_search", + } + + return _query_vespa(params) diff --git a/backend/danswer/search/semantic_search.py b/backend/danswer/search/semantic_search.py index f508b798c..a6eeb5b89 100644 --- a/backend/danswer/search/semantic_search.py +++ b/backend/danswer/search/semantic_search.py @@ -49,6 +49,7 @@ def chunks_to_search_docs(chunks: list[InferenceChunk] | None) -> list[SearchDoc blurb=chunk.blurb, source_type=chunk.source_type, boost=chunk.boost, + hidden=chunk.hidden, score=chunk.score, match_highlights=chunk.match_highlights, ) diff --git a/backend/danswer/server/models.py b/backend/danswer/server/models.py index 0f5a349ee..aa20b4eeb 100644 --- a/backend/danswer/server/models.py +++ b/backend/danswer/server/models.py @@ -147,6
+147,10 @@ class SearchDoc(BaseModel): blurb: str source_type: str boost: int + # whether the document is hidden when doing a standard search + # since a standard search will never find a hidden doc, this can only ever + # be `True` when doing an admin search + hidden: bool score: float | None # Matched sections in the doc. Uses Vespa syntax e.g. TEXT # to specify that a set of words should be highlighted. For example: diff --git a/backend/danswer/server/search_backend.py b/backend/danswer/server/search_backend.py index 3b6a1b11b..2bb5e2c21 100644 --- a/backend/danswer/server/search_backend.py +++ b/backend/danswer/server/search_backend.py @@ -2,15 +2,20 @@ from collections.abc import Generator from fastapi import APIRouter from fastapi import Depends +from fastapi import HTTPException from fastapi.responses import StreamingResponse +from pydantic import BaseModel from sqlalchemy.orm import Session +from danswer.auth.users import current_admin_user from danswer.auth.users import current_user from danswer.chunking.models import InferenceChunk from danswer.configs.app_configs import DISABLE_GENERATIVE_AI from danswer.configs.app_configs import NUM_DOCUMENT_TOKENS_FED_TO_GENERATIVE_MODEL from danswer.configs.constants import IGNORE_FOR_QA from danswer.datastores.document_index import get_default_document_index +from danswer.datastores.interfaces import IndexFilter +from danswer.datastores.vespa.store import VespaIndex from danswer.db.engine import get_session from danswer.db.feedback import create_doc_retrieval_feedback from danswer.db.feedback import create_query_event @@ -38,6 +43,7 @@ from danswer.server.models import QAResponse from danswer.server.models import QueryValidationResponse from danswer.server.models import QuestionRequest from danswer.server.models import RerankedRetrievalDocs +from danswer.server.models import SearchDoc from danswer.server.models import SearchFeedbackRequest from danswer.server.models import SearchResponse from danswer.server.utils 
import get_json_line @@ -49,6 +55,57 @@ logger = setup_logger() router = APIRouter() +"""Admin-only search endpoints""" + + +class AdminSearchRequest(BaseModel): + query: str + filters: list[IndexFilter] | None = None + + +class AdminSearchResponse(BaseModel): + documents: list[SearchDoc] + + +@router.post("/admin/search") +def admin_search( + question: AdminSearchRequest, + user: User | None = Depends(current_admin_user), + db_session: Session = Depends(get_session), +) -> AdminSearchResponse: + query = question.query + filters = question.filters + logger.info(f"Received admin search query: {query}") + + user_id = None if user is None else user.id + user_acl_filters = build_access_filters_for_user(user, db_session) + final_filters = (filters or []) + user_acl_filters + document_index = get_default_document_index() + if not isinstance(document_index, VespaIndex): + raise HTTPException( + status_code=400, + detail="Cannot use admin-search when using a non-Vespa document index", + ) + + matching_chunks = document_index.admin_retrieval( + query=query, user_id=user_id, filters=final_filters + ) + + documents = chunks_to_search_docs(matching_chunks) + + # deduplicate documents by id + deduplicated_documents: list[SearchDoc] = [] + seen_documents: set[str] = set() + for document in documents: + if document.document_id not in seen_documents: + deduplicated_documents.append(document) + seen_documents.add(document.document_id) + return AdminSearchResponse(documents=deduplicated_documents) + + +"""Search endpoints for all""" + + @router.post("/search-intent") def get_search_type( question: QuestionRequest, _: User = Depends(current_user) diff --git a/backend/tests/unit/danswer/direct_qa/test_qa_utils.py b/backend/tests/unit/danswer/direct_qa/test_qa_utils.py index 32af5ff4d..8b3984744 100644 --- a/backend/tests/unit/danswer/direct_qa/test_qa_utils.py +++ b/backend/tests/unit/danswer/direct_qa/test_qa_utils.py @@ -113,6 +113,7 @@ class TestQAPostprocessing(unittest.TestCase): 
semantic_identifier="anything", section_continuation=False, boost=0, + hidden=False, score=1, metadata={}, match_highlights=[], @@ -127,6 +128,7 @@ class TestQAPostprocessing(unittest.TestCase): semantic_identifier="whatever", section_continuation=False, boost=0, + hidden=False, score=1, metadata={}, match_highlights=[], diff --git a/web/src/app/admin/documents/ScoreEditor.tsx b/web/src/app/admin/documents/ScoreEditor.tsx new file mode 100644 index 000000000..c95df3a71 --- /dev/null +++ b/web/src/app/admin/documents/ScoreEditor.tsx @@ -0,0 +1,89 @@ +import { PopupSpec } from "@/components/admin/connectors/Popup"; +import { useState } from "react"; +import { updateBoost } from "./lib"; +import { CheckmarkIcon, EditIcon } from "@/components/icons/icons"; + +export const ScoreSection = ({ + documentId, + initialScore, + setPopup, + refresh, + consistentWidth = true, +}: { + documentId: string; + initialScore: number; + setPopup: (popupSpec: PopupSpec | null) => void; + refresh: () => void; + consistentWidth?: boolean; +}) => { + const [isOpen, setIsOpen] = useState(false); + const [score, setScore] = useState(initialScore.toString()); + + const onSubmit = async () => { + const numericScore = Number(score); + if (isNaN(numericScore)) { + setPopup({ + message: "Score must be a number", + type: "error", + }); + return; + } + + const errorMsg = await updateBoost(documentId, numericScore); + if (errorMsg) { + setPopup({ + message: errorMsg, + type: "error", + }); + } else { + setPopup({ + message: "Updated score!", + type: "success", + }); + refresh(); + setIsOpen(false); + } + }; + + if (isOpen) { + return ( +
+ { + setScore(e.target.value); + }} + onKeyDown={(e) => { + if (e.key === "Enter") { + onSubmit(); + } + if (e.key === "Escape") { + setIsOpen(false); + setScore(initialScore.toString()); + } + }} + className="border bg-slate-700 text-gray-200 border-gray-300 rounded py-1 px-3 w-16 h-5 my-auto" + /> +
+ +
+
+ ); + } + + return ( +
+
+
+
{initialScore}
+
+
setIsOpen(true)} + > + +
+
+
+ ); +}; diff --git a/web/src/app/admin/documents/explorer/lib.ts b/web/src/app/admin/documents/explorer/lib.ts new file mode 100644 index 000000000..e3b4ace26 --- /dev/null +++ b/web/src/app/admin/documents/explorer/lib.ts @@ -0,0 +1,12 @@ +export const adminSearch = async (query: string) => { + const response = await fetch("/api/admin/search", { + method: "POST", + headers: { + "Content-Type": "application/json", + }, + body: JSON.stringify({ + query, + }), + }); + return response; +}; diff --git a/web/src/app/admin/documents/explorer/page.tsx b/web/src/app/admin/documents/explorer/page.tsx new file mode 100644 index 000000000..6f181d6bc --- /dev/null +++ b/web/src/app/admin/documents/explorer/page.tsx @@ -0,0 +1,195 @@ +"use client"; + +import { ZoomInIcon } from "@/components/icons/icons"; +import { adminSearch } from "./lib"; +import { MagnifyingGlass } from "@phosphor-icons/react"; +import { useState, useEffect } from "react"; +import { DanswerDocument } from "@/lib/search/interfaces"; +import { FiZap } from "react-icons/fi"; +import { getSourceIcon } from "@/components/source"; +import { buildDocumentSummaryDisplay } from "@/components/search/DocumentDisplay"; +import { CustomCheckbox } from "@/components/CustomCheckbox"; +import { updateHiddenStatus } from "../lib"; +import { PopupSpec, usePopup } from "@/components/admin/connectors/Popup"; +import { getErrorMsg } from "@/lib/fetchUtils"; +import { ScoreSection } from "../ScoreEditor"; +import { useRouter } from "next/navigation"; + +const DocumentDisplay = ({ + document, + refresh, + setPopup, +}: { + document: DanswerDocument; + refresh: () => void; + setPopup: (popupSpec: PopupSpec | null) => void; +}) => { + return ( +
+
+ + {getSourceIcon(document.source_type, 22)} +

+ {document.semantic_identifier || document.document_id} +

+
+
+
+
+

Boost:

+ +
+
{ + const response = await updateHiddenStatus( + document.document_id, + !document.hidden + ); + if (response.ok) { + refresh(); + } else { + setPopup({ + type: "error", + message: `Failed to update document - ${await getErrorMsg( + response + )}`, + }); + } + }} + className="px-1 py-0.5 bg-gray-700 hover:bg-gray-600 rounded flex cursor-pointer select-none" + > +
+ {document.hidden ? ( +
Hidden
+ ) : ( + "Visible" + )} +
+
+ +
+
+
+

+ {buildDocumentSummaryDisplay(document.match_highlights, document.blurb)} +

+
+ ); +}; + +const Main = ({ + initialSearchValue, +}: { + initialSearchValue: string | undefined; +}) => { + const router = useRouter(); + const { popup, setPopup } = usePopup(); + + const [query, setQuery] = useState(initialSearchValue || ""); + const [timeoutId, setTimeoutId] = useState(null); + const [results, setResults] = useState([]); + + const onSearch = async (query: string) => { + const results = await adminSearch(query); + if (results.ok) { + setResults((await results.json()).documents); + } + setTimeoutId(null); + }; + + useEffect(() => { + if (timeoutId !== null) { + clearTimeout(timeoutId); + } + + if (query && query.trim() !== "") { + router.replace( + `/admin/documents/explorer?query=${encodeURIComponent(query)}` + ); + + const timeoutId = window.setTimeout(() => onSearch(query), 300); + setTimeoutId(timeoutId); + } else { + setResults([]); + } + }, [query]); + + return ( +
+ {popup} +
+
+ +