Document explorer admin page (#590)

This commit is contained in:
Chris Weaver 2023-10-18 18:41:39 -07:00 committed by GitHub
parent a5d2759fbc
commit 1bd76f528f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
20 changed files with 447 additions and 89 deletions

View File

@ -93,6 +93,7 @@ class InferenceChunk(BaseChunk):
semantic_identifier: str
boost: int
score: float | None
hidden: bool
metadata: dict[str, Any]
# Matched sections in the chunk. Uses Vespa syntax e.g. <hi>TEXT</hi>
# to specify that a set of words should be highlighted. For example:

View File

@ -8,6 +8,7 @@ SOURCE_TYPE = "source_type"
SOURCE_LINKS = "source_links"
SOURCE_LINK = "link"
SEMANTIC_IDENTIFIER = "semantic_identifier"
TITLE = "title"
SECTION_CONTINUATION = "section_continuation"
EMBEDDINGS = "embeddings"
ALLOWED_USERS = "allowed_users"

View File

@ -24,8 +24,14 @@ class Document:
id: str # This must be unique or during indexing/reindexing, chunks will be overwritten
sections: list[Section]
source: DocumentSource
semantic_identifier: str
semantic_identifier: str # displayed in the UI as the main identifier for the doc
metadata: dict[str, Any]
# `title` is used when computing best matches for a query
# if `None`, then we will use the `semantic_identifier` as the title in Vespa
title: str | None = None
def get_title_for_document_index(self) -> str:
    """Return the title to store in the document index.

    An explicitly set `title` (including the empty string) is used as-is;
    only when `title` is `None` does the `semantic_identifier` stand in.
    """
    if self.title is not None:
        return self.title
    return self.semantic_identifier
def to_short_descriptor(self) -> str:
"""Used when logging the identity of a document"""

View File

@ -148,6 +148,7 @@ def thread_to_doc(
],
source=DocumentSource.SLACK,
semantic_identifier=channel["name"],
title="", # slack docs don't really have a "title"
metadata={},
)
@ -302,6 +303,7 @@ class SlackLoadConnector(LoadConnector):
],
source=matching_doc.source,
semantic_identifier=matching_doc.semantic_identifier,
title="", # slack docs don't really have a "title"
metadata=matching_doc.metadata,
)
@ -319,6 +321,7 @@ class SlackLoadConnector(LoadConnector):
],
source=DocumentSource.SLACK,
semantic_identifier=channel["name"],
title="", # slack docs don't really have a "title"
metadata={},
)

View File

@ -37,9 +37,21 @@ schema danswer_chunk {
field source_links type string {
indexing: summary | attribute
}
# displayed in the UI as the main identifier for the doc
field semantic_identifier type string {
indexing: summary | attribute
}
# Used when computing best matches based on the title of the document.
# May not always match the `semantic_identifier`, e.g. for Slack docs the
# `semantic_identifier` will be the channel name, but the `title` will be empty.
# Indexed with 3-character n-gram matching and with BM25 enabled so that rank
# profiles can score on `bm25(title)`.
field title type string {
indexing: summary | index
match {
gram
gram-size: 3
}
index: enable-bm25
}
field section_continuation type bool {
indexing: summary | attribute
}
@ -70,7 +82,7 @@ schema danswer_chunk {
}
fieldset default {
fields: content
fields: content, title
}
rank-profile keyword_search inherits default {
@ -103,4 +115,11 @@ schema danswer_chunk {
}
match-features: closest(embeddings)
}
# Used when searching from the admin UI for a specific doc to hide / boost.
# Scores purely on BM25 keyword relevance; a title match is weighted 100x
# relative to a content match so that title hits dominate the ordering.
rank-profile admin_search inherits default {
first-phase {
expression: bm25(content) + (100 * bm25(title))
}
}
}

View File

@ -38,6 +38,7 @@ from danswer.configs.constants import SECTION_CONTINUATION
from danswer.configs.constants import SEMANTIC_IDENTIFIER
from danswer.configs.constants import SOURCE_LINKS
from danswer.configs.constants import SOURCE_TYPE
from danswer.configs.constants import TITLE
from danswer.configs.model_configs import SEARCH_DISTANCE_CUTOFF
from danswer.datastores.datastore_utils import get_uuid_from_chunk
from danswer.datastores.interfaces import DocumentIndex
@ -166,6 +167,7 @@ def _index_vespa_chunk(
SOURCE_TYPE: str(document.source.value),
SOURCE_LINKS: json.dumps(chunk.source_links),
SEMANTIC_IDENTIFIER: document.semantic_identifier,
TITLE: document.get_title_for_document_index(),
SECTION_CONTINUATION: chunk.section_continuation,
METADATA: json.dumps(document.metadata),
EMBEDDINGS: embeddings_name_vector_map,
@ -264,7 +266,9 @@ def _index_vespa_chunks(
return insertion_records
def _build_vespa_filters(filters: list[IndexFilter] | None) -> str:
def _build_vespa_filters(
filters: list[IndexFilter] | None, include_hidden: bool = False
) -> str:
# NOTE: permissions filters are expected to be passed in directly via
# the `filters` arg, which is why they are not considered explicitly here
@ -272,8 +276,9 @@ def _build_vespa_filters(filters: list[IndexFilter] | None) -> str:
# via the `filters` arg. These are set either in the Web UI or in the Slack
# listener
# ignore hidden docs
filter_str = f"!({HIDDEN}=true) and "
# usually ignore hidden docs unless explicitly requested. We may want to
# get hidden docs on the admin panel to allow for un-hiding
filter_str = f"!({HIDDEN}=true) and " if include_hidden else ""
# Handle provided query filters
if filters:
@ -389,6 +394,7 @@ class VespaIndex(DocumentIndex):
f"{SEMANTIC_IDENTIFIER}, "
f"{SECTION_CONTINUATION}, "
f"{BOOST}, "
f"{HIDDEN}, "
f"{METADATA} "
f"{CONTENT_SUMMARY} "
f"from {DOCUMENT_INDEX_NAME} where "
@ -604,3 +610,32 @@ class VespaIndex(DocumentIndex):
}
return _query_vespa(params)
def admin_retrieval(
self,
query: str,
user_id: UUID | None,
filters: list[IndexFilter] | None,
num_to_retrieve: int = NUM_RETURNED_HITS,
) -> list[InferenceChunk]:
vespa_where_clauses = _build_vespa_filters(filters)
yql = (
VespaIndex.yql_base
+ vespa_where_clauses
+ '({grammar: "weakAnd"}userInput(@query) '
# `({defaultIndex: "content_summary"}userInput(@query))` section is
# needed for highlighting while the N-gram highlighting is broken /
# not working as desired
+ f'or ({{defaultIndex: "{CONTENT_SUMMARY}"}}userInput(@query)))'
+ _build_vespa_limit(num_to_retrieve)
)
params: dict[str, str | int] = {
"yql": yql,
"query": query,
"hits": num_to_retrieve,
"num_to_rerank": 10 * num_to_retrieve,
"ranking.profile": "admin_search",
}
return _query_vespa(params)

View File

@ -49,6 +49,7 @@ def chunks_to_search_docs(chunks: list[InferenceChunk] | None) -> list[SearchDoc
blurb=chunk.blurb,
source_type=chunk.source_type,
boost=chunk.boost,
hidden=chunk.hidden,
score=chunk.score,
match_highlights=chunk.match_highlights,
)

View File

@ -147,6 +147,10 @@ class SearchDoc(BaseModel):
blurb: str
source_type: str
boost: int
# whether the document is hidden when doing a standard search
# since a standard search will never find a hidden doc, this can only ever
# be `True` when doing an admin search
hidden: bool
score: float | None
# Matched sections in the doc. Uses Vespa syntax e.g. <hi>TEXT</hi>
# to specify that a set of words should be highlighted. For example:

View File

@ -2,15 +2,20 @@ from collections.abc import Generator
from fastapi import APIRouter
from fastapi import Depends
from fastapi import HTTPException
from fastapi.responses import StreamingResponse
from pydantic import BaseModel
from sqlalchemy.orm import Session
from danswer.auth.users import current_admin_user
from danswer.auth.users import current_user
from danswer.chunking.models import InferenceChunk
from danswer.configs.app_configs import DISABLE_GENERATIVE_AI
from danswer.configs.app_configs import NUM_DOCUMENT_TOKENS_FED_TO_GENERATIVE_MODEL
from danswer.configs.constants import IGNORE_FOR_QA
from danswer.datastores.document_index import get_default_document_index
from danswer.datastores.interfaces import IndexFilter
from danswer.datastores.vespa.store import VespaIndex
from danswer.db.engine import get_session
from danswer.db.feedback import create_doc_retrieval_feedback
from danswer.db.feedback import create_query_event
@ -38,6 +43,7 @@ from danswer.server.models import QAResponse
from danswer.server.models import QueryValidationResponse
from danswer.server.models import QuestionRequest
from danswer.server.models import RerankedRetrievalDocs
from danswer.server.models import SearchDoc
from danswer.server.models import SearchFeedbackRequest
from danswer.server.models import SearchResponse
from danswer.server.utils import get_json_line
@ -49,6 +55,57 @@ logger = setup_logger()
router = APIRouter()
"""Admin-only search endpoints"""
class AdminSearchRequest(BaseModel):
    """Request body for the admin-only document search endpoint."""

    # free-text query to match against documents
    query: str
    # optional extra index filters; omitted / `None` means no extra filtering
    filters: list[IndexFilter] | None = None
class AdminSearchResponse(BaseModel):
    """Response body for the admin-only document search endpoint."""

    # matching documents, at most one entry per document id
    documents: list[SearchDoc]
@router.post("/admin/search")
def admin_search(
    question: AdminSearchRequest,
    user: User | None = Depends(current_admin_user),
    db_session: Session = Depends(get_session),
) -> AdminSearchResponse:
    """Search the document index on behalf of an admin.

    The caller's access filters are appended to any filters supplied in the
    request before querying. Only a Vespa-backed index is supported; any other
    index type results in an HTTP 400. Since retrieval is chunk-based, several
    hits may belong to the same document - only the first hit per document id
    is returned, in retrieval order.
    """
    query = question.query
    logger.info(f"Received admin search query: {query}")

    user_id = user.id if user is not None else None
    acl_filters = build_access_filters_for_user(user, db_session)
    final_filters = (question.filters or []) + acl_filters

    document_index = get_default_document_index()
    if not isinstance(document_index, VespaIndex):
        raise HTTPException(
            status_code=400,
            detail="Cannot use admin-search when using a non-Vespa document index",
        )

    matching_chunks = document_index.admin_retrieval(
        query=query, user_id=user_id, filters=final_filters
    )

    # dicts preserve insertion order, so `setdefault` keeps the first
    # (highest ranked) hit for every document id
    first_hit_by_document_id: dict[str, SearchDoc] = {}
    for search_doc in chunks_to_search_docs(matching_chunks):
        first_hit_by_document_id.setdefault(search_doc.document_id, search_doc)

    return AdminSearchResponse(documents=list(first_hit_by_document_id.values()))
"""Search endpoints for all"""
@router.post("/search-intent")
def get_search_type(
question: QuestionRequest, _: User = Depends(current_user)

View File

@ -113,6 +113,7 @@ class TestQAPostprocessing(unittest.TestCase):
semantic_identifier="anything",
section_continuation=False,
boost=0,
hidden=False,
score=1,
metadata={},
match_highlights=[],
@ -127,6 +128,7 @@ class TestQAPostprocessing(unittest.TestCase):
semantic_identifier="whatever",
section_continuation=False,
boost=0,
hidden=False,
score=1,
metadata={},
match_highlights=[],

View File

@ -0,0 +1,89 @@
import { PopupSpec } from "@/components/admin/connectors/Popup";
import { useState } from "react";
import { updateBoost } from "./lib";
import { CheckmarkIcon, EditIcon } from "@/components/icons/icons";
/**
 * Inline viewer / editor for a document's boost score.
 *
 * Shows the current score next to an edit icon; clicking the icon swaps in a
 * text input. Enter or the checkmark submits the value via `updateBoost`,
 * Escape cancels and restores the initial score.
 *
 * @param documentId      id of the document whose boost is edited
 * @param initialScore    score displayed before any edit
 * @param setPopup        used to surface success / error messages
 * @param refresh         called after a successful update so the parent can reload
 * @param consistentWidth when true, the displayed score gets a fixed width so
 *                        that the edit icons line up across rows
 */
export const ScoreSection = ({
  documentId,
  initialScore,
  setPopup,
  refresh,
  consistentWidth = true,
}: {
  documentId: string;
  initialScore: number;
  setPopup: (popupSpec: PopupSpec | null) => void;
  refresh: () => void;
  consistentWidth?: boolean;
}) => {
  const [isOpen, setIsOpen] = useState(false);
  const [score, setScore] = useState(initialScore.toString());

  const onSubmit = async () => {
    const numericScore = Number(score);
    // `Number("")` (and whitespace-only input) evaluates to 0, so blank input
    // has to be rejected explicitly rather than silently stored as 0
    if (score.trim() === "" || isNaN(numericScore)) {
      setPopup({
        message: "Score must be a number",
        type: "error",
      });
      return;
    }

    const errorMsg = await updateBoost(documentId, numericScore);
    if (errorMsg) {
      setPopup({
        message: errorMsg,
        type: "error",
      });
    } else {
      setPopup({
        message: "Updated score!",
        type: "success",
      });
      refresh();
      setIsOpen(false);
    }
  };

  if (isOpen) {
    return (
      <div className="my-auto h-full flex">
        <input
          value={score}
          onChange={(e) => {
            setScore(e.target.value);
          }}
          onKeyDown={(e) => {
            if (e.key === "Enter") {
              onSubmit();
            }
            if (e.key === "Escape") {
              setIsOpen(false);
              setScore(initialScore.toString());
            }
          }}
          className="border bg-slate-700 text-gray-200 border-gray-300 rounded py-1 px-3 w-16 h-5 my-auto"
        />
        <div onClick={onSubmit} className="cursor-pointer my-auto ml-2">
          <CheckmarkIcon size={16} className="text-green-700" />
        </div>
      </div>
    );
  }

  return (
    <div className="h-full flex flex-col">
      <div className="flex my-auto">
        {/* a ternary is required here: `false && " w-6"` evaluates to `false`,
            which string concatenation would turn into the class "flexfalse" */}
        <div className={"flex" + (consistentWidth ? " w-6" : "")}>
          <div className="ml-auto my-auto">{initialScore}</div>
        </div>
        <div
          className="cursor-pointer ml-2 my-auto"
          onClick={() => setIsOpen(true)}
        >
          <EditIcon size={16} />
        </div>
      </div>
    </div>
  );
};

View File

@ -0,0 +1,12 @@
/**
 * POSTs `query` as JSON to the admin search endpoint and resolves to the raw
 * `Response`; callers are responsible for checking `ok` and parsing the body.
 */
export const adminSearch = async (query: string) => {
  const requestBody = JSON.stringify({ query });
  const requestOptions = {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: requestBody,
  };
  return await fetch("/api/admin/search", requestOptions);
};

View File

@ -0,0 +1,195 @@
"use client";
import { ZoomInIcon } from "@/components/icons/icons";
import { adminSearch } from "./lib";
import { MagnifyingGlass } from "@phosphor-icons/react";
import { useState, useEffect } from "react";
import { DanswerDocument } from "@/lib/search/interfaces";
import { FiZap } from "react-icons/fi";
import { getSourceIcon } from "@/components/source";
import { buildDocumentSummaryDisplay } from "@/components/search/DocumentDisplay";
import { CustomCheckbox } from "@/components/CustomCheckbox";
import { updateHiddenStatus } from "../lib";
import { PopupSpec, usePopup } from "@/components/admin/connectors/Popup";
import { getErrorMsg } from "@/lib/fetchUtils";
import { ScoreSection } from "../ScoreEditor";
import { useRouter } from "next/navigation";
/**
 * Renders a single admin search result: a link to the source document, an
 * inline boost editor, a toggle for the hidden status and the highlighted
 * summary.
 *
 * @param document the search result to render
 * @param refresh  called after a successful change so the results can be reloaded
 * @param setPopup used to surface error messages
 */
const DocumentDisplay = ({
  document,
  refresh,
  setPopup,
}: {
  document: DanswerDocument;
  refresh: () => void;
  setPopup: (popupSpec: PopupSpec | null) => void;
}) => {
  // flips the hidden flag of the document and reloads the results on success
  const toggleHidden = async () => {
    const response = await updateHiddenStatus(
      document.document_id,
      !document.hidden
    );
    if (response.ok) {
      refresh();
      return;
    }
    // `await` so that an async `getErrorMsg` yields its message rather than
    // "[object Promise]"; awaiting a plain value is a no-op
    const errorMsg = await getErrorMsg(response);
    setPopup({
      type: "error",
      message: `Failed to update document - ${errorMsg}`,
    });
  };

  return (
    <div
      key={document.document_id}
      className="text-sm border-b border-gray-800 mb-3"
    >
      <div className="flex relative">
        <a
          className={
            "rounded-lg flex font-bold " +
            (document.link ? "" : "pointer-events-none")
          }
          href={document.link}
          target="_blank"
          rel="noopener noreferrer"
        >
          {getSourceIcon(document.source_type, 22)}
          <p className="truncate break-all ml-2 my-auto text-base">
            {document.semantic_identifier || document.document_id}
          </p>
        </a>
      </div>
      <div className="flex flex-wrap gap-x-2 mt-1 text-xs">
        <div className="px-1 py-0.5 bg-gray-700 rounded flex">
          <p className="mr-1 my-auto">Boost:</p>
          <ScoreSection
            documentId={document.document_id}
            initialScore={document.boost}
            setPopup={setPopup}
            refresh={refresh}
            consistentWidth={false}
          />
        </div>
        <div
          onClick={toggleHidden}
          className="px-1 py-0.5 bg-gray-700 hover:bg-gray-600 rounded flex cursor-pointer select-none"
        >
          <div className="my-auto">
            {document.hidden ? (
              <div className="text-red-500">Hidden</div>
            ) : (
              "Visible"
            )}
          </div>
          <div className="ml-1 my-auto">
            <CustomCheckbox checked={!document.hidden} />
          </div>
        </div>
      </div>
      <p className="pl-1 pt-2 pb-3 text-gray-200 break-words">
        {buildDocumentSummaryDisplay(document.match_highlights, document.blurb)}
      </p>
    </div>
  );
};
/**
 * Search box plus result list of the document explorer.
 *
 * Typing updates `query`; 300ms after the last change the query is mirrored
 * into the URL's `query` param and sent to the admin search endpoint. A blank
 * query clears the results instead of searching.
 *
 * @param initialSearchValue query to pre-fill (and search for) on first render
 */
const Main = ({
  initialSearchValue,
}: {
  initialSearchValue: string | undefined;
}) => {
  const router = useRouter();
  const { popup, setPopup } = usePopup();

  const [query, setQuery] = useState(initialSearchValue || "");
  const [results, setResults] = useState<DanswerDocument[]>([]);

  // runs the search and stores the returned documents; failed requests leave
  // the currently displayed results untouched
  const onSearch = async (query: string) => {
    const response = await adminSearch(query);
    if (response.ok) {
      setResults((await response.json()).documents);
    }
  };

  useEffect(() => {
    if (query.trim() === "") {
      setResults([]);
      return;
    }

    router.replace(
      `/admin/documents/explorer?query=${encodeURIComponent(query)}`
    );
    // debounce: the cleanup below cancels the pending search whenever the
    // query changes again or the component unmounts, so the timer id does not
    // need to live in React state
    const timeoutId = window.setTimeout(() => onSearch(query), 300);
    return () => window.clearTimeout(timeoutId);
  }, [query]);

  return (
    <div>
      {popup}
      <div className="flex justify-center py-2">
        <div className="flex items-center w-full border-2 border-gray-600 rounded px-4 py-2 focus-within:border-blue-500">
          <MagnifyingGlass className="text-gray-400" />
          <textarea
            autoFocus
            className="flex-grow ml-2 h-6 bg-transparent outline-none placeholder-gray-400 overflow-hidden whitespace-normal resize-none"
            role="textarea"
            aria-multiline
            placeholder="Find documents based on title / content..."
            value={query}
            onChange={(e) => {
              setQuery(e.target.value);
            }}
            suppressContentEditableWarning={true}
          />
        </div>
      </div>
      {results.length > 0 && (
        <div className="mt-3">
          {results.map((document) => {
            return (
              <DocumentDisplay
                key={document.document_id}
                document={document}
                refresh={() => onSearch(query)}
                setPopup={setPopup}
              />
            );
          })}
        </div>
      )}
      {!query && (
        <div className="flex">
          <FiZap className="my-auto mr-1 text-blue-400" /> Search for a document
          above to modify its boost or hide it from searches.
        </div>
      )}
    </div>
  );
};
/**
 * Document Explorer admin page: a header followed by the search UI, which is
 * seeded with the `query` URL search param when one is present.
 */
const Page = ({
  searchParams,
}: {
  searchParams: { [key: string]: string };
}) => {
  const initialQuery = searchParams.query;

  const header = (
    <div className="border-solid border-gray-600 border-b pb-2 mb-3 flex">
      <ZoomInIcon size={32} />
      <h1 className="text-3xl font-bold pl-2">Document Explorer</h1>
    </div>
  );

  return (
    <div className="mx-auto container">
      {header}
      <Main initialSearchValue={initialQuery} />
    </div>
  );
};
export default Page;

View File

@ -3,13 +3,14 @@ import { PopupSpec, usePopup } from "@/components/admin/connectors/Popup";
import { useState } from "react";
import { PageSelector } from "@/components/PageSelector";
import { DocumentBoostStatus } from "@/lib/types";
import { updateBoost, updateHiddenStatus } from "./lib";
import { updateBoost, updateHiddenStatus } from "../lib";
import { CheckmarkIcon, EditIcon } from "@/components/icons/icons";
import { numToDisplay } from "./constants";
import { FiCheck, FiCheckSquare, FiEye, FiEyeOff, FiX } from "react-icons/fi";
import { FiEye, FiEyeOff } from "react-icons/fi";
import { getErrorMsg } from "@/lib/fetchUtils";
import { HoverPopup } from "@/components/HoverPopup";
import { CustomCheckbox } from "@/components/CustomCheckbox";
import { ScoreSection } from "../ScoreEditor";
const IsVisibleSection = ({
document,
@ -74,86 +75,6 @@ const IsVisibleSection = ({
);
};
const ScoreSection = ({
documentId,
initialScore,
setPopup,
refresh,
}: {
documentId: string;
initialScore: number;
setPopup: (popupSpec: PopupSpec | null) => void;
refresh: () => void;
}) => {
const [isOpen, setIsOpen] = useState(false);
const [score, setScore] = useState(initialScore.toString());
const onSubmit = async () => {
const numericScore = Number(score);
if (isNaN(numericScore)) {
setPopup({
message: "Score must be a number",
type: "error",
});
return;
}
const errorMsg = await updateBoost(documentId, numericScore);
if (errorMsg) {
setPopup({
message: errorMsg,
type: "error",
});
} else {
setPopup({
message: "Updated score!",
type: "success",
});
refresh();
setIsOpen(false);
}
};
if (isOpen) {
return (
<div className="m-auto flex">
<input
value={score}
onChange={(e) => {
setScore(e.target.value);
}}
onKeyDown={(e) => {
if (e.key === "Enter") {
onSubmit();
}
if (e.key === "Escape") {
setIsOpen(false);
setScore(initialScore.toString());
}
}}
className="border bg-slate-700 text-gray-200 border-gray-300 rounded py-1 px-3 w-16"
/>
<div onClick={onSubmit} className="cursor-pointer my-auto ml-2">
<CheckmarkIcon size={20} className="text-green-700" />
</div>
</div>
);
}
return (
<div className="h-full flex flex-col">
<div className="flex my-auto">
<div className="w-6 flex">
<div className="ml-auto">{initialScore}</div>
</div>
<div className="cursor-pointer ml-2" onClick={() => setIsOpen(true)}>
<EditIcon size={20} />
</div>
</div>
</div>
);
};
export const DocumentFeedbackTable = ({
documents,
refresh,

View File

@ -12,6 +12,7 @@ export const CustomCheckbox = ({
className="hidden"
checked={checked}
onChange={onChange}
readOnly={onChange ? false : true}
/>
<span className="relative">
<span

View File

@ -25,6 +25,7 @@ import {
Document360Icon,
GoogleSitesIcon,
GongIcon,
ZoomInIcon,
} from "@/components/icons/icons";
import { getAuthDisabledSS, getCurrentUserSS } from "@/lib/userSS";
import { redirect } from "next/navigation";
@ -272,6 +273,15 @@ export async function Layout({ children }: { children: React.ReactNode }) {
),
link: "/admin/documents/sets",
},
{
name: (
<div className="flex">
<ZoomInIcon size={18} />
<div className="ml-1">Explorer</div>
</div>
),
link: "/admin/documents/explorer",
},
{
name: (
<div className="flex">

View File

@ -4,7 +4,7 @@ import { getSourceIcon } from "../source";
import { useState } from "react";
import { PopupSpec } from "../admin/connectors/Popup";
const buildDocumentSummaryDisplay = (
export const buildDocumentSummaryDisplay = (
matchHighlights: string[],
blurb: string
) => {

View File

@ -1,4 +1,4 @@
import React, { useState, KeyboardEvent, ChangeEvent } from "react";
import React, { KeyboardEvent, ChangeEvent } from "react";
import { MagnifyingGlass } from "@phosphor-icons/react";
interface SearchBarProps {

View File

@ -32,6 +32,7 @@ export interface DanswerDocument {
blurb: string;
semantic_identifier: string | null;
boost: number;
hidden: boolean;
score: number;
match_highlights: string[];
}