Document explorer admin page (#590)

This commit is contained in:
Chris Weaver
2023-10-18 18:41:39 -07:00
committed by GitHub
parent a5d2759fbc
commit 1bd76f528f
20 changed files with 447 additions and 89 deletions

View File

@@ -93,6 +93,7 @@ class InferenceChunk(BaseChunk):
semantic_identifier: str
boost: int
score: float | None
hidden: bool
metadata: dict[str, Any]
# Matched sections in the chunk. Uses Vespa syntax e.g. <hi>TEXT</hi>
# to specify that a set of words should be highlighted. For example:

View File

@@ -8,6 +8,7 @@ SOURCE_TYPE = "source_type"
SOURCE_LINKS = "source_links"
SOURCE_LINK = "link"
SEMANTIC_IDENTIFIER = "semantic_identifier"
TITLE = "title"
SECTION_CONTINUATION = "section_continuation"
EMBEDDINGS = "embeddings"
ALLOWED_USERS = "allowed_users"

View File

@@ -24,8 +24,14 @@ class Document:
id: str # This must be unique or during indexing/reindexing, chunks will be overwritten
sections: list[Section]
source: DocumentSource
semantic_identifier: str
semantic_identifier: str # displayed in the UI as the main identifier for the doc
metadata: dict[str, Any]
# `title` is used when computing best matches for a query
# if `None`, then we will use the `semantic_identifier` as the title in Vespa
title: str | None = None
def get_title_for_document_index(self) -> str:
    """Return the title value to write to the document index.

    An explicitly set `title` (including the empty string) is returned
    unchanged; only a `title` of `None` falls back to `semantic_identifier`.
    """
    if self.title is not None:
        return self.title
    return self.semantic_identifier
def to_short_descriptor(self) -> str:
"""Used when logging the identity of a document"""

View File

@@ -148,6 +148,7 @@ def thread_to_doc(
],
source=DocumentSource.SLACK,
semantic_identifier=channel["name"],
title="", # slack docs don't really have a "title"
metadata={},
)
@@ -302,6 +303,7 @@ class SlackLoadConnector(LoadConnector):
],
source=matching_doc.source,
semantic_identifier=matching_doc.semantic_identifier,
title="", # slack docs don't really have a "title"
metadata=matching_doc.metadata,
)
@@ -319,6 +321,7 @@ class SlackLoadConnector(LoadConnector):
],
source=DocumentSource.SLACK,
semantic_identifier=channel["name"],
title="", # slack docs don't really have a "title"
metadata={},
)

View File

@@ -37,9 +37,21 @@ schema danswer_chunk {
field source_links type string {
indexing: summary | attribute
}
# displayed in the UI as the main identifier for the doc
field semantic_identifier type string {
indexing: summary | attribute
}
# this is used when computing best matches based on the title of the document
# may not always match the `semantic_identifier` e.g. for Slack docs the
# `semantic_identifier` will be the channel name, but the `title` will be empty
# indexed with 3-gram matching and with BM25 enabled, so rank profiles can
# score on `bm25(title)`
field title type string {
indexing: summary | index
match {
gram
gram-size: 3
}
index: enable-bm25
}
field section_continuation type bool {
indexing: summary | attribute
}
@@ -70,7 +82,7 @@ schema danswer_chunk {
}
fieldset default {
fields: content
fields: content, title
}
rank-profile keyword_search inherits default {
@@ -103,4 +115,11 @@ schema danswer_chunk {
}
match-features: closest(embeddings)
}
# used when searching from the admin UI for a specific doc to hide / boost
# first-phase score = BM25 of the content + 100 x BM25 of the title, i.e. a
# title match is weighted far more heavily than a content match
rank-profile admin_search inherits default {
first-phase {
expression: bm25(content) + (100 * bm25(title))
}
}
}

View File

@@ -38,6 +38,7 @@ from danswer.configs.constants import SECTION_CONTINUATION
from danswer.configs.constants import SEMANTIC_IDENTIFIER
from danswer.configs.constants import SOURCE_LINKS
from danswer.configs.constants import SOURCE_TYPE
from danswer.configs.constants import TITLE
from danswer.configs.model_configs import SEARCH_DISTANCE_CUTOFF
from danswer.datastores.datastore_utils import get_uuid_from_chunk
from danswer.datastores.interfaces import DocumentIndex
@@ -166,6 +167,7 @@ def _index_vespa_chunk(
SOURCE_TYPE: str(document.source.value),
SOURCE_LINKS: json.dumps(chunk.source_links),
SEMANTIC_IDENTIFIER: document.semantic_identifier,
TITLE: document.get_title_for_document_index(),
SECTION_CONTINUATION: chunk.section_continuation,
METADATA: json.dumps(document.metadata),
EMBEDDINGS: embeddings_name_vector_map,
@@ -264,7 +266,9 @@ def _index_vespa_chunks(
return insertion_records
def _build_vespa_filters(filters: list[IndexFilter] | None) -> str:
def _build_vespa_filters(
filters: list[IndexFilter] | None, include_hidden: bool = False
) -> str:
# NOTE: permissions filters are expected to be passed in directly via
# the `filters` arg, which is why they are not considered explicitly here
@@ -272,8 +276,9 @@ def _build_vespa_filters(filters: list[IndexFilter] | None) -> str:
# via the `filters` arg. These are set either in the Web UI or in the Slack
# listener
# ignore hidden docs
filter_str = f"!({HIDDEN}=true) and "
# usually ignore hidden docs unless explicitly requested. We may want to
# get hidden docs on the admin panel to allow for un-hiding.
# The exclusion clause is emitted when `include_hidden` is False (the default)
# and dropped when it is True. The previous condition was inverted, which
# returned hidden docs to every default caller and filtered them out for
# callers that explicitly asked for them.
filter_str = "" if include_hidden else f"!({HIDDEN}=true) and "
# Handle provided query filters
if filters:
@@ -389,6 +394,7 @@ class VespaIndex(DocumentIndex):
f"{SEMANTIC_IDENTIFIER}, "
f"{SECTION_CONTINUATION}, "
f"{BOOST}, "
f"{HIDDEN}, "
f"{METADATA} "
f"{CONTENT_SUMMARY} "
f"from {DOCUMENT_INDEX_NAME} where "
@@ -604,3 +610,32 @@ class VespaIndex(DocumentIndex):
}
return _query_vespa(params)
def admin_retrieval(
    self,
    query: str,
    user_id: UUID | None,
    filters: list[IndexFilter] | None,
    num_to_retrieve: int = NUM_RETURNED_HITS,
) -> list[InferenceChunk]:
    """Run a keyword query ranked by the `admin_search` rank profile.

    Args:
        query: raw user query, bound to `@query` in the YQL statement.
        user_id: accepted for signature parity; not read by this method.
        filters: index filters turned into the YQL where-clause prefix.
        num_to_retrieve: used for both the YQL limit and the `hits` param;
            `num_to_rerank` is set to ten times this value.

    Returns:
        Whatever `_query_vespa` returns for the assembled parameters.
    """
    # NOTE(review): `include_hidden` is left at its default here even though
    # this is the admin path - confirm whether `include_hidden=True` should be
    # passed, given the "allow for un-hiding" comment in `_build_vespa_filters`
    filter_clause = _build_vespa_filters(filters)

    # `({defaultIndex: "content_summary"}userInput(@query))` section is
    # needed for highlighting while the N-gram highlighting is broken /
    # not working as desired
    match_clause = (
        '({grammar: "weakAnd"}userInput(@query) '
        f'or ({{defaultIndex: "{CONTENT_SUMMARY}"}}userInput(@query)))'
    )
    limit_clause = _build_vespa_limit(num_to_retrieve)

    request_params: dict[str, str | int] = {
        "yql": "".join(
            (VespaIndex.yql_base, filter_clause, match_clause, limit_clause)
        ),
        "query": query,
        "hits": num_to_retrieve,
        "num_to_rerank": 10 * num_to_retrieve,
        "ranking.profile": "admin_search",
    }
    return _query_vespa(request_params)

View File

@@ -49,6 +49,7 @@ def chunks_to_search_docs(chunks: list[InferenceChunk] | None) -> list[SearchDoc
blurb=chunk.blurb,
source_type=chunk.source_type,
boost=chunk.boost,
hidden=chunk.hidden,
score=chunk.score,
match_highlights=chunk.match_highlights,
)

View File

@@ -147,6 +147,10 @@ class SearchDoc(BaseModel):
blurb: str
source_type: str
boost: int
# whether the document is hidden when doing a standard search
# since a standard search will never find a hidden doc, this can only ever
# be `True` when doing an admin search
hidden: bool
score: float | None
# Matched sections in the doc. Uses Vespa syntax e.g. <hi>TEXT</hi>
# to specify that a set of words should be highlighted. For example:

View File

@@ -2,15 +2,20 @@ from collections.abc import Generator
from fastapi import APIRouter
from fastapi import Depends
from fastapi import HTTPException
from fastapi.responses import StreamingResponse
from pydantic import BaseModel
from sqlalchemy.orm import Session
from danswer.auth.users import current_admin_user
from danswer.auth.users import current_user
from danswer.chunking.models import InferenceChunk
from danswer.configs.app_configs import DISABLE_GENERATIVE_AI
from danswer.configs.app_configs import NUM_DOCUMENT_TOKENS_FED_TO_GENERATIVE_MODEL
from danswer.configs.constants import IGNORE_FOR_QA
from danswer.datastores.document_index import get_default_document_index
from danswer.datastores.interfaces import IndexFilter
from danswer.datastores.vespa.store import VespaIndex
from danswer.db.engine import get_session
from danswer.db.feedback import create_doc_retrieval_feedback
from danswer.db.feedback import create_query_event
@@ -38,6 +43,7 @@ from danswer.server.models import QAResponse
from danswer.server.models import QueryValidationResponse
from danswer.server.models import QuestionRequest
from danswer.server.models import RerankedRetrievalDocs
from danswer.server.models import SearchDoc
from danswer.server.models import SearchFeedbackRequest
from danswer.server.models import SearchResponse
from danswer.server.utils import get_json_line
@@ -49,6 +55,57 @@ logger = setup_logger()
router = APIRouter()
"""Admin-only search endpoints"""
# Request body accepted by the `/admin/search` endpoint.
class AdminSearchRequest(BaseModel):
# free-text query to search for
query: str
# optional extra index filters; defaults to `None` (no extra filters)
filters: list[IndexFilter] | None = None
# Response body returned by the `/admin/search` endpoint.
class AdminSearchResponse(BaseModel):
# matching documents, one entry per distinct document id
documents: list[SearchDoc]
@router.post("/admin/search")
def admin_search(
    question: AdminSearchRequest,
    user: User | None = Depends(current_admin_user),
    db_session: Session = Depends(get_session),
) -> AdminSearchResponse:
    # Admin-only document search: combines the caller's filters with the
    # user's access filters, runs the Vespa admin retrieval, and returns at
    # most one entry per document id (the first chunk seen for that document).
    # Responds with HTTP 400 when the configured index is not a VespaIndex.
    logger.info(f"Received admin search query: {question.query}")

    requester_id = user.id if user is not None else None
    access_filters = build_access_filters_for_user(user, db_session)
    combined_filters = (question.filters or []) + access_filters

    document_index = get_default_document_index()
    if not isinstance(document_index, VespaIndex):
        raise HTTPException(
            status_code=400,
            detail="Cannot use admin-search when using a non-Vespa document index",
        )

    chunks = document_index.admin_retrieval(
        query=question.query, user_id=requester_id, filters=combined_filters
    )

    # deduplicate documents by id; `setdefault` keeps the first occurrence and
    # dict insertion order preserves the original result ordering
    first_doc_by_id: dict[str, SearchDoc] = {}
    for search_doc in chunks_to_search_docs(chunks):
        first_doc_by_id.setdefault(search_doc.document_id, search_doc)

    return AdminSearchResponse(documents=list(first_doc_by_id.values()))
"""Search endpoints for all"""
@router.post("/search-intent")
def get_search_type(
question: QuestionRequest, _: User = Depends(current_user)

View File

@@ -113,6 +113,7 @@ class TestQAPostprocessing(unittest.TestCase):
semantic_identifier="anything",
section_continuation=False,
boost=0,
hidden=False,
score=1,
metadata={},
match_highlights=[],
@@ -127,6 +128,7 @@ class TestQAPostprocessing(unittest.TestCase):
semantic_identifier="whatever",
section_continuation=False,
boost=0,
hidden=False,
score=1,
metadata={},
match_highlights=[],