mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-09-26 20:08:38 +02:00
Document explorer admin page (#590)
This commit is contained in:
@@ -93,6 +93,7 @@ class InferenceChunk(BaseChunk):
|
||||
semantic_identifier: str
|
||||
boost: int
|
||||
score: float | None
|
||||
hidden: bool
|
||||
metadata: dict[str, Any]
|
||||
# Matched sections in the chunk. Uses Vespa syntax e.g. <hi>TEXT</hi>
|
||||
# to specify that a set of words should be highlighted. For example:
|
||||
|
@@ -8,6 +8,7 @@ SOURCE_TYPE = "source_type"
|
||||
SOURCE_LINKS = "source_links"
|
||||
SOURCE_LINK = "link"
|
||||
SEMANTIC_IDENTIFIER = "semantic_identifier"
|
||||
TITLE = "title"
|
||||
SECTION_CONTINUATION = "section_continuation"
|
||||
EMBEDDINGS = "embeddings"
|
||||
ALLOWED_USERS = "allowed_users"
|
||||
|
@@ -24,8 +24,14 @@ class Document:
|
||||
id: str # This must be unique or during indexing/reindexing, chunks will be overwritten
|
||||
sections: list[Section]
|
||||
source: DocumentSource
|
||||
semantic_identifier: str
|
||||
semantic_identifier: str # displayed in the UI as the main identifier for the doc
|
||||
metadata: dict[str, Any]
|
||||
# `title` is used when computing best matches for a query
|
||||
# if `None`, then we will use the `semantic_identifier` as the title in Vespa
|
||||
title: str | None = None
|
||||
|
||||
def get_title_for_document_index(self) -> str:
    """Return the title that should be written to the document index.

    An explicitly set `title` is used as-is (an empty string counts as set);
    only when `title` is `None` does the `semantic_identifier` stand in.
    """
    if self.title is not None:
        return self.title
    return self.semantic_identifier
|
||||
|
||||
def to_short_descriptor(self) -> str:
|
||||
"""Used when logging the identity of a document"""
|
||||
|
@@ -148,6 +148,7 @@ def thread_to_doc(
|
||||
],
|
||||
source=DocumentSource.SLACK,
|
||||
semantic_identifier=channel["name"],
|
||||
title="", # slack docs don't really have a "title"
|
||||
metadata={},
|
||||
)
|
||||
|
||||
@@ -302,6 +303,7 @@ class SlackLoadConnector(LoadConnector):
|
||||
],
|
||||
source=matching_doc.source,
|
||||
semantic_identifier=matching_doc.semantic_identifier,
|
||||
title="", # slack docs don't really have a "title"
|
||||
metadata=matching_doc.metadata,
|
||||
)
|
||||
|
||||
@@ -319,6 +321,7 @@ class SlackLoadConnector(LoadConnector):
|
||||
],
|
||||
source=DocumentSource.SLACK,
|
||||
semantic_identifier=channel["name"],
|
||||
title="", # slack docs don't really have a "title"
|
||||
metadata={},
|
||||
)
|
||||
|
||||
|
@@ -37,9 +37,21 @@ schema danswer_chunk {
|
||||
field source_links type string {
|
||||
indexing: summary | attribute
|
||||
}
|
||||
# displayed in the UI as the main identifier for the doc
|
||||
field semantic_identifier type string {
|
||||
indexing: summary | attribute
|
||||
}
|
||||
# this is used when computing best matches based on the title of the document
|
||||
# may not always match the `semantic_identifier` e.g. for Slack docs the
|
||||
# `semantic_identifier` will be the channel name, but the `title` will be empty
|
||||
field title type string {
|
||||
indexing: summary | index
|
||||
match {
|
||||
gram
|
||||
gram-size: 3
|
||||
}
|
||||
index: enable-bm25
|
||||
}
|
||||
field section_continuation type bool {
|
||||
indexing: summary | attribute
|
||||
}
|
||||
@@ -70,7 +82,7 @@ schema danswer_chunk {
|
||||
}
|
||||
|
||||
fieldset default {
|
||||
fields: content
|
||||
fields: content, title
|
||||
}
|
||||
|
||||
rank-profile keyword_search inherits default {
|
||||
@@ -103,4 +115,11 @@ schema danswer_chunk {
|
||||
}
|
||||
match-features: closest(embeddings)
|
||||
}
|
||||
|
||||
# used when searching from the admin UI for a specific doc to hide / boost
|
||||
rank-profile admin_search inherits default {
|
||||
first-phase {
|
||||
expression: bm25(content) + (100 * bm25(title))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@@ -38,6 +38,7 @@ from danswer.configs.constants import SECTION_CONTINUATION
|
||||
from danswer.configs.constants import SEMANTIC_IDENTIFIER
|
||||
from danswer.configs.constants import SOURCE_LINKS
|
||||
from danswer.configs.constants import SOURCE_TYPE
|
||||
from danswer.configs.constants import TITLE
|
||||
from danswer.configs.model_configs import SEARCH_DISTANCE_CUTOFF
|
||||
from danswer.datastores.datastore_utils import get_uuid_from_chunk
|
||||
from danswer.datastores.interfaces import DocumentIndex
|
||||
@@ -166,6 +167,7 @@ def _index_vespa_chunk(
|
||||
SOURCE_TYPE: str(document.source.value),
|
||||
SOURCE_LINKS: json.dumps(chunk.source_links),
|
||||
SEMANTIC_IDENTIFIER: document.semantic_identifier,
|
||||
TITLE: document.get_title_for_document_index(),
|
||||
SECTION_CONTINUATION: chunk.section_continuation,
|
||||
METADATA: json.dumps(document.metadata),
|
||||
EMBEDDINGS: embeddings_name_vector_map,
|
||||
@@ -264,7 +266,9 @@ def _index_vespa_chunks(
|
||||
return insertion_records
|
||||
|
||||
|
||||
def _build_vespa_filters(filters: list[IndexFilter] | None) -> str:
|
||||
def _build_vespa_filters(
|
||||
filters: list[IndexFilter] | None, include_hidden: bool = False
|
||||
) -> str:
|
||||
# NOTE: permissions filters are expected to be passed in directly via
|
||||
# the `filters` arg, which is why they are not considered explicitly here
|
||||
|
||||
@@ -272,8 +276,9 @@ def _build_vespa_filters(filters: list[IndexFilter] | None) -> str:
|
||||
# via the `filters` arg. These are set either in the Web UI or in the Slack
|
||||
# listener
|
||||
|
||||
# ignore hidden docs
|
||||
filter_str = f"!({HIDDEN}=true) and "
|
||||
# usually ignore hidden docs unless explicitly requested. We may want to
|
||||
# get hidden docs on the admin panel to allow for un-hiding
|
||||
# hidden docs are excluded by default; the exclusion clause is dropped only
# when the caller explicitly passes `include_hidden=True`
filter_str = "" if include_hidden else f"!({HIDDEN}=true) and "
|
||||
|
||||
# Handle provided query filters
|
||||
if filters:
|
||||
@@ -389,6 +394,7 @@ class VespaIndex(DocumentIndex):
|
||||
f"{SEMANTIC_IDENTIFIER}, "
|
||||
f"{SECTION_CONTINUATION}, "
|
||||
f"{BOOST}, "
|
||||
f"{HIDDEN}, "
|
||||
f"{METADATA} "
|
||||
f"{CONTENT_SUMMARY} "
|
||||
f"from {DOCUMENT_INDEX_NAME} where "
|
||||
@@ -604,3 +610,32 @@ class VespaIndex(DocumentIndex):
|
||||
}
|
||||
|
||||
return _query_vespa(params)
|
||||
|
||||
def admin_retrieval(
    self,
    query: str,
    user_id: UUID | None,
    filters: list[IndexFilter] | None,
    num_to_retrieve: int = NUM_RETURNED_HITS,
) -> list[InferenceChunk]:
    """Keyword search used by the admin UI to find a doc to hide / boost.

    Args:
        query: raw user query, bound to `@query` in the YQL.
        user_id: accepted for interface parity; not used by this method.
        filters: index filters turned into the YQL `where` prefix.
        num_to_retrieve: maximum number of hits requested from Vespa.

    Returns:
        The chunks returned by `_query_vespa`, ranked with the
        `admin_search` rank profile.
    """
    # The admin page must be able to see hidden docs so they can be un-hidden,
    # hence hidden docs are explicitly requested here.
    vespa_where_clauses = _build_vespa_filters(filters, include_hidden=True)
    yql = (
        VespaIndex.yql_base
        + vespa_where_clauses
        + '({grammar: "weakAnd"}userInput(@query) '
        # `({defaultIndex: "content_summary"}userInput(@query))` section is
        # needed for highlighting while the N-gram highlighting is broken /
        # not working as desired
        + f'or ({{defaultIndex: "{CONTENT_SUMMARY}"}}userInput(@query)))'
        + _build_vespa_limit(num_to_retrieve)
    )

    params: dict[str, str | int] = {
        "yql": yql,
        "query": query,
        "hits": num_to_retrieve,
        "num_to_rerank": 10 * num_to_retrieve,
        "ranking.profile": "admin_search",
    }

    return _query_vespa(params)
|
||||
|
@@ -49,6 +49,7 @@ def chunks_to_search_docs(chunks: list[InferenceChunk] | None) -> list[SearchDoc
|
||||
blurb=chunk.blurb,
|
||||
source_type=chunk.source_type,
|
||||
boost=chunk.boost,
|
||||
hidden=chunk.hidden,
|
||||
score=chunk.score,
|
||||
match_highlights=chunk.match_highlights,
|
||||
)
|
||||
|
@@ -147,6 +147,10 @@ class SearchDoc(BaseModel):
|
||||
blurb: str
|
||||
source_type: str
|
||||
boost: int
|
||||
# whether the document is hidden when doing a standard search
|
||||
# since a standard search will never find a hidden doc, this can only ever
|
||||
# be `True` when doing an admin search
|
||||
hidden: bool
|
||||
score: float | None
|
||||
# Matched sections in the doc. Uses Vespa syntax e.g. <hi>TEXT</hi>
|
||||
# to specify that a set of words should be highlighted. For example:
|
||||
|
@@ -2,15 +2,20 @@ from collections.abc import Generator
|
||||
|
||||
from fastapi import APIRouter
|
||||
from fastapi import Depends
|
||||
from fastapi import HTTPException
|
||||
from fastapi.responses import StreamingResponse
|
||||
from pydantic import BaseModel
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from danswer.auth.users import current_admin_user
|
||||
from danswer.auth.users import current_user
|
||||
from danswer.chunking.models import InferenceChunk
|
||||
from danswer.configs.app_configs import DISABLE_GENERATIVE_AI
|
||||
from danswer.configs.app_configs import NUM_DOCUMENT_TOKENS_FED_TO_GENERATIVE_MODEL
|
||||
from danswer.configs.constants import IGNORE_FOR_QA
|
||||
from danswer.datastores.document_index import get_default_document_index
|
||||
from danswer.datastores.interfaces import IndexFilter
|
||||
from danswer.datastores.vespa.store import VespaIndex
|
||||
from danswer.db.engine import get_session
|
||||
from danswer.db.feedback import create_doc_retrieval_feedback
|
||||
from danswer.db.feedback import create_query_event
|
||||
@@ -38,6 +43,7 @@ from danswer.server.models import QAResponse
|
||||
from danswer.server.models import QueryValidationResponse
|
||||
from danswer.server.models import QuestionRequest
|
||||
from danswer.server.models import RerankedRetrievalDocs
|
||||
from danswer.server.models import SearchDoc
|
||||
from danswer.server.models import SearchFeedbackRequest
|
||||
from danswer.server.models import SearchResponse
|
||||
from danswer.server.utils import get_json_line
|
||||
@@ -49,6 +55,57 @@ logger = setup_logger()
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
"""Admin-only search endpoints"""
|
||||
|
||||
|
||||
class AdminSearchRequest(BaseModel):
    # Request body for the admin-only search endpoint.
    # free-text query to run against the document index
    query: str
    # optional caller-supplied index filters; `None` means no extra filters
    filters: list[IndexFilter] | None = None
|
||||
|
||||
|
||||
class AdminSearchResponse(BaseModel):
    # Response body for the admin-only search endpoint.
    # matching documents, at most one entry per document id
    documents: list[SearchDoc]
|
||||
|
||||
|
||||
@router.post("/admin/search")
def admin_search(
    question: AdminSearchRequest,
    user: User | None = Depends(current_admin_user),
    db_session: Session = Depends(get_session),
) -> AdminSearchResponse:
    # Admin-only document search: runs the query through the Vespa index's
    # `admin_retrieval` and returns one `SearchDoc` per distinct document id.
    # Responds with HTTP 400 when the default document index is not Vespa.
    search_text = question.query
    logger.info(f"Received admin search query: {search_text}")

    requester_id = user.id if user is not None else None

    # caller-supplied filters come first, followed by the user's access filters
    acl_filters = build_access_filters_for_user(user, db_session)
    combined_filters = (question.filters or []) + acl_filters

    index = get_default_document_index()
    if not isinstance(index, VespaIndex):
        raise HTTPException(
            status_code=400,
            detail="Cannot use admin-search when using a non-Vespa document index",
        )

    chunks = index.admin_retrieval(
        query=search_text, user_id=requester_id, filters=combined_filters
    )

    # Several chunks can belong to the same document; keep only the first
    # result seen per document id (dicts preserve insertion order).
    first_hit_by_id: dict[str, SearchDoc] = {}
    for search_doc in chunks_to_search_docs(chunks):
        first_hit_by_id.setdefault(search_doc.document_id, search_doc)

    return AdminSearchResponse(documents=list(first_hit_by_id.values()))
|
||||
|
||||
|
||||
"""Search endpoints for all"""
|
||||
|
||||
|
||||
@router.post("/search-intent")
|
||||
def get_search_type(
|
||||
question: QuestionRequest, _: User = Depends(current_user)
|
||||
|
@@ -113,6 +113,7 @@ class TestQAPostprocessing(unittest.TestCase):
|
||||
semantic_identifier="anything",
|
||||
section_continuation=False,
|
||||
boost=0,
|
||||
hidden=False,
|
||||
score=1,
|
||||
metadata={},
|
||||
match_highlights=[],
|
||||
@@ -127,6 +128,7 @@ class TestQAPostprocessing(unittest.TestCase):
|
||||
semantic_identifier="whatever",
|
||||
section_continuation=False,
|
||||
boost=0,
|
||||
hidden=False,
|
||||
score=1,
|
||||
metadata={},
|
||||
match_highlights=[],
|
||||
|
Reference in New Issue
Block a user