Add hiding of documents to feedback page (#585)

This commit is contained in:
Chris Weaver
2023-10-17 20:06:12 -07:00
committed by GitHub
parent e73739547a
commit 5da81a3d0d
16 changed files with 422 additions and 207 deletions

View File

@@ -24,6 +24,7 @@ PUBLIC_DOC_PAT = "PUBLIC"
PUBLIC_DOCUMENT_SET = "__PUBLIC"
QUOTE = "quote"
BOOST = "boost"
HIDDEN = "hidden"
SCORE = "score"
ID_SEPARATOR = ":;:"
DEFAULT_BOOST = 0

View File

@@ -36,6 +36,7 @@ class UpdateRequest:
access: DocumentAccess | None = None
document_sets: set[str] | None = None
boost: float | None = None
hidden: bool | None = None
class Verifiable(abc.ABC):

View File

@@ -47,6 +47,9 @@ schema danswer_chunk {
field boost type float {
indexing: summary | attribute
}
field hidden type bool {
indexing: summary | attribute
}
field metadata type string {
indexing: summary | attribute
}

View File

@@ -30,6 +30,7 @@ from danswer.configs.constants import DEFAULT_BOOST
from danswer.configs.constants import DOCUMENT_ID
from danswer.configs.constants import DOCUMENT_SETS
from danswer.configs.constants import EMBEDDINGS
from danswer.configs.constants import HIDDEN
from danswer.configs.constants import MATCH_HIGHLIGHTS
from danswer.configs.constants import METADATA
from danswer.configs.constants import SCORE
@@ -271,8 +272,10 @@ def _build_vespa_filters(filters: list[IndexFilter] | None) -> str:
# via the `filters` arg. These are set either in the Web UI or in the Slack
# listener
# ignore hidden docs
filter_str = f"!({HIDDEN}=true) and "
# Handle provided query filters
filter_str = ""
if filters:
for filter_dict in filters:
valid_filters = {
@@ -424,16 +427,26 @@ class VespaIndex(DocumentIndex):
batch_size: int = _BATCH_SIZE,
) -> None:
"""Runs a batch of updates in parallel via the ThreadPoolExecutor."""
def _update_chunk(update: _VespaUpdateRequest) -> Response:
    """Send a single update request to its target URL and return the response.

    The request's `update_request` payload is serialized to JSON once, logged
    at debug level together with the URL, and then sent as the body of a PUT.
    """
    update_body = json.dumps(update.update_request)
    logger.debug(f"Updating with request to {update.url} with body {update_body}")

    json_headers = {"Content-Type": "application/json"}
    response = requests.put(update.url, headers=json_headers, data=update_body)
    return response
with concurrent.futures.ThreadPoolExecutor(
max_workers=_NUM_THREADS
) as executor:
for update_batch in batch_generator(updates, batch_size):
future_to_document_id = {
executor.submit(
requests.put,
update.url,
headers={"Content-Type": "application/json"},
data=json.dumps(update.update_request),
_update_chunk,
update,
): update.document_id
for update in update_batch
}
@@ -451,14 +464,6 @@ class VespaIndex(DocumentIndex):
processed_updates_requests: list[_VespaUpdateRequest] = []
for update_request in update_requests:
if (
update_request.boost is None
and update_request.access is None
and update_request.document_sets is None
):
logger.error("Update request received but nothing to update")
continue
update_dict: dict[str, dict] = {"fields": {}}
if update_request.boost is not None:
update_dict["fields"][BOOST] = {"assign": update_request.boost}
@@ -474,6 +479,12 @@ class VespaIndex(DocumentIndex):
acl_entry: 1 for acl_entry in update_request.access.to_acl()
}
}
if update_request.hidden is not None:
update_dict["fields"][HIDDEN] = {"assign": update_request.hidden}
if not update_dict["fields"]:
logger.error("Update request received but nothing to update")
continue
for document_id in update_request.document_ids:
for doc_chunk_id in _get_vespa_chunk_ids_by_document_id(document_id):

View File

@@ -46,7 +46,11 @@ def fetch_docs_ranked_by_boost(
db_session: Session, ascending: bool = False, limit: int = 100
) -> list[DbDocument]:
order_func = asc if ascending else desc
stmt = select(DbDocument).order_by(order_func(DbDocument.boost)).limit(limit)
stmt = (
select(DbDocument)
.order_by(order_func(DbDocument.boost), order_func(DbDocument.semantic_id))
.limit(limit)
)
result = db_session.execute(stmt)
doc_list = result.scalars().all()
@@ -71,6 +75,24 @@ def update_document_boost(db_session: Session, document_id: str, boost: int) ->
db_session.commit()
def update_document_hidden(db_session: Session, document_id: str, hidden: bool) -> None:
    """Set the `hidden` flag of one document in the database and the document index.

    Args:
        db_session: session used to look up the document row and commit the change.
        document_id: ID of the document to update.
        hidden: new value for the document's hidden flag.

    Raises:
        ValueError: if no document row exists with the given ID.
    """
    lookup = select(DbDocument).where(DbDocument.id == document_id)
    db_document = db_session.execute(lookup).scalar_one_or_none()
    if db_document is None:
        raise ValueError(f"No document found with ID: '{document_id}'")

    db_document.hidden = hidden

    # The index update is issued before the commit: if it raises, this
    # function never reaches the commit below.
    hidden_update = UpdateRequest(document_ids=[document_id], hidden=hidden)
    document_index = get_default_document_index()
    document_index.update([hidden_update])

    db_session.commit()
def create_query_event(
query: str,
selected_flow: SearchType | None,

View File

@@ -54,6 +54,7 @@ from danswer.db.document import get_document_cnts_for_cc_pairs
from danswer.db.engine import get_session
from danswer.db.feedback import fetch_docs_ranked_by_boost
from danswer.db.feedback import update_document_boost
from danswer.db.feedback import update_document_hidden
from danswer.db.index_attempt import create_index_attempt
from danswer.db.index_attempt import get_latest_index_attempts
from danswer.db.models import User
@@ -78,6 +79,7 @@ from danswer.server.models import GDriveCallback
from danswer.server.models import GoogleAppCredentials
from danswer.server.models import GoogleServiceAccountCredentialRequest
from danswer.server.models import GoogleServiceAccountKey
from danswer.server.models import HiddenUpdateRequest
from danswer.server.models import IndexAttemptSnapshot
from danswer.server.models import ObjectCreationIdResponse
from danswer.server.models import RunConnectorRequest
@@ -133,6 +135,22 @@ def document_boost_update(
raise HTTPException(status_code=400, detail=str(e))
@router.post("/admin/doc-hidden")
def document_hidden_update(
    hidden_update: HiddenUpdateRequest,
    _: User | None = Depends(current_admin_user),
    db_session: Session = Depends(get_session),
) -> None:
    """Admin endpoint that sets the hidden flag of a single document.

    Args:
        hidden_update: request body carrying the document ID and the new
            hidden value.
        _: result of the `current_admin_user` dependency; unused in the body.
        db_session: database session supplied by the `get_session` dependency.

    Raises:
        HTTPException: status 400 with the error text as detail when
            `update_document_hidden` raises `ValueError`.
    """
    try:
        update_document_hidden(
            db_session=db_session,
            document_id=hidden_update.document_id,
            hidden=hidden_update.hidden,
        )
    except ValueError as e:
        # Chain the original error so the root cause stays visible in tracebacks.
        raise HTTPException(status_code=400, detail=str(e)) from e
@router.get("/admin/connector/google-drive/app-credential")
def check_google_app_credentials_exist(
_: User = Depends(current_admin_user),

View File

@@ -135,6 +135,11 @@ class BoostUpdateRequest(BaseModel):
boost: int
# Request payload for changing a document's hidden flag.
class HiddenUpdateRequest(BaseModel):
    # ID of the document whose hidden flag should change
    document_id: str
    # new value for the hidden flag
    hidden: bool
class SearchDoc(BaseModel):
document_id: str
semantic_identifier: str