mirror of https://github.com/danswer-ai/danswer.git

Revert "More efficient Vespa indexing (#3552)"

This reverts commit 27832167819f063bc2f4436765ff27f0ccba2b64.

parent 9b65c23a7e
commit 3eb72e5c1d
@@ -359,6 +359,7 @@ class WebConnector(LoadConnector):
                     continue
+
                 parsed_html = web_html_cleanup(soup, self.mintlify_cleanup)

                 doc_batch.append(
                     Document(
                         id=current_url,
@@ -504,7 +504,6 @@ class Document(Base):
     last_synced: Mapped[datetime.datetime | None] = mapped_column(
         DateTime(timezone=True), nullable=True, index=True
     )
-
     # The following are not attached to User because the account/email may not be known
     # within Onyx
     # Something like the document creator
@@ -148,6 +148,7 @@ class Indexable(abc.ABC):
     def index(
         self,
         chunks: list[DocMetadataAwareIndexChunk],
+        fresh_index: bool = False,
     ) -> set[DocumentInsertionRecord]:
         """
         Takes a list of document chunks and indexes them in the document index

@@ -165,9 +166,15 @@ class Indexable(abc.ABC):
         only needs to index chunks into the PRIMARY index. Do not update the secondary index here,
         it is done automatically outside of this code.

+        NOTE: The fresh_index parameter, when set to True, assumes no documents have been previously
+        indexed for the given index/tenant. This can be used to optimize the indexing process for
+        new or empty indices.
+
         Parameters:
         - chunks: Document chunks with all of the information needed for indexing to the document
           index.
+        - fresh_index: Boolean indicating whether this is a fresh index with no existing documents.
+
         Returns:
         List of document ids which map to unique documents and are used for deduping chunks
         when updating, as well as if the document is newly indexed or already existed and
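The restored fresh_index flag lets a caller skip the existence lookup when the index is known to be empty. A minimal usage sketch, assuming document_index is any Indexable implementation and chunks came out of the indexing pipeline:

    # Normal incremental indexing: existing versions of these documents are
    # detected so their stale chunks can be cleaned up before the new feed.
    insertion_records = document_index.index(chunks=chunks)

    # Seeding a brand-new index/tenant: nothing can already exist, so the
    # lookup-and-delete pass is skipped entirely.
    insertion_records = document_index.index(chunks=chunks, fresh_index=True)

    for record in insertion_records:
        print(record.document_id, record.already_existed)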
@@ -10,9 +10,6 @@ schema DANSWER_CHUNK_NAME {
         field chunk_id type int {
             indexing: summary | attribute
         }
-        field current_index_time type int {
-            indexing: summary | attribute
-        }
         # Displayed in the UI as the main identifier for the doc
         field semantic_identifier type string {
             indexing: summary | attribute
@@ -42,15 +42,12 @@ from onyx.document_index.vespa.deletion import delete_vespa_docs
 from onyx.document_index.vespa.indexing_utils import batch_index_vespa_chunks
 from onyx.document_index.vespa.indexing_utils import clean_chunk_id_copy
 from onyx.document_index.vespa.indexing_utils import (
-    find_existing_docs_in_vespa_by_doc_id,
+    get_existing_documents_from_chunks,
 )
 from onyx.document_index.vespa.shared_utils.utils import get_vespa_http_client
 from onyx.document_index.vespa.shared_utils.utils import (
     replace_invalid_doc_id_characters,
 )
-from onyx.document_index.vespa.shared_utils.vespa_request_builders import (
-    build_deletion_selection_query,
-)
 from onyx.document_index.vespa.shared_utils.vespa_request_builders import (
     build_vespa_filters,
 )
@@ -310,35 +307,47 @@ class VespaIndex(DocumentIndex):
     def index(
         self,
         chunks: list[DocMetadataAwareIndexChunk],
+        fresh_index: bool = False,
     ) -> set[DocumentInsertionRecord]:
-        """
-        Index a list of chunks into Vespa. We rely on 'current_index_time'
-        to keep track of when each chunk was added/updated in the index. We also raise a ValueError
-        if any chunk is missing a 'current_index_time' timestamp.
-        """
+        """Receive a list of chunks from a batch of documents and index the chunks into Vespa along
+        with updating the associated permissions. Assumes that a document will not be split into
+        multiple chunk batches calling this function multiple times, otherwise only the last set of
+        chunks will be kept"""
+        # IMPORTANT: This must be done one index at a time, do not use secondary index here

-        # Clean chunks if needed (remove invalid chars, etc.)
         cleaned_chunks = [clean_chunk_id_copy(chunk) for chunk in chunks]

-        # We will store the set of doc_ids that previously existed in Vespa
-        doc_ids_to_current_index_time = {
-            chunk.source_document.id: chunk.current_index_time
-            for chunk in cleaned_chunks
-        }
-        existing_doc_ids = set()
+        existing_docs: set[str] = set()

-        with get_vespa_http_client() as http_client, concurrent.futures.ThreadPoolExecutor(
-            max_workers=NUM_THREADS
-        ) as executor:
-            # a) Find which docs already exist in Vespa
-            existing_doc_ids = find_existing_docs_in_vespa_by_doc_id(
-                doc_ids=list(doc_ids_to_current_index_time.keys()),
+        # NOTE: using `httpx` here since `requests` doesn't support HTTP2. This is beneficial for
+        # indexing / updates / deletes since we have to make a large volume of requests.
+        with (
+            concurrent.futures.ThreadPoolExecutor(max_workers=NUM_THREADS) as executor,
+            get_vespa_http_client() as http_client,
+        ):
+            if not fresh_index:
+                # Check for existing documents, existing documents need to have all of their chunks deleted
+                # prior to indexing as the document size (num chunks) may have shrunk
+                first_chunks = [
+                    chunk for chunk in cleaned_chunks if chunk.chunk_id == 0
+                ]
+                for chunk_batch in batch_generator(first_chunks, BATCH_SIZE):
+                    existing_docs.update(
+                        get_existing_documents_from_chunks(
+                            chunks=chunk_batch,
+                            index_name=self.index_name,
+                            http_client=http_client,
+                            executor=executor,
+                        )
+                    )
+
+                for doc_id_batch in batch_generator(existing_docs, BATCH_SIZE):
+                    delete_vespa_docs(
+                        document_ids=doc_id_batch,
                         index_name=self.index_name,
                         http_client=http_client,
                         executor=executor,
                     )

-            # b) Feed new/updated chunks in batches
             for chunk_batch in batch_generator(cleaned_chunks, BATCH_SIZE):
                 batch_index_vespa_chunks(
                     chunks=chunk_batch,
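Both the existence check and the feed loop above cap request fan-out with batch_generator and BATCH_SIZE (128, per the Vespa constants further down). A minimal sketch of the assumed semantics (an illustration, not the actual Onyx helper):

    from collections.abc import Generator, Iterable
    from typing import TypeVar

    T = TypeVar("T")

    def batch_generator(
        items: Iterable[T], batch_size: int
    ) -> Generator[list[T], None, None]:
        # Yield successive lists of at most batch_size items each.
        batch: list[T] = []
        for item in items:
            batch.append(item)
            if len(batch) == batch_size:
                yield batch
                batch = []
        if batch:
            yield batch  # final partial batch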
@@ -348,34 +357,14 @@ class VespaIndex(DocumentIndex):
                     executor=executor,
                 )

-            # c) Remove chunks with using versioning scheme 'current_index_time'
-            for doc_id in existing_doc_ids:
-                version_cutoff = int(doc_ids_to_current_index_time[doc_id].timestamp())
-                query_str = build_deletion_selection_query(
-                    doc_id=doc_id,
-                    version_cutoff=version_cutoff,
-                    doc_type=self.index_name,
-                )
-                delete_url = (
-                    f"{DOCUMENT_ID_ENDPOINT.format(index_name=self.index_name)}/"
-                    f"?{query_str}&cluster={DOCUMENT_INDEX_NAME}"
-                )
-                try:
-                    resp = http_client.delete(delete_url)
-                    resp.raise_for_status()
-                except httpx.HTTPStatusError:
-                    logger.exception(
-                        f"Selection-based delete failed for doc_id='{doc_id}'"
-                    )
-                    raise
+            all_doc_ids = {chunk.source_document.id for chunk in cleaned_chunks}

-            # Produce insertion records specifying which documents existed prior
             return {
                 DocumentInsertionRecord(
                     document_id=doc_id,
-                    already_existed=(doc_id in existing_doc_ids),
+                    already_existed=doc_id in existing_docs,
                 )
-                for doc_id in doc_ids_to_current_index_time
+                for doc_id in all_doc_ids
             }

     @staticmethod
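The removed loop above is the core of the reverted optimization: instead of deleting every chunk of an existing document and re-feeding, each chunk carried a current_index_time version stamp, and one selection-based HTTP DELETE per document swept away any chunk older than the cutoff of the current pass. A sketch of the URL that code assembled; the host, namespace, document type, and cluster name below are illustrative assumptions, not values from the repository:

    import urllib.parse

    doc_type = "danswer_chunk"  # illustrative
    doc_id = "https://docs.example.com/page"
    version_cutoff = 1735689600  # epoch seconds captured for this indexing pass

    selection = (
        f"({doc_type}.document_id=='{doc_id}') and "
        f"({doc_type}.current_index_time < {version_cutoff})"
    )
    query_str = urllib.parse.urlencode({"selection": selection})

    # One DELETE removes all stale chunks of the document; the endpoint shape
    # follows Vespa's /document/v1 selection-based delete.
    delete_url = (
        f"http://localhost:8080/document/v1/default/{doc_type}/docid/"
        f"?{query_str}&cluster=danswer_index"
    )
    print(delete_url)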
@@ -1,11 +1,8 @@
 import concurrent.futures
 import json
-import urllib.parse
 from datetime import datetime
 from datetime import timezone
 from http import HTTPStatus
-from typing import List
-from typing import Set

 import httpx
 from retry import retry

@@ -24,7 +21,6 @@ from onyx.document_index.vespa_constants import BOOST
 from onyx.document_index.vespa_constants import CHUNK_ID
 from onyx.document_index.vespa_constants import CONTENT
 from onyx.document_index.vespa_constants import CONTENT_SUMMARY
-from onyx.document_index.vespa_constants import CURRENT_INDEX_TIME
 from onyx.document_index.vespa_constants import DOC_UPDATED_AT
 from onyx.document_index.vespa_constants import DOCUMENT_ID
 from onyx.document_index.vespa_constants import DOCUMENT_ID_ENDPOINT

@@ -36,7 +32,6 @@ from onyx.document_index.vespa_constants import METADATA_LIST
 from onyx.document_index.vespa_constants import METADATA_SUFFIX
 from onyx.document_index.vespa_constants import NUM_THREADS
 from onyx.document_index.vespa_constants import PRIMARY_OWNERS
-from onyx.document_index.vespa_constants import SEARCH_ENDPOINT
 from onyx.document_index.vespa_constants import SECONDARY_OWNERS
 from onyx.document_index.vespa_constants import SECTION_CONTINUATION
 from onyx.document_index.vespa_constants import SEMANTIC_IDENTIFIER

@@ -173,7 +168,6 @@ def _index_vespa_chunk(
         METADATA_SUFFIX: chunk.metadata_suffix_keyword,
         EMBEDDINGS: embeddings_name_vector_map,
         TITLE_EMBEDDING: chunk.title_embedding,
-        CURRENT_INDEX_TIME: _vespa_get_updated_at_attribute(chunk.current_index_time),
         DOC_UPDATED_AT: _vespa_get_updated_at_attribute(document.doc_updated_at),
         PRIMARY_OWNERS: get_experts_stores_representations(document.primary_owners),
         SECONDARY_OWNERS: get_experts_stores_representations(document.secondary_owners),

@@ -254,85 +248,3 @@ def clean_chunk_id_copy(
         }
     )
     return clean_chunk
-
-
-def _does_doc_exist_in_vespa(
-    doc_id: str,
-    index_name: str,
-    http_client: httpx.Client,
-) -> bool:
-    """
-    Checks whether there's a chunk/doc matching doc_id in Vespa using YQL.
-    """
-    encoded_doc_id = urllib.parse.quote(doc_id)
-
-    # Construct the URL with YQL query
-    url = (
-        f"{SEARCH_ENDPOINT}"
-        f'?yql=select+*+from+sources+{index_name}+where+document_id+contains+"{encoded_doc_id}"'
-        "&hits=0"
-    )
-
-    logger.debug(f"Checking existence for doc_id={doc_id} with URL={url}")
-    resp = http_client.get(url)
-
-    if resp.status_code == 200:
-        data = resp.json()
-        try:
-            total_count = data["root"]["fields"]["totalCount"]
-            return total_count > 0
-        except (KeyError, TypeError):
-            logger.exception(f"Unexpected JSON structure from {url}: {data}")
-            raise
-
-    elif resp.status_code == 404:
-        return False
-
-    else:
-        logger.warning(
-            f"Unexpected HTTP {resp.status_code} checking doc existence for doc_id={doc_id}"
-        )
-        return False
-
-
-def find_existing_docs_in_vespa_by_doc_id(
-    doc_ids: List[str],
-    index_name: str,
-    http_client: httpx.Client,
-    executor: concurrent.futures.ThreadPoolExecutor | None = None,
-) -> Set[str]:
-    """
-    For each doc_id in doc_ids, returns whether it already exists in Vespa.
-    We do this concurrently for performance if doc_ids is large.
-    """
-    if not doc_ids:
-        return set()
-
-    external_executor = True
-    if executor is None:
-        # Create our own if not given
-        external_executor = False
-        executor = concurrent.futures.ThreadPoolExecutor(max_workers=NUM_THREADS)
-
-    existing_doc_ids = set()
-
-    try:
-        future_map = {
-            executor.submit(
-                _does_doc_exist_in_vespa, doc_id, index_name, http_client
-            ): doc_id
-            for doc_id in doc_ids
-        }
-        for future in concurrent.futures.as_completed(future_map):
-            doc_id = future_map[future]
-            try:
-                if future.result():
-                    existing_doc_ids.add(doc_id)
-            except Exception:
-                logger.exception(f"Error checking doc existence for doc_id={doc_id}")
-                raise
-
-    finally:
-        if not external_executor:
-            executor.shutdown(wait=True)
-    return existing_doc_ids
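The two removed helpers fan one YQL existence probe per document out across a thread pool, sharing the caller's HTTP client. A usage sketch under assumed wiring; in the reverted code the client and executor came from VespaIndex.index, and the index name here is illustrative:

    import concurrent.futures

    import httpx

    with httpx.Client() as http_client, concurrent.futures.ThreadPoolExecutor(
        max_workers=8
    ) as executor:
        existing = find_existing_docs_in_vespa_by_doc_id(
            doc_ids=["doc-a", "doc-b", "doc-c"],
            index_name="danswer_chunk",  # illustrative index name
            http_client=http_client,
            executor=executor,
        )
        # Only ids that already have at least one chunk in Vespa come back.
        print(existing)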
@@ -1,14 +1,12 @@
 from datetime import datetime
 from datetime import timedelta
 from datetime import timezone
-from urllib.parse import urlencode

 from onyx.configs.constants import INDEX_SEPARATOR
 from onyx.context.search.models import IndexFilters
 from onyx.document_index.interfaces import VespaChunkRequest
 from onyx.document_index.vespa_constants import ACCESS_CONTROL_LIST
 from onyx.document_index.vespa_constants import CHUNK_ID
-from onyx.document_index.vespa_constants import CURRENT_INDEX_TIME
 from onyx.document_index.vespa_constants import DOC_UPDATED_AT
 from onyx.document_index.vespa_constants import DOCUMENT_ID
 from onyx.document_index.vespa_constants import DOCUMENT_SETS

@@ -108,24 +106,3 @@ def build_vespa_id_based_retrieval_yql(

     id_based_retrieval_yql_section += ")"
     return id_based_retrieval_yql_section
-
-
-def build_deletion_selection_query(
-    doc_id: str, version_cutoff: int, doc_type: str
-) -> str:
-    """
-    Build a Vespa selection expression that includes:
-      - {doc_type}.document_id == <doc_id>
-      - {doc_type}.current_index_time < version_cutoff
-
-    Returns the URL-encoded selection query parameter.
-    """
-    # Escape single quotes by doubling them for Vespa selection expressions
-    escaped_doc_id = doc_id.replace("'", "''")
-
-    filter_str = (
-        f"({doc_type}.document_id=='{escaped_doc_id}') and "
-        f"({doc_type}.{CURRENT_INDEX_TIME} < {version_cutoff})"
-    )
-
-    return urlencode({"selection": filter_str})
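For a concrete feel of what the removed builder returned, a quick standard-library check with made-up inputs (the document type and id are illustrative):

    from urllib.parse import urlencode

    filter_str = (
        "(danswer_chunk.document_id=='https://docs.example.com/page') and "
        "(danswer_chunk.current_index_time < 1735689600)"
    )
    print(urlencode({"selection": filter_str}))
    # selection=%28danswer_chunk.document_id%3D%3D%27https%3A%2F%2Fdocs.example.com
    # %2Fpage%27%29+and+%28danswer_chunk.current_index_time+%3C+1735689600%29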
@@ -52,7 +52,6 @@ BATCH_SIZE = 128  # Specific to Vespa

 TENANT_ID = "tenant_id"
 DOCUMENT_ID = "document_id"
-CURRENT_INDEX_TIME = "current_index_time"
 CHUNK_ID = "chunk_id"
 BLURB = "blurb"
 CONTENT = "content"
@@ -1,7 +1,5 @@
 import traceback
 from collections.abc import Callable
-from datetime import datetime
-from datetime import timezone
 from functools import partial
 from http import HTTPStatus
 from typing import Protocol

@@ -402,8 +400,6 @@ def index_doc_batch(
                 else DEFAULT_BOOST
             ),
             tenant_id=tenant_id,
-            # Use a timezone-aware datetime, here we set to current UTC time
-            current_index_time=datetime.now(tz=timezone.utc),
         )
         for chunk in chunks_with_embeddings
     ]
@@ -1,4 +1,3 @@
-import datetime
 from typing import TYPE_CHECKING

 from pydantic import BaseModel

@@ -74,14 +73,12 @@ class DocMetadataAwareIndexChunk(IndexChunk):
     of. This is used for filtering / personas.
     boost: influences the ranking of this chunk at query time. Positive -> ranked higher,
     negative -> ranked lower.
-    current_index_time: the timestamp of when this chunk is being indexed.
     """

     tenant_id: str | None = None
     access: "DocumentAccess"
     document_sets: set[str]
     boost: int
-    current_index_time: datetime.datetime

     @classmethod
     def from_index_chunk(

@@ -91,7 +88,6 @@ class DocMetadataAwareIndexChunk(IndexChunk):
         document_sets: set[str],
         boost: int,
         tenant_id: str | None,
-        current_index_time: datetime.datetime,
     ) -> "DocMetadataAwareIndexChunk":
         index_chunk_data = index_chunk.model_dump()
         return cls(

@@ -100,7 +96,6 @@ class DocMetadataAwareIndexChunk(IndexChunk):
             document_sets=document_sets,
             boost=boost,
             tenant_id=tenant_id,
-            current_index_time=current_index_time,
         )
@@ -86,7 +86,6 @@ def _create_indexable_chunks(
             access=default_public_access,
             document_sets=set(),
             boost=DEFAULT_BOOST,
-            current_index_time=datetime.datetime.now(datetime.timezone.utc),
         )
         chunks.append(chunk)

@@ -218,7 +217,7 @@ def seed_initial_documents(
     # as we just sent over the Vespa schema and there is a slight delay

     index_with_retries = retry_builder(tries=15)(document_index.index)
-    index_with_retries(chunks=chunks)
+    index_with_retries(chunks=chunks, fresh_index=cohere_enabled)

     # Mock a run for the UI even though it did not actually call out to anything
     mock_successful_index_attempt(
@@ -10,7 +10,6 @@ Then run test_query_times.py to test query times.
 """
 import random
 from datetime import datetime
-from datetime import timezone

 from onyx.access.models import DocumentAccess
 from onyx.configs.constants import DocumentSource

@@ -97,7 +96,6 @@ def generate_dummy_chunk(
         document_sets={document_set for document_set in document_set_names},
         boost=random.randint(-1, 1),
         tenant_id=POSTGRES_DEFAULT_SCHEMA,
-        current_index_time=datetime.now(tz=timezone.utc),
     )
@@ -81,6 +81,7 @@ export const ChatFilters = forwardRef<HTMLDivElement, ChatFiltersProps>(
     const dedupedDocuments = removeDuplicateDocs(currentDocuments || []);

     const tokenLimitReached = selectedDocumentTokens > maxTokens - 75;
+    console.log("SELECTED MESSAGE is", selectedMessage);

     const hasSelectedDocuments = selectedDocumentIds.length > 0;