mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-09-12 13:56:12 +02:00
Reduce ranking scores for short chunks without actual information (#4098)
* remove title for slack * initial working code * simplification * improvements * name change to information_content_model * avoid boost_score > 1.0 * nit * EL comments and improvements Improvements: - proper import of information content model from cache or HF - warm up for information content model Other: - EL PR review comments * nit * requirements version update * fixed docker file * new home for model_server configs * default off * small updates * YS comments - pt 1 * renaming to chunk_boost & chunk table def * saving and deleting chunk stats in new table * saving and updating chunk stats * improved dict score update * create columns for individual boost factors * RK comments * Update migration * manual import reordering
This commit is contained in:
@@ -19,6 +19,9 @@ from onyx.db.search_settings import get_secondary_search_settings
|
||||
from onyx.document_index.factory import get_default_document_index
|
||||
from onyx.indexing.embedder import DefaultIndexingEmbedder
|
||||
from onyx.indexing.indexing_pipeline import build_indexing_pipeline
|
||||
from onyx.natural_language_processing.search_nlp_models import (
|
||||
InformationContentClassificationModel,
|
||||
)
|
||||
from onyx.server.onyx_api.models import DocMinimalInfo
|
||||
from onyx.server.onyx_api.models import IngestionDocument
|
||||
from onyx.server.onyx_api.models import IngestionResult
|
||||
@@ -102,8 +105,11 @@ def upsert_ingestion_doc(
|
||||
search_settings=search_settings
|
||||
)
|
||||
|
||||
information_content_classification_model = InformationContentClassificationModel()
|
||||
|
||||
indexing_pipeline = build_indexing_pipeline(
|
||||
embedder=index_embedding_model,
|
||||
information_content_classification_model=information_content_classification_model,
|
||||
document_index=curr_doc_index,
|
||||
ignore_time_skip=True,
|
||||
db_session=db_session,
|
||||
@@ -138,6 +144,7 @@ def upsert_ingestion_doc(
|
||||
|
||||
sec_ind_pipeline = build_indexing_pipeline(
|
||||
embedder=new_index_embedding_model,
|
||||
information_content_classification_model=information_content_classification_model,
|
||||
document_index=sec_doc_index,
|
||||
ignore_time_skip=True,
|
||||
db_session=db_session,
|
||||
|
Reference in New Issue
Block a user