Reduce ranking scores for short chunks without actual information (#4098)

* remove title for slack
* initial working code
* simplification
* improvements
* name change to information_content_model
* avoid boost_score > 1.0
* nit
* EL comments and improvements
  Improvements:
  - proper import of information content model from cache or HF
  - warm up for information content model
  Other:
  - EL PR review comments
* nit
* requirements version update
* fixed docker file
* new home for model_server configs
* default off
* small updates
* YS comments - pt 1
* renaming to chunk_boost & chunk table def
* saving and deleting chunk stats in new table
* saving and updating chunk stats
* improved dict score update
* create columns for individual boost factors
* RK comments
* Update migration
* manual import reordering
2025-09-12 13:56:12 +02:00 · 2025-03-13 10:35:45 -07:00
parent ba82888e1e
commit 463340b8a1
31 changed files with 898 additions and 34 deletions
--- a/backend/onyx/server/onyx_api/ingestion.py
+++ b/backend/onyx/server/onyx_api/ingestion.py
@@ -19,6 +19,9 @@ from onyx.db.search_settings import get_secondary_search_settings
 from onyx.document_index.factory import get_default_document_index
 from onyx.indexing.embedder import DefaultIndexingEmbedder
 from onyx.indexing.indexing_pipeline import build_indexing_pipeline
+from onyx.natural_language_processing.search_nlp_models import (
+    InformationContentClassificationModel,
+)
 from onyx.server.onyx_api.models import DocMinimalInfo
 from onyx.server.onyx_api.models import IngestionDocument
 from onyx.server.onyx_api.models import IngestionResult
@@ -102,8 +105,11 @@ def upsert_ingestion_doc(
        search_settings=search_settings
    )

+    information_content_classification_model = InformationContentClassificationModel()
+
    indexing_pipeline = build_indexing_pipeline(
        embedder=index_embedding_model,
+        information_content_classification_model=information_content_classification_model,
        document_index=curr_doc_index,
        ignore_time_skip=True,
        db_session=db_session,
@@ -138,6 +144,7 @@ def upsert_ingestion_doc(

        sec_ind_pipeline = build_indexing_pipeline(
            embedder=new_index_embedding_model,
+            information_content_classification_model=information_content_classification_model,
            document_index=sec_doc_index,
            ignore_time_skip=True,
            db_session=db_session,