Mirror of https://github.com/danswer-ai/danswer.git

Require Semantic Identifier to not be None (#255)

@@ -4,9 +4,14 @@ from dataclasses import dataclass
 from typing import Any
 from typing import cast

+from danswer.configs.constants import BLURB
 from danswer.configs.constants import METADATA
+from danswer.configs.constants import SEMANTIC_IDENTIFIER
 from danswer.configs.constants import SOURCE_LINKS
 from danswer.connectors.models import Document
+from danswer.utils.logger import setup_logger
+
+logger = setup_logger()


 @dataclass

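For readers unfamiliar with the helper, here is a minimal sketch of the module-level logger pattern these new imports introduce, assuming setup_logger is roughly a convenience wrapper around the standard logging module (the real danswer.utils.logger helper may configure levels and formatting differently):

import logging


def setup_logger(name: str = __name__, level: int = logging.INFO) -> logging.Logger:
    # Hypothetical stand-in for danswer.utils.logger.setup_logger.
    logger = logging.getLogger(name)
    if not logger.handlers:
        handler = logging.StreamHandler()
        handler.setFormatter(logging.Formatter("%(levelname)s:%(name)s: %(message)s"))
        logger.addHandler(handler)
    logger.setLevel(level)
    return logger


# Module-level logger, matching the pattern added in this hunk.
logger = setup_logger()
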
@@ -58,4 +63,8 @@ class InferenceChunk(BaseChunk):
             init_kwargs[METADATA] = json.loads(init_kwargs[METADATA])
         else:
             init_kwargs[METADATA] = {}
+        if init_kwargs.get(SEMANTIC_IDENTIFIER) is None:
+            logger.error(
+                f"Chunk with blurb: {init_kwargs.get(BLURB, 'Unknown')[:50]}... has no Semantic Identifier"
+            )
         return cls(**init_kwargs)

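To make the new check concrete, here is a self-contained sketch of a from_dict-style constructor that logs rather than raises when a chunk is missing its semantic identifier; the constant values and the trimmed-down field set are assumptions for illustration, not the real InferenceChunk definition:

import json
import logging
from dataclasses import dataclass
from typing import Any

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Constant key names, mirroring the spirit of danswer.configs.constants.
BLURB = "blurb"
METADATA = "metadata"
SEMANTIC_IDENTIFIER = "semantic_identifier"


@dataclass
class ChunkSketch:
    blurb: str
    semantic_identifier: str | None
    metadata: dict[str, Any]

    @classmethod
    def from_dict(cls, init_dict: dict[str, Any]) -> "ChunkSketch":
        init_kwargs = dict(init_dict)
        if METADATA in init_kwargs:
            init_kwargs[METADATA] = json.loads(init_kwargs[METADATA])
        else:
            init_kwargs[METADATA] = {}
        # New behavior: old indices may contain chunks stored before the
        # semantic identifier was required, so log an error instead of raising.
        if init_kwargs.get(SEMANTIC_IDENTIFIER) is None:
            logger.error(
                f"Chunk with blurb: {init_kwargs.get(BLURB, 'Unknown')[:50]}... has no Semantic Identifier"
            )
        return cls(**init_kwargs)

In this sketch, ChunkSketch.from_dict({"blurb": "old chunk", "semantic_identifier": None, "metadata": "{}"}) logs the error but still returns a chunk, which keeps previously indexed data usable.
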
@@ -25,7 +25,7 @@ class Document:
     id: str  # This must be unique or during indexing/reindexing, chunks will be overwritten
     sections: list[Section]
     source: DocumentSource
-    semantic_identifier: str | None
+    semantic_identifier: str
     metadata: dict[str, Any]



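The model change itself is one line, but it shifts enforcement to the type checker: with the field typed as plain str, mypy (or a similar checker) flags any connector that still passes None. Below is a stripped-down sketch of the updated dataclass, with placeholder Section and DocumentSource definitions standing in for the real danswer models:

from dataclasses import dataclass
from enum import Enum
from typing import Any


class DocumentSource(str, Enum):
    # Placeholder enum; the real DocumentSource has many more members.
    WEB = "web"


@dataclass
class Section:
    link: str
    text: str


@dataclass
class Document:
    id: str  # must be unique, or chunks are overwritten during indexing/reindexing
    sections: list[Section]
    source: DocumentSource
    semantic_identifier: str  # no longer Optional: every connector must supply a value
    metadata: dict[str, Any]

Note that dataclasses do not validate types at runtime, so passing None would still construct an object; the point of the annotation change is that a static type checker now reports such call sites.
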
@@ -245,7 +245,7 @@ class WebConnector(LoadConnector):
                     id=current_url,
                     sections=[Section(link=current_url, text=page_text)],
                     source=DocumentSource.WEB,
-                    semantic_identifier=title,
+                    semantic_identifier=title or current_url,
                     metadata={},
                 )
             )

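The WebConnector side of the fix relies on Python's or operator to fall back to the page URL whenever the scraped title is falsy (None or an empty string). A small sketch of that fallback in isolation; the helper name is made up for the example:

def choose_semantic_identifier(title: str | None, current_url: str) -> str:
    # Prefer the page title; fall back to the URL when the title is missing or empty.
    return title or current_url


assert choose_semantic_identifier("Danswer Docs", "https://example.com/docs") == "Danswer Docs"
assert choose_semantic_identifier(None, "https://example.com/docs") == "https://example.com/docs"
assert choose_semantic_identifier("", "https://example.com/docs") == "https://example.com/docs"
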
@@ -33,7 +33,9 @@ def chunks_to_search_docs(chunks: list[InferenceChunk] | None) -> list[SearchDoc
                 blurb=chunk.blurb,
                 source_type=chunk.source_type,
             )
-            for chunk in chunks if chunk.semantic_identifier
+            # semantic identifier should always exist but for really old indices, it was not enforced
+            for chunk in chunks
+            if chunk.semantic_identifier
         ]
         if chunks
         else []

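Finally, the comprehension in chunks_to_search_docs now skips legacy chunks indexed before the identifier was required, so a SearchDoc is never built without one. A stripped-down sketch of that shape, using placeholder chunk and search-doc types rather than the real danswer models:

from dataclasses import dataclass


@dataclass
class ChunkStub:
    blurb: str
    semantic_identifier: str | None  # may be None for very old indices
    source_type: str


@dataclass
class SearchDocStub:
    blurb: str
    semantic_identifier: str
    source_type: str


def chunks_to_search_docs(chunks: list[ChunkStub] | None) -> list[SearchDocStub]:
    return (
        [
            SearchDocStub(
                blurb=chunk.blurb,
                semantic_identifier=chunk.semantic_identifier,
                source_type=chunk.source_type,
            )
            # semantic identifier should always exist, but very old indices did not enforce it
            for chunk in chunks
            if chunk.semantic_identifier
        ]
        if chunks
        else []
    )

With this shape, passing None returns an empty list, and a chunk whose semantic_identifier is None is silently dropped, matching the filtering shown in the diff.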
|