mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-04-09 20:39:29 +02:00
Require Semantic Identifier to not be None (#255)
This commit is contained in:
parent
63780113d3
commit
fe40e72b5c
@ -4,9 +4,14 @@ from dataclasses import dataclass
|
||||
from typing import Any
|
||||
from typing import cast
|
||||
|
||||
from danswer.configs.constants import BLURB
|
||||
from danswer.configs.constants import METADATA
|
||||
from danswer.configs.constants import SEMANTIC_IDENTIFIER
|
||||
from danswer.configs.constants import SOURCE_LINKS
|
||||
from danswer.connectors.models import Document
|
||||
from danswer.utils.logger import setup_logger
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
|
||||
@dataclass
|
||||
@ -58,4 +63,8 @@ class InferenceChunk(BaseChunk):
|
||||
init_kwargs[METADATA] = json.loads(init_kwargs[METADATA])
|
||||
else:
|
||||
init_kwargs[METADATA] = {}
|
||||
if init_kwargs.get(SEMANTIC_IDENTIFIER) is None:
|
||||
logger.error(
|
||||
f"Chunk with blurb: {init_kwargs.get(BLURB, 'Unknown')[:50]}... has no Semantic Identifier"
|
||||
)
|
||||
return cls(**init_kwargs)
|
||||
|
@ -25,7 +25,7 @@ class Document:
|
||||
id: str # This must be unique or during indexing/reindexing, chunks will be overwritten
|
||||
sections: list[Section]
|
||||
source: DocumentSource
|
||||
semantic_identifier: str | None
|
||||
semantic_identifier: str
|
||||
metadata: dict[str, Any]
|
||||
|
||||
|
||||
|
@ -245,7 +245,7 @@ class WebConnector(LoadConnector):
|
||||
id=current_url,
|
||||
sections=[Section(link=current_url, text=page_text)],
|
||||
source=DocumentSource.WEB,
|
||||
semantic_identifier=title,
|
||||
semantic_identifier=title or current_url,
|
||||
metadata={},
|
||||
)
|
||||
)
|
||||
|
@ -33,7 +33,9 @@ def chunks_to_search_docs(chunks: list[InferenceChunk] | None) -> list[SearchDoc
|
||||
blurb=chunk.blurb,
|
||||
source_type=chunk.source_type,
|
||||
)
|
||||
for chunk in chunks if chunk.semantic_identifier
|
||||
# semantic identifier should always exist but for really old indices, it was not enforced
|
||||
for chunk in chunks
|
||||
if chunk.semantic_identifier
|
||||
]
|
||||
if chunks
|
||||
else []
|
||||
|
Loading…
x
Reference in New Issue
Block a user