Require Semantic Identifier to not be None (#255)

This commit is contained in:
Yuhong Sun 2023-07-29 14:12:30 -07:00 committed by GitHub
parent 63780113d3
commit fe40e72b5c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 14 additions and 3 deletions

View File

@@ -4,9 +4,14 @@ from dataclasses import dataclass
from typing import Any
from typing import cast
from danswer.configs.constants import BLURB
from danswer.configs.constants import METADATA
from danswer.configs.constants import SEMANTIC_IDENTIFIER
from danswer.configs.constants import SOURCE_LINKS
from danswer.connectors.models import Document
from danswer.utils.logger import setup_logger
logger = setup_logger()
@dataclass
@@ -58,4 +63,8 @@ class InferenceChunk(BaseChunk):
init_kwargs[METADATA] = json.loads(init_kwargs[METADATA])
else:
init_kwargs[METADATA] = {}
if init_kwargs.get(SEMANTIC_IDENTIFIER) is None:
logger.error(
f"Chunk with blurb: {init_kwargs.get(BLURB, 'Unknown')[:50]}... has no Semantic Identifier"
)
return cls(**init_kwargs)

View File

@@ -25,7 +25,7 @@ class Document:
id: str # This must be unique or during indexing/reindexing, chunks will be overwritten
sections: list[Section]
source: DocumentSource
semantic_identifier: str | None
semantic_identifier: str
metadata: dict[str, Any]

View File

@@ -245,7 +245,7 @@ class WebConnector(LoadConnector):
id=current_url,
sections=[Section(link=current_url, text=page_text)],
source=DocumentSource.WEB,
semantic_identifier=title,
semantic_identifier=title or current_url,
metadata={},
)
)

View File

@@ -33,7 +33,9 @@ def chunks_to_search_docs(chunks: list[InferenceChunk] | None) -> list[SearchDoc
blurb=chunk.blurb,
source_type=chunk.source_type,
)
for chunk in chunks if chunk.semantic_identifier
# semantic identifier should always exist but for really old indices, it was not enforced
for chunk in chunks
if chunk.semantic_identifier
]
if chunks
else []