Require Semantic Identifier to not be None (#255)

This commit is contained in:
Yuhong Sun
2023-07-29 14:12:30 -07:00
committed by GitHub
parent 63780113d3
commit fe40e72b5c
4 changed files with 14 additions and 3 deletions

View File

@@ -4,9 +4,14 @@ from dataclasses import dataclass
from typing import Any from typing import Any
from typing import cast from typing import cast
from danswer.configs.constants import BLURB
from danswer.configs.constants import METADATA from danswer.configs.constants import METADATA
from danswer.configs.constants import SEMANTIC_IDENTIFIER
from danswer.configs.constants import SOURCE_LINKS from danswer.configs.constants import SOURCE_LINKS
from danswer.connectors.models import Document from danswer.connectors.models import Document
from danswer.utils.logger import setup_logger
logger = setup_logger()
@dataclass @dataclass
@@ -58,4 +63,8 @@ class InferenceChunk(BaseChunk):
init_kwargs[METADATA] = json.loads(init_kwargs[METADATA]) init_kwargs[METADATA] = json.loads(init_kwargs[METADATA])
else: else:
init_kwargs[METADATA] = {} init_kwargs[METADATA] = {}
if init_kwargs.get(SEMANTIC_IDENTIFIER) is None:
logger.error(
f"Chunk with blurb: {init_kwargs.get(BLURB, 'Unknown')[:50]}... has no Semantic Identifier"
)
return cls(**init_kwargs) return cls(**init_kwargs)

View File

@@ -25,7 +25,7 @@ class Document:
id: str # This must be unique or during indexing/reindexing, chunks will be overwritten id: str # This must be unique or during indexing/reindexing, chunks will be overwritten
sections: list[Section] sections: list[Section]
source: DocumentSource source: DocumentSource
semantic_identifier: str | None semantic_identifier: str
metadata: dict[str, Any] metadata: dict[str, Any]

View File

@@ -245,7 +245,7 @@ class WebConnector(LoadConnector):
id=current_url, id=current_url,
sections=[Section(link=current_url, text=page_text)], sections=[Section(link=current_url, text=page_text)],
source=DocumentSource.WEB, source=DocumentSource.WEB,
semantic_identifier=title, semantic_identifier=title or current_url,
metadata={}, metadata={},
) )
) )

View File

@@ -33,7 +33,9 @@ def chunks_to_search_docs(chunks: list[InferenceChunk] | None) -> list[SearchDoc
blurb=chunk.blurb, blurb=chunk.blurb,
source_type=chunk.source_type, source_type=chunk.source_type,
) )
for chunk in chunks if chunk.semantic_identifier # semantic identifier should always exist but for really old indices, it was not enforced
for chunk in chunks
if chunk.semantic_identifier
] ]
if chunks if chunks
else [] else []