DAN-50 References should include blurb (#26)

This commit is contained in:
Yuhong Sun 2023-05-10 21:03:15 -07:00 committed by GitHub
parent 38bcb3ee6b
commit 279c5e0eb1
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 54 additions and 1 deletion

View File

@ -1,7 +1,9 @@
import abc
import re
from collections.abc import Callable
from danswer.chunking.models import IndexChunk
from danswer.configs.app_configs import BLURB_LENGTH
from danswer.configs.app_configs import CHUNK_OVERLAP
from danswer.configs.app_configs import CHUNK_SIZE
from danswer.connectors.models import Document
@ -12,14 +14,44 @@ SECTION_SEPARATOR = "\n\n"
ChunkFunc = Callable[[Document], list[IndexChunk]]
def extract_blurb(text: str, blurb_len: int) -> str:
    """Return a short preview ("blurb") of *text*.

    The blurb ends at the first sentence-ending punctuation ('.', '!', '?',
    ':') found after ``blurb_len`` characters, capped at ``2 * blurb_len``
    characters. If that cut would split a word, it backs up to the last
    space before the cut; if the text is one unbroken word, the whole text
    is returned. Newlines/carriage returns are replaced by spaces and runs
    of spaces are collapsed to a single space.

    Args:
        text: Source text to summarize.
        blurb_len: Minimum number of characters before looking for a
            sentence boundary.

    Returns:
        The extracted, whitespace-normalized blurb.
    """
    if len(text) < blurb_len:
        return text
    match = re.search(r"[.!?:]", text[blurb_len:])
    max_blurb_len = min(2 * blurb_len, len(text))
    end_index = (
        max_blurb_len
        if match is None
        else min(blurb_len + match.start() + 1, max_blurb_len)
    )
    # Avoid cutting mid-word: if the character just past the cut is not
    # whitespace (or end-of-string), back up to the previous space.
    if text[end_index : end_index + 1] not in [" ", "", "\r", "\n"]:
        last_space = text.rfind(" ", 0, end_index)
        # If there's no space in the text (single word longer than blurb_len), return the whole text
        end_index = last_space if last_space != -1 else len(text)
    blurb = text[:end_index]
    blurb = blurb.replace("\n", " ")
    blurb = blurb.replace("\r", " ")
    # Collapse runs of spaces to one. NOTE: the original checked/replaced a
    # SINGLE space with a single space, which never terminates once any
    # space is present — the intended target is the double space.
    while "  " in blurb:
        blurb = blurb.replace("  ", " ")
    return blurb
def chunk_large_section(
section: Section,
document: Document,
start_chunk_id: int,
chunk_size: int = CHUNK_SIZE,
word_overlap: int = CHUNK_OVERLAP,
blurb_len: int = BLURB_LENGTH,
) -> list[IndexChunk]:
section_text = section.text
blurb = extract_blurb(section_text, blurb_len)
char_count = len(section_text)
chunk_strs: list[str] = []
start_pos = segment_start_pos = 0
@ -61,6 +93,7 @@ def chunk_large_section(
IndexChunk(
source_document=document,
chunk_id=start_chunk_id + chunk_ind,
blurb=blurb,
content=chunk_str,
source_links={0: section.link},
section_continuation=(chunk_ind != 0),
@ -73,6 +106,7 @@ def chunk_document(
document: Document,
chunk_size: int = CHUNK_SIZE,
subsection_overlap: int = CHUNK_OVERLAP,
blurb_len=BLURB_LENGTH,
) -> list[IndexChunk]:
chunks: list[IndexChunk] = []
link_offsets: dict[int, str] = {}
@ -90,6 +124,7 @@ def chunk_document(
IndexChunk(
source_document=document,
chunk_id=len(chunks),
blurb=extract_blurb(chunk_text, blurb_len),
content=chunk_text,
source_links=link_offsets,
section_continuation=False,
@ -104,6 +139,7 @@ def chunk_document(
start_chunk_id=len(chunks),
chunk_size=chunk_size,
word_overlap=subsection_overlap,
blurb_len=blurb_len,
)
chunks.extend(large_section_chunks)
continue
@ -119,6 +155,7 @@ def chunk_document(
IndexChunk(
source_document=document,
chunk_id=len(chunks),
blurb=extract_blurb(chunk_text, blurb_len),
content=chunk_text,
source_links=link_offsets,
section_continuation=False,
@ -133,6 +170,7 @@ def chunk_document(
IndexChunk(
source_document=document,
chunk_id=len(chunks),
blurb=extract_blurb(chunk_text, blurb_len),
content=chunk_text,
source_links=link_offsets,
section_continuation=False,

View File

@ -8,6 +8,7 @@ from danswer.connectors.models import Document
@dataclass
class BaseChunk:
chunk_id: int
blurb: str # The first sentence(s) of the first Section of the chunk
content: str
source_links: Optional[
dict[int, str]

View File

@ -7,6 +7,12 @@ APP_HOST = "0.0.0.0"
APP_PORT = 8080
#####
# User Facing Features Configs
#####
BLURB_LENGTH = 200 # Characters. Blurbs will be truncated at the first punctuation after this many characters.
#####
# Vector DB Configs
#####

View File

@ -2,6 +2,7 @@ from enum import Enum
DOCUMENT_ID = "document_id"
CHUNK_ID = "chunk_id"
BLURB = "blurb"
CONTENT = "content"
SOURCE_TYPE = "source_type"
SOURCE_LINKS = "source_links"

View File

@ -121,7 +121,7 @@ class BatchGoogleDriveLoader(BatchLoader):
doc_batch = []
for file in files_batch:
text_contents = extract_text(file, service)
full_context = file["name"] + " " + text_contents
full_context = file["name"] + " - " + text_contents
doc_batch.append(
Document(

View File

@ -3,6 +3,7 @@ import uuid
from danswer.chunking.models import EmbeddedIndexChunk
from danswer.configs.constants import ALLOWED_GROUPS
from danswer.configs.constants import ALLOWED_USERS
from danswer.configs.constants import BLURB
from danswer.configs.constants import CHUNK_ID
from danswer.configs.constants import CONTENT
from danswer.configs.constants import DOCUMENT_ID
@ -58,6 +59,7 @@ def index_chunks(
payload={
DOCUMENT_ID: document.id,
CHUNK_ID: chunk.chunk_id,
BLURB: chunk.blurb,
CONTENT: chunk.content,
SOURCE_TYPE: str(document.source.value),
SOURCE_LINKS: chunk.source_links,

View File

@ -12,6 +12,7 @@ import regex
from danswer.chunking.models import InferenceChunk
from danswer.configs.app_configs import OPENAI_API_KEY
from danswer.configs.app_configs import QUOTE_ALLOWED_ERROR_PERCENT
from danswer.configs.constants import BLURB
from danswer.configs.constants import DOCUMENT_ID
from danswer.configs.constants import SEMANTIC_IDENTIFIER
from danswer.configs.constants import SOURCE_LINK
@ -140,6 +141,7 @@ def match_quotes_to_docs(
SOURCE_LINK: curr_link,
SOURCE_TYPE: chunk.source_type,
SEMANTIC_IDENTIFIER: chunk.semantic_identifier,
BLURB: chunk.blurb,
}
break
quotes_dict[quote] = {
@ -147,6 +149,7 @@ def match_quotes_to_docs(
SOURCE_LINK: curr_link,
SOURCE_TYPE: chunk.source_type,
SEMANTIC_IDENTIFIER: chunk.semantic_identifier,
BLURB: chunk.blurb,
}
break
return quotes_dict

View File

@ -4,6 +4,7 @@ import json
import requests
from danswer.configs.app_configs import APP_PORT
from danswer.configs.app_configs import QDRANT_DEFAULT_COLLECTION
from danswer.configs.constants import BLURB
from danswer.configs.constants import SEMANTIC_IDENTIFIER
from danswer.configs.constants import SOURCE_LINK
from danswer.configs.constants import SOURCE_TYPE
@ -84,6 +85,7 @@ if __name__ == "__main__":
):
print(f"Quote {str(ind + 1)}:\n{quote}")
print(f"Semantic Identifier: {quote_info[SEMANTIC_IDENTIFIER]}")
print(f"Blurb: {quote_info[BLURB]}")
print(f"Link: {quote_info[SOURCE_LINK]}")
print(f"Source: {quote_info[SOURCE_TYPE]}")
else: