mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-03-17 21:32:36 +01:00
DAN-50 References should include blurb (#26)
This commit is contained in:
parent
38bcb3ee6b
commit
279c5e0eb1
@ -1,7 +1,9 @@
|
||||
import abc
|
||||
import re
|
||||
from collections.abc import Callable
|
||||
|
||||
from danswer.chunking.models import IndexChunk
|
||||
from danswer.configs.app_configs import BLURB_LENGTH
|
||||
from danswer.configs.app_configs import CHUNK_OVERLAP
|
||||
from danswer.configs.app_configs import CHUNK_SIZE
|
||||
from danswer.connectors.models import Document
|
||||
@ -12,14 +14,44 @@ SECTION_SEPARATOR = "\n\n"
|
||||
ChunkFunc = Callable[[Document], list[IndexChunk]]
|
||||
|
||||
|
||||
def extract_blurb(text: str, blurb_len: int) -> str:
    """Return a short preview ("blurb") taken from the start of *text*.

    The blurb is roughly ``blurb_len`` characters long, extended to the first
    sentence-ending punctuation ([.!?:]) after that point, but never longer
    than ``2 * blurb_len``. The cut is then moved back to a word boundary so
    the blurb never ends mid-word, and all whitespace is normalized to single
    spaces.

    Args:
        text: The source text to summarize.
        blurb_len: Target blurb length in characters.

    Returns:
        The extracted, whitespace-normalized blurb. If ``text`` is shorter
        than ``blurb_len`` it is returned unchanged.
    """
    if len(text) < blurb_len:
        return text

    # Look for sentence-ending punctuation only in the tail past blurb_len,
    # so the blurb is at least blurb_len characters long.
    match = re.search(r"[.!?:]", text[blurb_len:])
    max_blurb_len = min(2 * blurb_len, len(text))

    end_index = (
        max_blurb_len
        if match is None
        else min(blurb_len + match.start() + 1, max_blurb_len)
    )

    # If we would cut mid-word (next char is not whitespace / end-of-text),
    # back up to the last space before the cut point.
    if text[end_index : end_index + 1] not in [" ", "", "\r", "\n"]:
        last_space = text.rfind(" ", 0, end_index)
        # If there's no space in the text (single word longer than blurb_len), return the whole text
        end_index = last_space if last_space != -1 else len(text)

    blurb = text[:end_index]

    # Normalize whitespace: newlines become spaces, then collapse any runs of
    # consecutive spaces down to one. (Must replace DOUBLE spaces with single
    # ones — replacing a single space with itself would loop forever.)
    blurb = blurb.replace("\n", " ")
    blurb = blurb.replace("\r", " ")
    while "  " in blurb:
        blurb = blurb.replace("  ", " ")

    return blurb
|
||||
|
||||
|
||||
def chunk_large_section(
|
||||
section: Section,
|
||||
document: Document,
|
||||
start_chunk_id: int,
|
||||
chunk_size: int = CHUNK_SIZE,
|
||||
word_overlap: int = CHUNK_OVERLAP,
|
||||
blurb_len: int = BLURB_LENGTH,
|
||||
) -> list[IndexChunk]:
|
||||
section_text = section.text
|
||||
blurb = extract_blurb(section_text, blurb_len)
|
||||
char_count = len(section_text)
|
||||
chunk_strs: list[str] = []
|
||||
start_pos = segment_start_pos = 0
|
||||
@ -61,6 +93,7 @@ def chunk_large_section(
|
||||
IndexChunk(
|
||||
source_document=document,
|
||||
chunk_id=start_chunk_id + chunk_ind,
|
||||
blurb=blurb,
|
||||
content=chunk_str,
|
||||
source_links={0: section.link},
|
||||
section_continuation=(chunk_ind != 0),
|
||||
@ -73,6 +106,7 @@ def chunk_document(
|
||||
document: Document,
|
||||
chunk_size: int = CHUNK_SIZE,
|
||||
subsection_overlap: int = CHUNK_OVERLAP,
|
||||
blurb_len=BLURB_LENGTH,
|
||||
) -> list[IndexChunk]:
|
||||
chunks: list[IndexChunk] = []
|
||||
link_offsets: dict[int, str] = {}
|
||||
@ -90,6 +124,7 @@ def chunk_document(
|
||||
IndexChunk(
|
||||
source_document=document,
|
||||
chunk_id=len(chunks),
|
||||
blurb=extract_blurb(chunk_text, blurb_len),
|
||||
content=chunk_text,
|
||||
source_links=link_offsets,
|
||||
section_continuation=False,
|
||||
@ -104,6 +139,7 @@ def chunk_document(
|
||||
start_chunk_id=len(chunks),
|
||||
chunk_size=chunk_size,
|
||||
word_overlap=subsection_overlap,
|
||||
blurb_len=blurb_len,
|
||||
)
|
||||
chunks.extend(large_section_chunks)
|
||||
continue
|
||||
@ -119,6 +155,7 @@ def chunk_document(
|
||||
IndexChunk(
|
||||
source_document=document,
|
||||
chunk_id=len(chunks),
|
||||
blurb=extract_blurb(chunk_text, blurb_len),
|
||||
content=chunk_text,
|
||||
source_links=link_offsets,
|
||||
section_continuation=False,
|
||||
@ -133,6 +170,7 @@ def chunk_document(
|
||||
IndexChunk(
|
||||
source_document=document,
|
||||
chunk_id=len(chunks),
|
||||
blurb=extract_blurb(chunk_text, blurb_len),
|
||||
content=chunk_text,
|
||||
source_links=link_offsets,
|
||||
section_continuation=False,
|
||||
|
@ -8,6 +8,7 @@ from danswer.connectors.models import Document
|
||||
@dataclass
|
||||
class BaseChunk:
|
||||
chunk_id: int
|
||||
blurb: str # The first sentence(s) of the first Section of the chunk
|
||||
content: str
|
||||
source_links: Optional[
|
||||
dict[int, str]
|
||||
|
@ -7,6 +7,12 @@ APP_HOST = "0.0.0.0"
|
||||
APP_PORT = 8080
|
||||
|
||||
|
||||
#####
|
||||
# User Facing Features Configs
|
||||
#####
|
||||
BLURB_LENGTH = 200 # Characters. Blurbs will be truncated at the first punctuation after this many characters.
|
||||
|
||||
|
||||
#####
|
||||
# Vector DB Configs
|
||||
#####
|
||||
|
@ -2,6 +2,7 @@ from enum import Enum
|
||||
|
||||
DOCUMENT_ID = "document_id"
|
||||
CHUNK_ID = "chunk_id"
|
||||
BLURB = "blurb"
|
||||
CONTENT = "content"
|
||||
SOURCE_TYPE = "source_type"
|
||||
SOURCE_LINKS = "source_links"
|
||||
|
@ -121,7 +121,7 @@ class BatchGoogleDriveLoader(BatchLoader):
|
||||
doc_batch = []
|
||||
for file in files_batch:
|
||||
text_contents = extract_text(file, service)
|
||||
full_context = file["name"] + " " + text_contents
|
||||
full_context = file["name"] + " - " + text_contents
|
||||
|
||||
doc_batch.append(
|
||||
Document(
|
||||
|
@ -3,6 +3,7 @@ import uuid
|
||||
from danswer.chunking.models import EmbeddedIndexChunk
|
||||
from danswer.configs.constants import ALLOWED_GROUPS
|
||||
from danswer.configs.constants import ALLOWED_USERS
|
||||
from danswer.configs.constants import BLURB
|
||||
from danswer.configs.constants import CHUNK_ID
|
||||
from danswer.configs.constants import CONTENT
|
||||
from danswer.configs.constants import DOCUMENT_ID
|
||||
@ -58,6 +59,7 @@ def index_chunks(
|
||||
payload={
|
||||
DOCUMENT_ID: document.id,
|
||||
CHUNK_ID: chunk.chunk_id,
|
||||
BLURB: chunk.blurb,
|
||||
CONTENT: chunk.content,
|
||||
SOURCE_TYPE: str(document.source.value),
|
||||
SOURCE_LINKS: chunk.source_links,
|
||||
|
@ -12,6 +12,7 @@ import regex
|
||||
from danswer.chunking.models import InferenceChunk
|
||||
from danswer.configs.app_configs import OPENAI_API_KEY
|
||||
from danswer.configs.app_configs import QUOTE_ALLOWED_ERROR_PERCENT
|
||||
from danswer.configs.constants import BLURB
|
||||
from danswer.configs.constants import DOCUMENT_ID
|
||||
from danswer.configs.constants import SEMANTIC_IDENTIFIER
|
||||
from danswer.configs.constants import SOURCE_LINK
|
||||
@ -140,6 +141,7 @@ def match_quotes_to_docs(
|
||||
SOURCE_LINK: curr_link,
|
||||
SOURCE_TYPE: chunk.source_type,
|
||||
SEMANTIC_IDENTIFIER: chunk.semantic_identifier,
|
||||
BLURB: chunk.blurb,
|
||||
}
|
||||
break
|
||||
quotes_dict[quote] = {
|
||||
@ -147,6 +149,7 @@ def match_quotes_to_docs(
|
||||
SOURCE_LINK: curr_link,
|
||||
SOURCE_TYPE: chunk.source_type,
|
||||
SEMANTIC_IDENTIFIER: chunk.semantic_identifier,
|
||||
BLURB: chunk.blurb,
|
||||
}
|
||||
break
|
||||
return quotes_dict
|
||||
|
@ -4,6 +4,7 @@ import json
|
||||
import requests
|
||||
from danswer.configs.app_configs import APP_PORT
|
||||
from danswer.configs.app_configs import QDRANT_DEFAULT_COLLECTION
|
||||
from danswer.configs.constants import BLURB
|
||||
from danswer.configs.constants import SEMANTIC_IDENTIFIER
|
||||
from danswer.configs.constants import SOURCE_LINK
|
||||
from danswer.configs.constants import SOURCE_TYPE
|
||||
@ -84,6 +85,7 @@ if __name__ == "__main__":
|
||||
):
|
||||
print(f"Quote {str(ind + 1)}:\n{quote}")
|
||||
print(f"Semantic Identifier: {quote_info[SEMANTIC_IDENTIFIER]}")
|
||||
print(f"Blurb: {quote_info[BLURB]}")
|
||||
print(f"Link: {quote_info[SOURCE_LINK]}")
|
||||
print(f"Source: {quote_info[SOURCE_TYPE]}")
|
||||
else:
|
||||
|
Loading…
x
Reference in New Issue
Block a user