Mirror of https://github.com/danswer-ai/danswer.git
DAN-50 References should include blurb (#26)
@@ -1,7 +1,9 @@
 import abc
+import re
 from collections.abc import Callable

 from danswer.chunking.models import IndexChunk
+from danswer.configs.app_configs import BLURB_LENGTH
 from danswer.configs.app_configs import CHUNK_OVERLAP
 from danswer.configs.app_configs import CHUNK_SIZE
 from danswer.connectors.models import Document
@@ -12,14 +14,44 @@ SECTION_SEPARATOR = "\n\n"
 ChunkFunc = Callable[[Document], list[IndexChunk]]


+def extract_blurb(text: str, blurb_len: int) -> str:
+    if len(text) < blurb_len:
+        return text
+
+    match = re.search(r"[.!?:]", text[blurb_len:])
+    max_blub_len = min(2 * blurb_len, len(text))
+
+    end_index = (
+        max_blub_len
+        if match is None
+        else min(blurb_len + match.start() + 1, max_blub_len)
+    )
+
+    if text[end_index : end_index + 1] not in [" ", "", "\r", "\n"]:
+        last_space = text.rfind(" ", 0, end_index)
+        # If there's no space in the text (single word longer than blurb_len), return the whole text
+        end_index = last_space if last_space != -1 else len(text)
+
+    blurb = text[:end_index]
+
+    blurb = blurb.replace("\n", " ")
+    blurb = blurb.replace("\r", " ")
+    while "  " in blurb:
+        blurb = blurb.replace("  ", " ")
+
+    return blurb
+
+
 def chunk_large_section(
     section: Section,
     document: Document,
     start_chunk_id: int,
     chunk_size: int = CHUNK_SIZE,
     word_overlap: int = CHUNK_OVERLAP,
+    blurb_len: int = BLURB_LENGTH,
 ) -> list[IndexChunk]:
     section_text = section.text
+    blurb = extract_blurb(section_text, blurb_len)
     char_count = len(section_text)
     chunk_strs: list[str] = []
     start_pos = segment_start_pos = 0
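
To make the truncation rule concrete, here is a small illustrative call (the input string and blurb_len value are invented for the example; the behavior follows the function above). The cut always lands just after the first '.', '!', '?' or ':' at or beyond blurb_len characters, capped at min(2 * blurb_len, len(text)); if that would split a word, the blurb backs up to the last space.

    # Illustrative only: input and blurb_len are invented for this example.
    text = "Danswer answers questions. It cites sources from your tools."
    extract_blurb(text, blurb_len=20)
    # The scan of text[20:] hits the period at absolute index 25, so the cut
    # falls at index 26 (just past the sentence), well under the 2x cap of 40.
    # -> "Danswer answers questions."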
@@ -61,6 +93,7 @@ def chunk_large_section(
         IndexChunk(
             source_document=document,
             chunk_id=start_chunk_id + chunk_ind,
+            blurb=blurb,
             content=chunk_str,
             source_links={0: section.link},
             section_continuation=(chunk_ind != 0),
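
Worth noting in this hunk: the blurb is computed once from the opening of the section, before the section is split, so every continuation chunk of an oversized section carries the same preview text. A hypothetical illustration of the resulting chunks (values invented):

    # Hypothetical output for one oversized section split into three chunks:
    # chunks[0]: blurb="First sentence of the section.", section_continuation=False
    # chunks[1]: blurb="First sentence of the section.", section_continuation=True
    # chunks[2]: blurb="First sentence of the section.", section_continuation=True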
@@ -73,6 +106,7 @@ def chunk_document(
     document: Document,
     chunk_size: int = CHUNK_SIZE,
     subsection_overlap: int = CHUNK_OVERLAP,
+    blurb_len=BLURB_LENGTH,
 ) -> list[IndexChunk]:
     chunks: list[IndexChunk] = []
     link_offsets: dict[int, str] = {}
@@ -90,6 +124,7 @@ def chunk_document(
                 IndexChunk(
                     source_document=document,
                     chunk_id=len(chunks),
+                    blurb=extract_blurb(chunk_text, blurb_len),
                     content=chunk_text,
                     source_links=link_offsets,
                     section_continuation=False,
@@ -104,6 +139,7 @@ def chunk_document(
                 start_chunk_id=len(chunks),
                 chunk_size=chunk_size,
                 word_overlap=subsection_overlap,
+                blurb_len=blurb_len,
             )
             chunks.extend(large_section_chunks)
             continue
@@ -119,6 +155,7 @@ def chunk_document(
                 IndexChunk(
                     source_document=document,
                     chunk_id=len(chunks),
+                    blurb=extract_blurb(chunk_text, blurb_len),
                     content=chunk_text,
                     source_links=link_offsets,
                     section_continuation=False,
@@ -133,6 +170,7 @@ def chunk_document(
             IndexChunk(
                 source_document=document,
                 chunk_id=len(chunks),
+                blurb=extract_blurb(chunk_text, blurb_len),
                 content=chunk_text,
                 source_links=link_offsets,
                 section_continuation=False,
@@ -8,6 +8,7 @@ from danswer.connectors.models import Document
 @dataclass
 class BaseChunk:
     chunk_id: int
+    blurb: str  # The first sentence(s) of the first Section of the chunk
     content: str
     source_links: Optional[
         dict[int, str]
@@ -7,6 +7,12 @@ APP_HOST = "0.0.0.0"
 APP_PORT = 8080


+#####
+# User Facing Features Configs
+#####
+BLURB_LENGTH = 200  # Characters. Blurbs will be truncated at the first punctuation after this many characters.
+
+
 #####
 # Vector DB Configs
 #####
@@ -2,6 +2,7 @@ from enum import Enum

 DOCUMENT_ID = "document_id"
 CHUNK_ID = "chunk_id"
+BLURB = "blurb"
 CONTENT = "content"
 SOURCE_TYPE = "source_type"
 SOURCE_LINKS = "source_links"
@@ -121,7 +121,7 @@ class BatchGoogleDriveLoader(BatchLoader):
         doc_batch = []
         for file in files_batch:
             text_contents = extract_text(file, service)
-            full_context = file["name"] + " " + text_contents
+            full_context = file["name"] + " - " + text_contents

             doc_batch.append(
                 Document(
@@ -3,6 +3,7 @@ import uuid
 from danswer.chunking.models import EmbeddedIndexChunk
 from danswer.configs.constants import ALLOWED_GROUPS
 from danswer.configs.constants import ALLOWED_USERS
+from danswer.configs.constants import BLURB
 from danswer.configs.constants import CHUNK_ID
 from danswer.configs.constants import CONTENT
 from danswer.configs.constants import DOCUMENT_ID
@@ -58,6 +59,7 @@ def index_chunks(
             payload={
                 DOCUMENT_ID: document.id,
                 CHUNK_ID: chunk.chunk_id,
+                BLURB: chunk.blurb,
                 CONTENT: chunk.content,
                 SOURCE_TYPE: str(document.source.value),
                 SOURCE_LINKS: chunk.source_links,
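
Since the blurb now travels with each indexed point, a downstream reader can pull it straight off a search hit. A minimal sketch, assuming a qdrant_client.QdrantClient instance — `client`, `query_vector`, and the limit are placeholders; only the payload keys come from this diff:

    # Sketch: reading the new BLURB field back from a Qdrant search result.
    hits = client.search(
        collection_name=QDRANT_DEFAULT_COLLECTION,
        query_vector=query_vector,
        limit=5,
    )
    for hit in hits:
        print(hit.payload[BLURB])  # preview text stored at index time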
@@ -12,6 +12,7 @@ import regex
 from danswer.chunking.models import InferenceChunk
 from danswer.configs.app_configs import OPENAI_API_KEY
 from danswer.configs.app_configs import QUOTE_ALLOWED_ERROR_PERCENT
+from danswer.configs.constants import BLURB
 from danswer.configs.constants import DOCUMENT_ID
 from danswer.configs.constants import SEMANTIC_IDENTIFIER
 from danswer.configs.constants import SOURCE_LINK
@@ -140,6 +141,7 @@ def match_quotes_to_docs(
                     SOURCE_LINK: curr_link,
                     SOURCE_TYPE: chunk.source_type,
                     SEMANTIC_IDENTIFIER: chunk.semantic_identifier,
+                    BLURB: chunk.blurb,
                 }
                 break
             quotes_dict[quote] = {
@@ -147,6 +149,7 @@ def match_quotes_to_docs(
                 SOURCE_LINK: curr_link,
                 SOURCE_TYPE: chunk.source_type,
                 SEMANTIC_IDENTIFIER: chunk.semantic_identifier,
+                BLURB: chunk.blurb,
             }
             break
     return quotes_dict
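
After this change, each entry in quotes_dict carries the blurb alongside the existing reference fields, which is what lets references render a preview. A hypothetical entry (all values invented; fields set earlier in the function are omitted):

    # Hypothetical shape of one quotes_dict entry after this commit:
    quotes_dict["...matched quote text..."] = {
        SOURCE_LINK: "https://example.com/some-doc",
        SOURCE_TYPE: "google_drive",
        SEMANTIC_IDENTIFIER: "Some Design Doc",
        BLURB: "Danswer answers questions.",  # new: preview for the reference
    }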
@@ -4,6 +4,7 @@ import json
 import requests
 from danswer.configs.app_configs import APP_PORT
 from danswer.configs.app_configs import QDRANT_DEFAULT_COLLECTION
+from danswer.configs.constants import BLURB
 from danswer.configs.constants import SEMANTIC_IDENTIFIER
 from danswer.configs.constants import SOURCE_LINK
 from danswer.configs.constants import SOURCE_TYPE
@@ -84,6 +85,7 @@ if __name__ == "__main__":
         ):
             print(f"Quote {str(ind + 1)}:\n{quote}")
             print(f"Semantic Identifier: {quote_info[SEMANTIC_IDENTIFIER]}")
+            print(f"Blurb: {quote_info[BLURB]}")
             print(f"Link: {quote_info[SOURCE_LINK]}")
             print(f"Source: {quote_info[SOURCE_TYPE]}")
     else: